Duplicate source files for Vulkan.

The Vulkan implementation needs a directory for each architectural
layer, similar to the OpenGL ES stack. The entire rendering stack is
duplicated, leaving only Reactor common between them:

Renderer -> Device
Shader -> Pipeline
Common -> System
Main -> WSI

Bug b/117152542

Change-Id: I9c26b23654016d637f88ec2416f019ef65b9afbd
Reviewed-on: https://swiftshader-review.googlesource.com/c/21248
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
new file mode 100644
index 0000000..6522a13
--- /dev/null
+++ b/src/Device/Blitter.cpp
@@ -0,0 +1,1481 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Blitter.hpp"
+
+#include "Shader/ShaderCore.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	Blitter::Blitter()
+		: blitCache(new RoutineCache<State>(1024))   // Routine cache for State; 1024 is presumably its capacity (confirm in RoutineCache).
+	{
+	}
+
+	Blitter::~Blitter()
+	{
+		delete blitCache;   // Releases the routine cache allocated by the constructor.
+	}
+
+	void Blitter::clear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
+	{
+		// Try the direct memory fill first; it only accepts a few format/mask combinations.
+		const bool filled = fastClear(pixel, format, dest, dRect, rgbaMask);
+		if(!filled)
+		{
+			// General path: wrap the clear value in a temporary 1x1x1 surface and blit it over dRect.
+			sw::Surface *texel = sw::Surface::create(1, 1, 1, format, pixel, sw::Surface::bytes(format), sw::Surface::bytes(format));
+			blit(texel, SliceRectF(0.5f, 0.5f, 0.5f, 0.5f, 0), dest, dRect, {rgbaMask});   // Sample from the middle.
+			delete texel;
+		}
+	}
+
+	bool Blitter::fastClear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
+	{
+		// Direct memory fill for a handful of destination formats. Only a four-float clear
+		// color is accepted; returning false makes clear() fall back to the generic blit.
+		if(format != FORMAT_A32B32G32R32F) { return false; }
+
+		const float *rgba = static_cast<const float*>(pixel);
+		const float red = rgba[0];
+		const float green = rgba[1];
+		const float blue = rgba[2];
+		const float alpha = rgba[3];
+
+		// Clear color packed in the destination's texel layout.
+		uint32_t fill;
+
+		switch(dest->getFormat())
+		{
+		case FORMAT_R5G6B5:
+			if((~rgbaMask & 0x7) != 0) return false;   // All of R, G and B must be written.
+			fill = ((uint16_t)(31 * red + 0.5f) << 11) |
+			       ((uint16_t)(63 * green + 0.5f) << 5) |
+			       ((uint16_t)(31 * blue + 0.5f) << 0);
+			break;
+		case FORMAT_X8B8G8R8:
+			if((~rgbaMask & 0x7) != 0) return false;
+			fill = ((uint32_t)(255 * red + 0.5f) << 0) |
+			       ((uint32_t)(255 * green + 0.5f) << 8) |
+			       ((uint32_t)(255 * blue + 0.5f) << 16) |
+			       ((uint32_t)(255) << 24);
+			break;
+		case FORMAT_A8B8G8R8:
+			if((~rgbaMask & 0xF) != 0) return false;   // All four channels must be written.
+			fill = ((uint32_t)(255 * red + 0.5f) << 0) |
+			       ((uint32_t)(255 * green + 0.5f) << 8) |
+			       ((uint32_t)(255 * blue + 0.5f) << 16) |
+			       ((uint32_t)(255 * alpha + 0.5f) << 24);
+			break;
+		case FORMAT_X8R8G8B8:
+			if((~rgbaMask & 0x7) != 0) return false;
+			fill = ((uint32_t)(255 * blue + 0.5f) << 0) |
+			       ((uint32_t)(255 * green + 0.5f) << 8) |
+			       ((uint32_t)(255 * red + 0.5f) << 16) |
+			       ((uint32_t)(255) << 24);
+			break;
+		case FORMAT_A8R8G8B8:
+			if((~rgbaMask & 0xF) != 0) return false;
+			fill = ((uint32_t)(255 * blue + 0.5f) << 0) |
+			       ((uint32_t)(255 * green + 0.5f) << 8) |
+			       ((uint32_t)(255 * red + 0.5f) << 16) |
+			       ((uint32_t)(255 * alpha + 0.5f) << 24);
+			break;
+		default:
+			return false;
+		}
+
+		const bool internal = !dest->isExternalDirty();
+		uint8_t *plane = (uint8_t*)dest->lock(dRect.x0, dRect.y0, dRect.slice, sw::LOCK_WRITEONLY, sw::PUBLIC, internal);
+
+		for(int sample = 0; sample < dest->getSamples(); sample++)
+		{
+			uint8_t *row = plane;
+
+			switch(Surface::bytes(dest->getFormat()))
+			{
+			case 2:
+				for(int y = dRect.y0; y < dRect.y1; y++)
+				{
+					sw::clear(reinterpret_cast<uint16_t*>(row), fill, dRect.x1 - dRect.x0);
+					row += dest->getPitchB(internal);
+				}
+				break;
+			case 4:
+				for(int y = dRect.y0; y < dRect.y1; y++)
+				{
+					sw::clear(reinterpret_cast<uint32_t*>(row), fill, dRect.x1 - dRect.x0);
+					row += dest->getPitchB(internal);
+				}
+				break;
+			default:
+				assert(false);
+			}
+
+			plane += dest->getSliceB(internal);
+		}
+
+		dest->unlock(internal);
+
+		return true;
+	}
+
+	void Blitter::blit(Surface *source, const SliceRectF &sourceRect, Surface *dest, const SliceRect &destRect, const Blitter::Options& options)
+	{
+		// Nothing is written to a FORMAT_NULL destination. Otherwise blitReactor() is tried
+		// first, and the per-pixel loop below only runs when it returns false.
+		if(dest->getInternalFormat() == FORMAT_NULL || blitReactor(source, sourceRect, dest, destRect, options))
+		{
+			return;
+		}
+
+		// Work on copies so that a mirrored destination rectangle can be put in
+		// ascending order by swapping the matching edges of both rectangles.
+		SliceRectF src = sourceRect;
+		SliceRect dst = destRect;
+
+		if(destRect.x0 > destRect.x1)   // Flipped along x.
+		{
+			swap(dst.x0, dst.x1);
+			swap(src.x0, src.x1);
+		}
+
+		if(destRect.y0 > destRect.y1)   // Flipped along y.
+		{
+			swap(dst.y0, dst.y1);
+			swap(src.y0, src.y1);
+		}
+
+		// Both surfaces stay locked for the duration of the copy loop.
+		source->lockInternal(0, 0, src.slice, sw::LOCK_READONLY, sw::PUBLIC);
+		dest->lockInternal(0, 0, dst.slice, sw::LOCK_WRITEONLY, sw::PUBLIC);
+
+		// Source distance covered per destination pixel along each axis.
+		const float stepX = src.width() / dst.width();
+		const float stepY = src.height() / dst.height();
+
+		// Source coordinate corresponding to the center of destination pixel (0, 0).
+		const float originX = src.x0 + (0.5f - dst.x0) * stepX;
+		const float originY = src.y0 + (0.5f - dst.y0) * stepY;
+
+		for(int py = dst.y0; py < dst.y1; py++)
+		{
+			const float sy = originY + py * stepY;
+
+			for(int px = dst.x0; px < dst.x1; px++)
+			{
+				const float sx = originX + px * stepX;
+
+				// FIXME: Support RGBA mask
+				dest->copyInternal(source, px, py, sx, sy, options.filter);
+			}
+		}
+
+		source->unlockInternal();
+		dest->unlockInternal();
+	}
+
+	void Blitter::blit3D(Surface *source, Surface *dest)
+	{
+		source->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PUBLIC);
+		dest->lockInternal(0, 0, 0, sw::LOCK_WRITEONLY, sw::PUBLIC);
+
+		// Source-to-destination size ratio along each axis.
+		const float ratioX = static_cast<float>(source->getWidth())  / static_cast<float>(dest->getWidth());
+		const float ratioY = static_cast<float>(source->getHeight()) / static_cast<float>(dest->getHeight());
+		const float ratioZ = static_cast<float>(source->getDepth())  / static_cast<float>(dest->getDepth());
+
+		for(int dz = 0; dz < dest->getDepth(); dz++)
+		{
+			const float sz = (dz + 0.5f) * ratioZ;   // Destination texel center mapped into the source.
+
+			for(int dy = 0; dy < dest->getHeight(); dy++)
+			{
+				const float sy = (dy + 0.5f) * ratioY;
+
+				for(int dx = 0; dx < dest->getWidth(); dx++)
+				{
+					const float sx = (dx + 0.5f) * ratioX;
+					dest->copyInternal(source, dx, dy, dz, sx, sy, sz, true);   // Last argument presumably enables filtering (confirm in Surface).
+				}
+			}
+		}
+
+		source->unlockInternal();
+		dest->unlockInternal();
+	}
+
+	bool Blitter::read(Float4 &c, Pointer<Byte> element, const State &state)
+	{   // Loads the texel at 'element' into c as floats, decoded per state.sourceFormat; returns false for unsupported formats.
+		c = Float4(0.0f, 0.0f, 0.0f, 1.0f);   // Default for channels the format does not provide.
+
+		switch(state.sourceFormat)
+		{
+		case FORMAT_L8:
+			c.xyz = Float(Int(*Pointer<Byte>(element)));   // Luminance replicated to x, y and z; left in the raw 0..255 range.
+			c.w = float(0xFF);   // Opaque alpha in the same raw 0..255 range.
+			break;
+		case FORMAT_A8:
+			c.w = Float(Int(*Pointer<Byte>(element)));
+			break;
+		case FORMAT_R8I:
+		case FORMAT_R8_SNORM:
+			c.x = Float(Int(*Pointer<SByte>(element)));
+			c.w = float(0x7F);
+			break;
+		case FORMAT_R8:
+		case FORMAT_R8UI:
+			c.x = Float(Int(*Pointer<Byte>(element)));
+			c.w = float(0xFF);
+			break;
+		case FORMAT_R16I:
+			c.x = Float(Int(*Pointer<Short>(element)));
+			c.w = float(0x7FFF);
+			break;
+		case FORMAT_R16UI:
+			c.x = Float(Int(*Pointer<UShort>(element)));
+			c.w = float(0xFFFF);
+			break;
+		case FORMAT_R32I:
+			c.x = Float(*Pointer<Int>(element));
+			c.w = float(0x7FFFFFFF);
+			break;
+		case FORMAT_R32UI:
+			c.x = Float(*Pointer<UInt>(element));
+			c.w = float(0xFFFFFFFF);
+			break;
+		case FORMAT_A8R8G8B8:
+			c = Float4(*Pointer<Byte4>(element)).zyxw;   // Memory order is B, G, R, A; swizzle so that x holds red.
+			break;
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8_SNORM:
+			c = Float4(*Pointer<SByte4>(element));
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_SRGB8_A8:
+			c = Float4(*Pointer<Byte4>(element));
+			break;
+		case FORMAT_X8R8G8B8:
+			c = Float4(*Pointer<Byte4>(element)).zyxw;
+			c.w = float(0xFF);   // The X byte is ignored; alpha is forced to the maximum.
+			break;
+		case FORMAT_R8G8B8:
+			c.z = Float(Int(*Pointer<Byte>(element + 0)));
+			c.y = Float(Int(*Pointer<Byte>(element + 1)));
+			c.x = Float(Int(*Pointer<Byte>(element + 2)));
+			c.w = float(0xFF);
+			break;
+		case FORMAT_B8G8R8:
+			c.x = Float(Int(*Pointer<Byte>(element + 0)));
+			c.y = Float(Int(*Pointer<Byte>(element + 1)));
+			c.z = Float(Int(*Pointer<Byte>(element + 2)));
+			c.w = float(0xFF);
+			break;
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8_SNORM:
+			c = Float4(*Pointer<SByte4>(element));
+			c.w = float(0x7F);
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_SRGB8_X8:
+			c = Float4(*Pointer<Byte4>(element));
+			c.w = float(0xFF);
+			break;
+		case FORMAT_A16B16G16R16I:
+			c = Float4(*Pointer<Short4>(element));
+			break;
+		case FORMAT_A16B16G16R16:
+		case FORMAT_A16B16G16R16UI:
+			c = Float4(*Pointer<UShort4>(element));
+			break;
+		case FORMAT_X16B16G16R16I:
+			c = Float4(*Pointer<Short4>(element));
+			c.w = float(0x7FFF);
+			break;
+		case FORMAT_X16B16G16R16UI:
+			c = Float4(*Pointer<UShort4>(element));
+			c.w = float(0xFFFF);
+			break;
+		case FORMAT_A32B32G32R32I:
+			c = Float4(*Pointer<Int4>(element));
+			break;
+		case FORMAT_A32B32G32R32UI:
+			c = Float4(*Pointer<UInt4>(element));
+			break;
+		case FORMAT_X32B32G32R32I:
+			c = Float4(*Pointer<Int4>(element));
+			c.w = float(0x7FFFFFFF);
+			break;
+		case FORMAT_X32B32G32R32UI:
+			c = Float4(*Pointer<UInt4>(element));
+			c.w = float(0xFFFFFFFF);
+			break;
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8_SNORM:
+			c.x = Float(Int(*Pointer<SByte>(element + 0)));
+			c.y = Float(Int(*Pointer<SByte>(element + 1)));
+			c.w = float(0x7F);
+			break;
+		case FORMAT_G8R8:
+		case FORMAT_G8R8UI:
+			c.x = Float(Int(*Pointer<Byte>(element + 0)));
+			c.y = Float(Int(*Pointer<Byte>(element + 1)));
+			c.w = float(0xFF);
+			break;
+		case FORMAT_G16R16I:
+			c.x = Float(Int(*Pointer<Short>(element + 0)));
+			c.y = Float(Int(*Pointer<Short>(element + 2)));
+			c.w = float(0x7FFF);
+			break;
+		case FORMAT_G16R16:
+		case FORMAT_G16R16UI:
+			c.x = Float(Int(*Pointer<UShort>(element + 0)));
+			c.y = Float(Int(*Pointer<UShort>(element + 2)));
+			c.w = float(0xFFFF);
+			break;
+		case FORMAT_G32R32I:
+			c.x = Float(*Pointer<Int>(element + 0));
+			c.y = Float(*Pointer<Int>(element + 4));
+			c.w = float(0x7FFFFFFF);
+			break;
+		case FORMAT_G32R32UI:
+			c.x = Float(*Pointer<UInt>(element + 0));
+			c.y = Float(*Pointer<UInt>(element + 4));
+			c.w = float(0xFFFFFFFF);
+			break;
+		case FORMAT_A32B32G32R32F:
+			c = *Pointer<Float4>(element);
+			break;
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_B32G32R32F:
+			c.z = *Pointer<Float>(element + 8);   // Falls through to read x and y; w keeps the 1.0f default.
+		case FORMAT_G32R32F:
+			c.x = *Pointer<Float>(element + 0);
+			c.y = *Pointer<Float>(element + 4);
+			break;
+		case FORMAT_R32F:
+			c.x = *Pointer<Float>(element);
+			break;
+		case FORMAT_R5G6B5:
+			c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));   // Red: top 5 bits.
+			c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));   // Green: middle 6 bits.
+			c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));   // Blue: low 5 bits.
+			break;
+		case FORMAT_A2B10G10R10:
+		case FORMAT_A2B10G10R10UI:
+			c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
+			c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
+			c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
+			c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
+			break;
+		case FORMAT_D16:
+			c.x = Float(Int((*Pointer<UShort>(element))));
+			break;
+		case FORMAT_D24S8:
+		case FORMAT_D24X8:
+			c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));   // Depth is taken from the upper 24 bits.
+			break;
+		case FORMAT_D32:
+			c.x = Float(Int((*Pointer<UInt>(element))));   // NOTE(review): goes through signed Int, so values above 0x7FFFFFFF presumably read as negative - confirm intended.
+			break;
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+			c.x = 1.0f - *Pointer<Float>(element);   // Stored value is the complement of the depth returned.
+			break;
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+			c.x = *Pointer<Float>(element);
+			break;
+		case FORMAT_S8:
+			c.x = Float(Int(*Pointer<Byte>(element)));
+			break;
+		default:
+			return false;   // Source format not handled by this reader.
+		}
+
+		return true;
+	}
+
+	bool Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
+	{   // Stores c at 'element' in the layout of state.destFormat, honoring the state.write* channel flags; returns false for unsupported formats.
+		bool writeR = state.writeRed;
+		bool writeG = state.writeGreen;
+		bool writeB = state.writeBlue;
+		bool writeA = state.writeAlpha;
+		bool writeRGBA = writeR && writeG && writeB && writeA;   // Enables the single-store paths below.
+
+		switch(state.destFormat)
+		{
+		case FORMAT_L8:
+			*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
+			break;
+		case FORMAT_A8:
+			if(writeA) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.w))); }
+			break;
+		case FORMAT_A8R8G8B8:
+			if(writeRGBA)
+			{
+				Short4 c0 = RoundShort4(c.zyxw);
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+			}
+			else
+			{
+				if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+				if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+				if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+			}
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_A8:
+			if(writeRGBA)
+			{
+				Short4 c0 = RoundShort4(c);
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+			}
+			else
+			{
+				if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+				if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
+			}
+			break;
+		case FORMAT_X8R8G8B8:
+			if(writeRGBA)
+			{
+				Short4 c0 = RoundShort4(c.zyxw) | Short4(0x0000, 0x0000, 0x0000, 0x00FF);   // Force the X byte to 0xFF.
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+			}
+			else
+			{
+				if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+				if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+				if(writeA) { *Pointer<Byte>(element + 3) = Byte(0xFF); }
+			}
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_X8:
+			if(writeRGBA)
+			{
+				Short4 c0 = RoundShort4(c) | Short4(0x0000, 0x0000, 0x0000, 0x00FF);
+				*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
+			}
+			else
+			{
+				if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+				if(writeA) { *Pointer<Byte>(element + 3) = Byte(0xFF); }
+			}
+			break;
+		case FORMAT_R8G8B8:
+			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
+			break;
+		case FORMAT_B8G8R8:
+			if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+			break;
+		case FORMAT_A32B32G32R32F:
+			if(writeRGBA)
+			{
+				*Pointer<Float4>(element) = c;
+			}
+			else
+			{
+				if(writeR) { *Pointer<Float>(element) = c.x; }
+				if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+				if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+				if(writeA) { *Pointer<Float>(element + 12) = c.w; }
+			}
+			break;
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+			if(writeA) { *Pointer<Float>(element + 12) = 1.0f; }   // Falls through to write the color channels.
+		case FORMAT_B32G32R32F:
+			if(writeR) { *Pointer<Float>(element) = c.x; }
+			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
+			break;
+		case FORMAT_G32R32F:
+			if(writeR && writeG)
+			{
+				*Pointer<Float2>(element) = Float2(c);
+			}
+			else
+			{
+				if(writeR) { *Pointer<Float>(element) = c.x; }
+				if(writeG) { *Pointer<Float>(element + 4) = c.y; }
+			}
+			break;
+		case FORMAT_R32F:
+			if(writeR) { *Pointer<Float>(element) = c.x; }
+			break;
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8_SNORM:
+			if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }   // Falls through the narrower signed formats below.
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8_SNORM:
+			if(writeA && (state.destFormat == FORMAT_X8B8G8R8I || state.destFormat == FORMAT_X8B8G8R8_SNORM))
+			{
+				*Pointer<SByte>(element + 3) = SByte(0x7F);
+			}
+			if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8_SNORM:
+			if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
+		case FORMAT_R8I:
+		case FORMAT_R8_SNORM:
+			if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
+			break;
+		case FORMAT_A8B8G8R8UI:
+			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }   // Falls through the narrower unsigned formats below.
+		case FORMAT_X8B8G8R8UI:
+			if(writeA && (state.destFormat == FORMAT_X8B8G8R8UI))
+			{
+				*Pointer<Byte>(element + 3) = Byte(0xFF);
+			}
+			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
+		case FORMAT_G8R8UI:
+		case FORMAT_G8R8:
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
+		case FORMAT_R8UI:
+		case FORMAT_R8:
+			if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
+			break;
+		case FORMAT_A16B16G16R16I:
+			if(writeRGBA)
+			{
+				*Pointer<Short4>(element) = Short4(RoundInt(c));
+			}
+			else
+			{
+				if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+				if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
+			}
+			break;
+		case FORMAT_X16B16G16R16I:
+			if(writeRGBA)
+			{
+				*Pointer<Short4>(element) = Short4(RoundInt(c));
+			}
+			else
+			{
+				if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
+			}
+			if(writeA) { *Pointer<Short>(element + 6) = Short(0x7FFF); }   // Largest signed 16-bit value, the same constant the Float4 reader and the Int4 writer use for this format.
+			break;
+		case FORMAT_G16R16I:
+			if(writeR && writeG)
+			{
+				*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
+			}
+			else
+			{
+				if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
+			}
+			break;
+		case FORMAT_R16I:
+			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
+			break;
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_A16B16G16R16:
+			if(writeRGBA)
+			{
+				*Pointer<UShort4>(element) = UShort4(RoundInt(c));
+			}
+			else
+			{
+				if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+				if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
+			}
+			break;
+		case FORMAT_X16B16G16R16UI:
+			if(writeRGBA)
+			{
+				*Pointer<UShort4>(element) = UShort4(RoundInt(c));
+			}
+			else
+			{
+				if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
+			}
+			if(writeA) { *Pointer<UShort>(element + 6) = UShort(0xFFFF); }   // Largest unsigned 16-bit value, the same constant the Float4 reader and the Int4 writer use for this format.
+			break;
+		case FORMAT_G16R16UI:
+		case FORMAT_G16R16:
+			if(writeR && writeG)
+			{
+				*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
+			}
+			else
+			{
+				if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
+			}
+			break;
+		case FORMAT_R16UI:
+			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
+			break;
+		case FORMAT_A32B32G32R32I:
+			if(writeRGBA)
+			{
+				*Pointer<Int4>(element) = RoundInt(c);
+			}
+			else
+			{
+				if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+				if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+				if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+				if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
+			}
+			break;
+		case FORMAT_X32B32G32R32I:
+			if(writeRGBA)
+			{
+				*Pointer<Int4>(element) = RoundInt(c);
+			}
+			else
+			{
+				if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+				if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+				if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
+			}
+			if(writeA) { *Pointer<Int>(element + 12) = Int(0x7FFFFFFF); }
+			break;
+		case FORMAT_G32R32I:
+			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
+		case FORMAT_R32I:
+			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
+			break;
+		case FORMAT_A32B32G32R32UI:
+			if(writeRGBA)
+			{
+				*Pointer<UInt4>(element) = UInt4(RoundInt(c));
+			}
+			else
+			{
+				if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+				if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
+			}
+			break;
+		case FORMAT_X32B32G32R32UI:
+			if(writeRGBA)
+			{
+				*Pointer<UInt4>(element) = UInt4(RoundInt(c));
+			}
+			else
+			{
+				if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
+			}
+			if(writeA) { *Pointer<UInt>(element + 12) = UInt(0xFFFFFFFF); }   // Single 32-bit store: a UInt4 store at offset 12 would write 12 bytes past this 16-byte texel.
+			break;
+		case FORMAT_G32R32UI:
+			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
+		case FORMAT_R32UI:
+			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
+			break;
+		case FORMAT_R5G6B5:
+			if(writeR && writeG && writeB)
+			{
+				*Pointer<UShort>(element) = UShort(RoundInt(Float(c.z)) |
+				                                  (RoundInt(Float(c.y)) << Int(5)) |
+				                                  (RoundInt(Float(c.x)) << Int(11)));
+			}
+			else
+			{
+				unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
+				unsigned short unmask = ~mask;   // Bits of the existing texel to preserve.
+				*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
+				                            (UShort(RoundInt(Float(c.z)) |
+				                                   (RoundInt(Float(c.y)) << Int(5)) |
+				                                   (RoundInt(Float(c.x)) << Int(11))) & UShort(mask));
+			}
+			break;
+		case FORMAT_A2B10G10R10:
+		case FORMAT_A2B10G10R10UI:
+			if(writeRGBA)
+			{
+				*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) |
+				                              (RoundInt(Float(c.y)) << 10) |
+				                              (RoundInt(Float(c.z)) << 20) |
+				                              (RoundInt(Float(c.w)) << 30));
+			}
+			else
+			{
+				unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
+				                    (writeB ? 0x3FF00000 : 0x0000) |
+				                    (writeG ? 0x000FFC00 : 0x0000) |
+				                    (writeR ? 0x000003FF : 0x0000);
+				unsigned int unmask = ~mask;   // Bits of the existing texel to preserve.
+				*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
+				                            (UInt(RoundInt(Float(c.x)) |
+				                                  (RoundInt(Float(c.y)) << 10) |
+				                                  (RoundInt(Float(c.z)) << 20) |
+				                                  (RoundInt(Float(c.w)) << 30)) & UInt(mask));
+			}
+			break;
+		case FORMAT_D16:
+			*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
+			break;
+		case FORMAT_D24S8:
+		case FORMAT_D24X8:
+			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);   // Depth goes in the upper 24 bits; the low byte is written as zero.
+			break;
+		case FORMAT_D32:
+			*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)));
+			break;
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+			*Pointer<Float>(element) = 1.0f - c.x;   // Stored as the complement of the incoming depth.
+			break;
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+			*Pointer<Float>(element) = c.x;
+			break;
+		case FORMAT_S8:
+			*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
+			break;
+		default:
+			return false;   // Destination format not handled by this writer.
+		}
+		return true;
+	}
+
+	bool Blitter::read(Int4 &c, Pointer<Byte> element, const State &state)
+	{   // Loads the integer texel at 'element' into c per state.sourceFormat; returns false for formats not listed here.
+		c = Int4(0, 0, 0, 1);   // Default for channels the format does not provide.
+
+		switch(state.sourceFormat)
+		{
+		case FORMAT_A8B8G8R8I:
+			c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);   // Each ladder falls through so wider formats also read the narrower formats' channels.
+		case FORMAT_X8B8G8R8I:
+			c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
+		case FORMAT_G8R8I:
+			c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
+		case FORMAT_R8I:
+			c = Insert(c, Int(*Pointer<SByte>(element)), 0);
+			break;
+		case FORMAT_A8B8G8R8UI:
+			c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
+		case FORMAT_X8B8G8R8UI:
+			c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
+		case FORMAT_G8R8UI:
+			c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
+		case FORMAT_R8UI:
+			c = Insert(c, Int(*Pointer<Byte>(element)), 0);
+			break;
+		case FORMAT_A16B16G16R16I:
+			c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
+		case FORMAT_X16B16G16R16I:
+			c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
+		case FORMAT_G16R16I:
+			c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
+		case FORMAT_R16I:
+			c = Insert(c, Int(*Pointer<Short>(element)), 0);
+			break;
+		case FORMAT_A16B16G16R16UI:
+			c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
+		case FORMAT_X16B16G16R16UI:
+			c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
+		case FORMAT_G16R16UI:
+			c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
+		case FORMAT_R16UI:
+			c = Insert(c, Int(*Pointer<UShort>(element)), 0);
+			break;
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+			c = *Pointer<Int4>(element);   // Signed and unsigned share one load of all four 32-bit channels.
+			break;
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+			c = Insert(c, *Pointer<Int>(element + 8), 2);   // The X channel is not read; w keeps the default of 1.
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+			c = Insert(c, *Pointer<Int>(element + 4), 1);
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+			c = Insert(c, *Pointer<Int>(element), 0);
+			break;
+		default:
+			return false;   // Source format not handled by this reader.
+		}
+
+		return true;
+	}
+
+	bool Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
+	{   // Stores integer color c at 'element' in the layout of state.destFormat, honoring the state.write* channel flags; returns false for unsupported formats.
+		bool writeR = state.writeRed;
+		bool writeG = state.writeGreen;
+		bool writeB = state.writeBlue;
+		bool writeA = state.writeAlpha;
+		bool writeRGBA = writeR && writeG && writeB && writeA;   // Enables the single-store paths below.
+
+		switch(state.destFormat)
+		{
+		case FORMAT_A8B8G8R8I:
+			if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }   // Each ladder falls through so wider formats also write the narrower formats' channels.
+		case FORMAT_X8B8G8R8I:
+			if(writeA && (state.destFormat != FORMAT_A8B8G8R8I))   // Only when entered as the X format, not via fall-through.
+			{
+				*Pointer<SByte>(element + 3) = SByte(0x7F);
+			}
+			if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
+		case FORMAT_G8R8I:
+			if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
+		case FORMAT_R8I:
+			if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
+			break;
+		case FORMAT_A8B8G8R8UI:
+			if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
+		case FORMAT_X8B8G8R8UI:
+			if(writeA && (state.destFormat != FORMAT_A8B8G8R8UI))
+			{
+				*Pointer<Byte>(element + 3) = Byte(0xFF);
+			}
+			if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
+		case FORMAT_G8R8UI:
+			if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
+		case FORMAT_R8UI:
+			if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
+			break;
+		case FORMAT_A16B16G16R16I:
+			if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
+		case FORMAT_X16B16G16R16I:
+			if(writeA && (state.destFormat != FORMAT_A16B16G16R16I))
+			{
+				*Pointer<Short>(element + 6) = Short(0x7FFF);
+			}
+			if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
+		case FORMAT_G16R16I:
+			if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
+		case FORMAT_R16I:
+			if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
+			break;
+		case FORMAT_A16B16G16R16UI:
+			if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
+		case FORMAT_X16B16G16R16UI:
+			if(writeA && (state.destFormat != FORMAT_A16B16G16R16UI))
+			{
+				*Pointer<UShort>(element + 6) = UShort(0xFFFF);
+			}
+			if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
+		case FORMAT_G16R16UI:
+			if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
+		case FORMAT_R16UI:
+			if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
+			break;
+		case FORMAT_A32B32G32R32I:
+			if(writeRGBA)
+			{
+				*Pointer<Int4>(element) = c;
+			}
+			else
+			{
+				if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+				if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+				if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+				if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
+			}
+			break;
+		case FORMAT_X32B32G32R32I:
+			if(writeRGBA)
+			{
+				*Pointer<Int4>(element) = c;
+			}
+			else
+			{
+				if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+				if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+				if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
+			}
+			if(writeA) { *Pointer<Int>(element + 12) = Int(0x7FFFFFFF); }   // X channel is overwritten with the largest signed value.
+			break;
+		case FORMAT_G32R32I:
+			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
+			break;
+		case FORMAT_R32I:
+			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
+			break;
+		case FORMAT_A32B32G32R32UI:
+			if(writeRGBA)
+			{
+				*Pointer<UInt4>(element) = As<UInt4>(c);
+			}
+			else
+			{
+				if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+				if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
+			}
+			break;
+		case FORMAT_X32B32G32R32UI:
+			if(writeRGBA)
+			{
+				*Pointer<UInt4>(element) = As<UInt4>(c);
+			}
+			else
+			{
+				if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+				if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+				if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
+			}
+			if(writeA) { *Pointer<UInt>(element + 12) = UInt(0xFFFFFFFF); }   // Fourth channel is at byte offset 12, as in the signed case; offset 3 would overwrite parts of R and G.
+			break;
+		case FORMAT_G32R32UI:
+			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
+			break;
+		case FORMAT_R32UI:
+			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
+			break;
+		default:
+			return false;   // Destination format not handled by this writer.
+		}
+
+		return true;
+	}
+
+	// Looks up the per-channel multipliers that ApplyScaleAndClamp() uses to convert
+	// between a unit-range color and the raw channel values stored for 'format'.
+	//
+	// scale:  receives the four factors (x, y, z, w); left untouched on failure.
+	// format: surface format to look up.
+	//
+	// Returns true when the format is known to the blitter, false otherwise.
+	bool Blitter::GetScale(float4 &scale, Format format)
+	{
+		switch(format)
+		{
+		// Formats whose stored values are used as-is (factor of one on every channel):
+		// non-normalized integers, floating-point colors, float depth and stencil.
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_B32G32R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_R32F:
+		case FORMAT_A2B10G10R10UI:
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		case FORMAT_S8:
+			scale = vector(1.0f, 1.0f, 1.0f, 1.0f);
+			return true;
+
+		// 8-bit unsigned normalized channels.
+		case FORMAT_L8:
+		case FORMAT_A8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_R8:
+		case FORMAT_G8R8:
+		case FORMAT_R8G8B8:
+		case FORMAT_B8G8R8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+			scale = vector(0xFF, 0xFF, 0xFF, 0xFF);
+			return true;
+
+		// 8-bit signed normalized channels.
+		case FORMAT_R8_SNORM:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+			scale = vector(0x7F, 0x7F, 0x7F, 0x7F);
+			return true;
+
+		// 16-bit unsigned normalized channels.
+		case FORMAT_A16B16G16R16:
+			scale = vector(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+			return true;
+
+		// Packed formats with a different bit count per channel.
+		case FORMAT_R5G6B5:
+			scale = vector(0x1F, 0x3F, 0x1F, 1.0f);
+			return true;
+		case FORMAT_A2B10G10R10:
+			scale = vector(0x3FF, 0x3FF, 0x3FF, 0x03);
+			return true;
+
+		// Integer depth formats: only the first component carries a factor.
+		case FORMAT_D16:
+			scale = vector(0xFFFF, 0.0f, 0.0f, 0.0f);
+			return true;
+		case FORMAT_D24S8:
+		case FORMAT_D24X8:
+			scale = vector(0xFFFFFF, 0.0f, 0.0f, 0.0f);
+			return true;
+		case FORMAT_D32:
+			scale = vector(static_cast<float>(0xFFFFFFFF), 0.0f, 0.0f, 0.0f);
+			return true;
+
+		default:
+			break;
+		}
+
+		// Unknown format: 'scale' is intentionally not modified.
+		return false;
+	}
+
+	// Emits the Reactor code that rescales 'value' from the source format's channel
+	// range to the destination format's range, optionally converting to or from sRGB,
+	// and clamps the result where the destination cannot hold the source range.
+	//
+	// value:     color to convert in place.
+	// state:     supplies the source/destination formats and the clear/sRGB flags.
+	// preScaled: true when 'value' has already been brought to the destination scale
+	//            (and sRGB-decoded) by an earlier call, as done for bilinear taps.
+	//
+	// Returns false when either format has no known scale.
+	bool Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
+	{
+		float4 dstScale, srcScale;
+
+		const bool integerToNormalizedClear = state.clearOperation &&
+		                                      Surface::isNonNormalizedInteger(state.sourceFormat) &&
+		                                      !Surface::isNonNormalizedInteger(state.destFormat);
+
+		if(integerToNormalizedClear)
+		{
+			// If we're clearing a buffer from an int or uint color into a normalized color,
+			// then the whole range of the int or uint color must be scaled between 0 and 1.
+			if(state.sourceFormat == FORMAT_A32B32G32R32I)
+			{
+				srcScale = replicate(static_cast<float>(0x7FFFFFFF));
+			}
+			else if(state.sourceFormat == FORMAT_A32B32G32R32UI)
+			{
+				srcScale = replicate(static_cast<float>(0xFFFFFFFF));
+			}
+			else
+			{
+				return false;
+			}
+		}
+		else if(!GetScale(srcScale, state.sourceFormat))
+		{
+			return false;
+		}
+
+		if(!GetScale(dstScale, state.destFormat))
+		{
+			return false;
+		}
+
+		const bool sourceIsSRGB = Surface::isSRGBformat(state.sourceFormat);
+		const bool destIsSRGB = Surface::isSRGBformat(state.destFormat);
+		const bool decodeSource = sourceIsSRGB && !preScaled;   // Source still holds sRGB-encoded data.
+
+		if(state.convertSRGB && (decodeSource || destIsSRGB))   // One of the formats is sRGB encoded.
+		{
+			// Bring the value back to unit range before applying the transfer function.
+			value *= preScaled ? Float4(1.0f / dstScale.x, 1.0f / dstScale.y, 1.0f / dstScale.z, 1.0f / dstScale.w) : // Unapply scale
+			                     Float4(1.0f / srcScale.x, 1.0f / srcScale.y, 1.0f / srcScale.z, 1.0f / srcScale.w); // Apply unscale
+			value = decodeSource ? sRGBtoLinear(value) : LinearToSRGB(value);
+			value *= Float4(dstScale.x, dstScale.y, dstScale.z, dstScale.w); // Apply scale
+		}
+		else if(srcScale != dstScale)
+		{
+			// Plain range conversion; skipped entirely when both ranges match.
+			value *= Float4(dstScale.x / srcScale.x, dstScale.y / srcScale.y, dstScale.z / srcScale.z, dstScale.w / srcScale.w);
+		}
+
+		if(state.destFormat == FORMAT_X32B32G32R32F_UNSIGNED)
+		{
+			value = Max(value, Float4(0.0f));  // TODO: Only necessary if source is signed.
+		}
+		else if(Surface::isFloatFormat(state.sourceFormat) && !Surface::isFloatFormat(state.destFormat))
+		{
+			// A float source can exceed what a non-float destination stores, so clamp
+			// to [0, scale] for unsigned components and [-scale, scale] for signed ones.
+			const float lowerX = Surface::isUnsignedComponent(state.destFormat, 0) ? 0.0f : -dstScale.x;
+			const float lowerY = Surface::isUnsignedComponent(state.destFormat, 1) ? 0.0f : -dstScale.y;
+			const float lowerZ = Surface::isUnsignedComponent(state.destFormat, 2) ? 0.0f : -dstScale.z;
+			const float lowerW = Surface::isUnsignedComponent(state.destFormat, 3) ? 0.0f : -dstScale.w;
+
+			value = Min(value, Float4(dstScale.x, dstScale.y, dstScale.z, dstScale.w));
+			value = Max(value, Float4(lowerX, lowerY, lowerZ, lowerW));
+		}
+
+		return true;
+	}
+
+	// Emits the byte offset of texel (x, y) from the start of a surface.
+	//
+	// pitchB:     bytes per row.
+	// bytes:      bytes per texel.
+	// quadLayout: when true, texels are addressed in 2x2 quads: each pair of rows
+	//             shares one row offset, and the texel index within that pair is
+	//             (y & 1) * 2 + (x & ~1) * 2 + (x & 1).
+	Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes, bool quadLayout)
+	{
+		if(quadLayout)
+		{
+			// (x & ~1) * 2 + (x & 1) == (x - (x & 1)) * 2 + (x & 1) == x * 2 - (x & 1) * 2 + (x & 1) == x * 2 - (x & 1)
+			return (y & Int(~1)) * pitchB +
+			       ((y & Int(1)) * 2 + x * 2 - (x & Int(1))) * bytes;
+		}
+
+		// Linear layout: whole rows followed by whole texels.
+		return y * pitchB + x * bytes;
+	}
+
+	// Emits the linear -> sRGB encoding of the x, y and z components of 'c'.
+	// The w component is passed through unchanged.
+	Float4 Blitter::LinearToSRGB(Float4 &c)
+	{
+		// Linear segment, with its input capped at the 0.0031308 breakpoint.
+		Float4 linearPart = Min(c, Float4(0.0031308f)) * Float4(12.92f);
+		// Power segment: 1.055 * c^(1/2.4) - 0.055.
+		Float4 curvePart = Float4(1.055f) * power(c, Float4(1.0f / 2.4f)) - Float4(0.055f);
+
+		// The larger of the two candidates is the encoded value; this selects the
+		// segment without a per-component comparison mask.
+		Float4 encoded = c;
+		encoded.xyz = Max(linearPart, curvePart);
+
+		return encoded;
+	}
+
+	// Emits the sRGB -> linear decoding of the x, y and z components of 'c'.
+	// The w component is passed through unchanged.
+	Float4 Blitter::sRGBtoLinear(Float4 &c)
+	{
+		// Linear segment: c / 12.92.
+		Float4 linearPart = c * Float4(1.0f / 12.92f);
+		// Power segment: ((c + 0.055) / 1.055)^2.4.
+		Float4 curvePart = power((c + Float4(0.055f)) * Float4(1.0f / 1.055f), Float4(2.4f));
+
+		// Per-component mask, set where the input lies below the 0.04045 breakpoint.
+		Int4 useLinear = CmpLT(c, Float4(0.04045f));
+
+		// Bitwise select between the two segments using the mask.
+		Float4 decoded = c;
+		decoded.xyz = As<Float4>((useLinear & As<Int4>(linearPart)) | (~useLinear & As<Int4>(curvePart)));   // FIXME: IfThenElse()
+
+		return decoded;
+	}
+
+	// Builds the Reactor routine that performs a blit or clear for the given state.
+	//
+	// The generated function takes a pointer to a BlitData structure and, for every
+	// destination texel in [x0d, x1d) x [y0d, y1d), computes the matching source
+	// coordinate (x0 + i * w, y0 + j * h), fetches a color and writes it to all
+	// 'state.destSamples' destination samples, which lie dSliceB bytes apart.
+	//
+	// Code paths, selected at generation time from 'state':
+	//  - clear, integer -> integer: the source texel is read once, as Int4.
+	//  - clear, otherwise:          the source texel is read once, as Float4, then scaled.
+	//  - copy, integer -> integer:  point sampled, kept as Int4 to avoid losing precision.
+	//  - copy, otherwise:           point sampled or bilinearly filtered as Float4.
+	//
+	// Returns nullptr when a format is not supported by read(), write() or
+	// ApplyScaleAndClamp().
+	Routine *Blitter::generate(const State &state)
+	{
+		Function<Void(Pointer<Byte>)> function;
+		{
+			Pointer<Byte> blit(function.Arg<0>());
+
+			// Load the blit parameters from the BlitData argument.
+			Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,source));
+			Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData,dest));
+			Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData,sPitchB));
+			Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData,dPitchB));
+
+			Float x0 = *Pointer<Float>(blit + OFFSET(BlitData,x0));
+			Float y0 = *Pointer<Float>(blit + OFFSET(BlitData,y0));
+			Float w = *Pointer<Float>(blit + OFFSET(BlitData,w));
+			Float h = *Pointer<Float>(blit + OFFSET(BlitData,h));
+
+			Int x0d = *Pointer<Int>(blit + OFFSET(BlitData,x0d));
+			Int x1d = *Pointer<Int>(blit + OFFSET(BlitData,x1d));
+			Int y0d = *Pointer<Int>(blit + OFFSET(BlitData,y0d));
+			Int y1d = *Pointer<Int>(blit + OFFSET(BlitData,y1d));
+
+			Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData,sWidth));
+			Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData,sHeight));
+
+			// Format properties are known at generation time, so they select which
+			// code gets emitted rather than being tested by the routine itself.
+			bool intSrc = Surface::isNonNormalizedInteger(state.sourceFormat);
+			bool intDst = Surface::isNonNormalizedInteger(state.destFormat);
+			bool intBoth = intSrc && intDst;
+			bool srcQuadLayout = Surface::hasQuadLayout(state.sourceFormat);
+			bool dstQuadLayout = Surface::hasQuadLayout(state.destFormat);
+			int srcBytes = Surface::bytes(state.sourceFormat);
+			int dstBytes = Surface::bytes(state.destFormat);
+
+			// For clears the single source texel is read (and converted) once,
+			// outside of the loops.
+			bool hasConstantColorI = false;
+			Int4 constantColorI;
+			bool hasConstantColorF = false;
+			Float4 constantColorF;
+			if(state.clearOperation)
+			{
+				if(intBoth) // Integer types
+				{
+					if(!read(constantColorI, source, state))
+					{
+						return nullptr;
+					}
+					hasConstantColorI = true;
+				}
+				else
+				{
+					if(!read(constantColorF, source, state))
+					{
+						return nullptr;
+					}
+					hasConstantColorF = true;
+
+					if(!ApplyScaleAndClamp(constantColorF, state))
+					{
+						return nullptr;
+					}
+				}
+			}
+
+			For(Int j = y0d, j < y1d, j++)
+			{
+				Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
+				Pointer<Byte> destLine = dest + (dstQuadLayout ? j & Int(~1) : RValue<Int>(j)) * dPitchB;
+
+				For(Int i = x0d, i < x1d, i++)
+				{
+					Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
+					Pointer<Byte> d = destLine + (dstQuadLayout ? (((j & Int(1)) << 1) + (i * 2) - (i & Int(1))) : RValue<Int>(i)) * dstBytes;
+
+					if(hasConstantColorI)
+					{
+						// Write every destination sample, exactly like the float paths do;
+						// writing only the first one would leave the remaining samples of a
+						// multisampled integer target untouched.
+						for(int s = 0; s < state.destSamples; s++)
+						{
+							if(!write(constantColorI, d, state))
+							{
+								return nullptr;
+							}
+
+							d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+						}
+					}
+					else if(hasConstantColorF)
+					{
+						for(int s = 0; s < state.destSamples; s++)
+						{
+							if(!write(constantColorF, d, state))
+							{
+								return nullptr;
+							}
+
+							d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+						}
+					}
+					else if(intBoth) // Integer types do not support filtering
+					{
+						Int4 color; // When both formats are true integer types, we don't go to float to avoid losing precision
+						Int X = Int(x);
+						Int Y = Int(y);
+
+						if(state.clampToEdge)
+						{
+							X = Clamp(X, 0, sWidth - 1);
+							Y = Clamp(Y, 0, sHeight - 1);
+						}
+
+						Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes, srcQuadLayout);
+
+						if(!read(color, s, state))
+						{
+							return nullptr;
+						}
+
+						// Replicate the texel to every destination sample (see above).
+						for(int sample = 0; sample < state.destSamples; sample++)
+						{
+							if(!write(color, d, state))
+							{
+								return nullptr;
+							}
+
+							d += *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
+						}
+					}
+					else
+					{
+						Float4 color;
+
+						bool preScaled = false;
+						if(!state.filter || intSrc)
+						{
+							// Point sampling.
+							Int X = Int(x);
+							Int Y = Int(y);
+
+							if(state.clampToEdge)
+							{
+								X = Clamp(X, 0, sWidth - 1);
+								Y = Clamp(Y, 0, sHeight - 1);
+							}
+
+							Pointer<Byte> s = source + ComputeOffset(X, Y, sPitchB, srcBytes, srcQuadLayout);
+
+							if(!read(color, s, state))
+							{
+								return nullptr;
+							}
+						}
+						else   // Bilinear filtering
+						{
+							Float X = x;
+							Float Y = y;
+
+							if(state.clampToEdge)
+							{
+								X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
+								Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
+							}
+
+							// Shift by half a texel so that texel centers land on integers.
+							// Note: these shadow the outer x0/y0 loaded from BlitData.
+							Float x0 = X - 0.5f;
+							Float y0 = Y - 0.5f;
+
+							Int X0 = Max(Int(x0), 0);
+							Int Y0 = Max(Int(y0), 0);
+
+							// The second tap falls back to the first one at the right/bottom edge.
+							Int X1 = X0 + 1;
+							Int Y1 = Y0 + 1;
+							X1 = IfThenElse(X1 >= sWidth, X0, X1);
+							Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
+
+							Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, sPitchB, srcBytes, srcQuadLayout);
+							Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, sPitchB, srcBytes, srcQuadLayout);
+							Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, sPitchB, srcBytes, srcQuadLayout);
+							Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, sPitchB, srcBytes, srcQuadLayout);
+
+							Float4 c00; if(!read(c00, s00, state)) return nullptr;
+							Float4 c01; if(!read(c01, s01, state)) return nullptr;
+							Float4 c10; if(!read(c10, s10, state)) return nullptr;
+							Float4 c11; if(!read(c11, s11, state)) return nullptr;
+
+							// Decode sRGB taps before blending so that filtering happens on
+							// linear values; the final conversion is told they are pre-scaled.
+							if(state.convertSRGB && Surface::isSRGBformat(state.sourceFormat)) // sRGB -> RGB
+							{
+								if(!ApplyScaleAndClamp(c00, state)) return nullptr;
+								if(!ApplyScaleAndClamp(c01, state)) return nullptr;
+								if(!ApplyScaleAndClamp(c10, state)) return nullptr;
+								if(!ApplyScaleAndClamp(c11, state)) return nullptr;
+								preScaled = true;
+							}
+
+							// Blend weights from the fractional position between the taps.
+							Float4 fx = Float4(x0 - Float(X0));
+							Float4 fy = Float4(y0 - Float(Y0));
+							Float4 ix = Float4(1.0f) - fx;
+							Float4 iy = Float4(1.0f) - fy;
+
+							color = (c00 * ix + c01 * fx) * iy +
+							        (c10 * ix + c11 * fx) * fy;
+						}
+
+						if(!ApplyScaleAndClamp(color, state, preScaled))
+						{
+							return nullptr;
+						}
+
+						for(int s = 0; s < state.destSamples; s++)
+						{
+							if(!write(color, d, state))
+							{
+								return nullptr;
+							}
+
+							d += *Pointer<Int>(blit + OFFSET(BlitData,dSliceB));
+						}
+					}
+				}
+			}
+		}
+
+		return function(L"BlitRoutine");
+	}
+
+	// Performs a blit (or clear) through a generated Reactor routine.
+	//
+	// The routine matching the formats and options is looked up in the routine cache
+	// and generated on a miss. Both surfaces are then locked, the routine is run over
+	// the destination rectangle, and the surfaces are unlocked again.
+	//
+	// A destination rectangle with x0 > x1 or y0 > y1 is normalized by swapping the
+	// corresponding edges of both the destination and the source rectangle.
+	//
+	// Returns false when no routine could be generated for this state.
+	bool Blitter::blitReactor(Surface *source, const SliceRectF &sourceRect, Surface *dest, const SliceRect &destRect, const Blitter::Options &options)
+	{
+		ASSERT(!options.clearOperation || ((source->getWidth() == 1) && (source->getHeight() == 1) && (source->getDepth() == 1)));
+
+		RectF srcArea = sourceRect;
+		Rect dstArea = destRect;
+
+		if(destRect.x0 > destRect.x1)
+		{
+			swap(dstArea.x0, dstArea.x1);
+			swap(srcArea.x0, srcArea.x1);
+		}
+
+		if(destRect.y0 > destRect.y1)
+		{
+			swap(dstArea.y0, dstArea.y1);
+			swap(srcArea.y0, srcArea.y1);
+		}
+
+		const bool isStencil = options.useStencil;
+		const bool useSourceInternal = !source->isExternalDirty();
+		const bool useDestInternal = !dest->isExternalDirty();
+
+		// Describe the routine that is needed.
+		State state(options);
+		state.clampToEdge = (sourceRect.x0 < 0.0f) ||
+		                    (sourceRect.y0 < 0.0f) ||
+		                    (sourceRect.x1 > (float)source->getWidth()) ||
+		                    (sourceRect.y1 > (float)source->getHeight());
+		state.sourceFormat = isStencil ? source->getStencilFormat() : source->getFormat(useSourceInternal);
+		state.destFormat = isStencil ? dest->getStencilFormat() : dest->getFormat(useDestInternal);
+		state.destSamples = dest->getSamples();
+
+		// Cache lookup and insertion happen under the blitter's lock.
+		criticalSection.lock();
+
+		Routine *routine = blitCache->query(state);
+
+		if(!routine)
+		{
+			routine = generate(state);
+
+			if(routine)
+			{
+				blitCache->add(state, routine);
+			}
+		}
+
+		criticalSection.unlock();
+
+		if(!routine)
+		{
+			return false;
+		}
+
+		void (*blitFunction)(const BlitData *data) = (void(*)(const BlitData*))routine->getEntry();
+
+		const bool isRGBA = options.writeMask == 0xF;
+		const bool isEntireDest = dest->isEntire(destRect);
+
+		BlitData data;
+
+		if(isStencil)
+		{
+			data.source = source->lockStencil(0, 0, 0, sw::PUBLIC);
+			data.dest = dest->lockStencil(0, 0, 0, sw::PUBLIC);
+			data.sPitchB = source->getStencilPitchB();
+			data.dPitchB = dest->getStencilPitchB();
+			data.dSliceB = dest->getStencilSliceB();
+		}
+		else
+		{
+			// The destination contents only need to be preserved when some channels are masked out.
+			data.source = source->lock(0, 0, sourceRect.slice, sw::LOCK_READONLY, sw::PUBLIC, useSourceInternal);
+			data.dest = dest->lock(0, 0, destRect.slice, isRGBA ? (isEntireDest ? sw::LOCK_DISCARD : sw::LOCK_WRITEONLY) : sw::LOCK_READWRITE, sw::PUBLIC, useDestInternal);
+			data.sPitchB = source->getPitchB(useSourceInternal);
+			data.dPitchB = dest->getPitchB(useDestInternal);
+			data.dSliceB = dest->getSliceB(useDestInternal);
+		}
+
+		// Source step per destination texel, and the source coordinate that
+		// corresponds to destination texel (0, 0), sampled at texel centers.
+		data.w = srcArea.width() / dstArea.width();
+		data.h = srcArea.height() / dstArea.height();
+		data.x0 = srcArea.x0 + (0.5f - dstArea.x0) * data.w;
+		data.y0 = srcArea.y0 + (0.5f - dstArea.y0) * data.h;
+
+		data.x0d = dstArea.x0;
+		data.x1d = dstArea.x1;
+		data.y0d = dstArea.y0;
+		data.y1d = dstArea.y1;
+
+		data.sWidth = source->getWidth();
+		data.sHeight = source->getHeight();
+
+		blitFunction(&data);
+
+		if(isStencil)
+		{
+			source->unlockStencil();
+			dest->unlockStencil();
+		}
+		else
+		{
+			source->unlock(useSourceInternal);
+			dest->unlock(useDestInternal);
+		}
+
+		return true;
+	}
+}
diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
new file mode 100644
index 0000000..e3db745
--- /dev/null
+++ b/src/Device/Blitter.hpp
@@ -0,0 +1,121 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Blitter_hpp
+#define sw_Blitter_hpp
+
+#include "Surface.hpp"
+#include "RoutineCache.hpp"
+#include "Reactor/Reactor.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+	// Copies, converts and clears surfaces using routines generated with Reactor.
+	// Generated routines are kept in a cache keyed by State.
+	class Blitter
+	{
+		// Caller-selectable behavior of a blit or clear.
+		struct Options
+		{
+			Options() = default;
+
+			// Blit: all channels are written.
+			Options(bool filter, bool useStencil, bool convertSRGB)
+				: writeMask(0xF), clearOperation(false), filter(filter), useStencil(useStencil), convertSRGB(convertSRGB), clampToEdge(false) {}
+
+			// Clear: only the channels selected by 'writeMask' are written.
+			Options(unsigned int writeMask)
+				: writeMask(writeMask), clearOperation(true), filter(false), useStencil(false), convertSRGB(true), clampToEdge(false) {}
+
+			// Per-channel write enables, also accessible as one mask.
+			union
+			{
+				struct
+				{
+					bool writeRed : 1;
+					bool writeGreen : 1;
+					bool writeBlue : 1;
+					bool writeAlpha : 1;
+				};
+
+				unsigned char writeMask;
+			};
+
+			bool clearOperation : 1;
+			bool filter : 1;
+			bool useStencil : 1;
+			bool convertSRGB : 1;
+			bool clampToEdge : 1;
+		};
+
+		// Everything that influences the generated routine; used as the cache key.
+		struct State : Options
+		{
+			State() = default;
+			State(const Options &options) : Options(options) {}
+
+			// Compares the states member by member.
+			// A raw memcmp() over the object is not reliable here: the unused bits of
+			// the bit-fields and the alignment padding are never initialized by the
+			// constructors, so two logically identical states could compare unequal
+			// and miss the routine cache.
+			bool operator==(const State &state) const
+			{
+				return writeMask == state.writeMask &&
+				       clearOperation == state.clearOperation &&
+				       filter == state.filter &&
+				       useStencil == state.useStencil &&
+				       convertSRGB == state.convertSRGB &&
+				       clampToEdge == state.clampToEdge &&
+				       sourceFormat == state.sourceFormat &&
+				       destFormat == state.destFormat &&
+				       destSamples == state.destSamples;
+			}
+
+			Format sourceFormat;
+			Format destFormat;
+			int destSamples;
+		};
+
+		// Argument block passed to a generated blit routine.
+		struct BlitData
+		{
+			void *source;
+			void *dest;
+			int sPitchB;   // Source bytes per row
+			int dPitchB;   // Destination bytes per row
+			int dSliceB;   // Byte distance between destination samples
+
+			// Source coordinate of destination texel (i, j) is (x0 + i * w, y0 + j * h).
+			float x0;
+			float y0;
+			float w;
+			float h;
+
+			// Destination rectangle; the upper bounds are exclusive.
+			int y0d;
+			int y1d;
+			int x0d;
+			int x1d;
+
+			// Source dimensions, used for clamping.
+			int sWidth;
+			int sHeight;
+		};
+
+	public:
+		Blitter();
+		virtual ~Blitter();
+
+		void clear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+		void blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, const Options &options);
+		void blit3D(Surface *source, Surface *dest);
+
+	private:
+		bool fastClear(void *pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+
+		// Code generation helpers; they return false for unsupported formats.
+		bool read(Float4 &color, Pointer<Byte> element, const State &state);
+		bool write(Float4 &color, Pointer<Byte> element, const State &state);
+		bool read(Int4 &color, Pointer<Byte> element, const State &state);
+		bool write(Int4 &color, Pointer<Byte> element, const State &state);
+		static bool GetScale(float4& scale, Format format);
+		static bool ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled = false);
+		static Int ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes, bool quadLayout);
+		static Float4 LinearToSRGB(Float4 &color);
+		static Float4 sRGBtoLinear(Float4 &color);
+		bool blitReactor(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, const Options &options);
+		Routine *generate(const State &state);
+
+		RoutineCache<State> *blitCache;   // Owned; created in the constructor, deleted in the destructor
+		MutexLock criticalSection;        // Held while blitCache is queried or extended
+	};
+}
+
+#endif   // sw_Blitter_hpp
diff --git a/src/Device/Clipper.cpp b/src/Device/Clipper.cpp
new file mode 100644
index 0000000..a100f05
--- /dev/null
+++ b/src/Device/Clipper.cpp
@@ -0,0 +1,359 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Clipper.hpp"
+
+#include "Polygon.hpp"
+#include "Renderer.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	// Sets the near plane factor: vertices with z < n * w are outside the near plane.
+	// n is -1 when the normalized depth range is symmetric, and 0 otherwise.
+	Clipper::Clipper(bool symmetricNormalizedDepth)
+		: n(symmetricNormalizedDepth ? -1.0f : 0.0f)
+	{
+	}
+
+	// The clipper owns no resources, so there is nothing to release.
+	Clipper::~Clipper() = default;
+
+	// Classifies a clip-space position against the six frustum planes.
+	// Returns a combination of ClipFlags with one bit set for every plane the vertex
+	// lies outside of. CLIP_FINITE is always set.
+	unsigned int Clipper::computeClipFlags(const float4 &v)
+	{
+		unsigned int flags = Clipper::CLIP_FINITE;   // FIXME: xyz finite
+
+		if(v.x > v.w)     { flags |= CLIP_RIGHT; }
+		if(v.y > v.w)     { flags |= CLIP_TOP; }
+		if(v.z > v.w)     { flags |= CLIP_FAR; }
+		if(v.x < -v.w)    { flags |= CLIP_LEFT; }
+		if(v.y < -v.w)    { flags |= CLIP_BOTTOM; }
+		if(v.z < n * v.w) { flags |= CLIP_NEAR; }
+
+		return flags;
+	}
+
+	// Clips a polygon against the frustum planes flagged in 'clipFlagsOr' and, when
+	// any user plane bit is set there, against the user clip planes enabled in
+	// 'draw.clipFlags'.
+	//
+	// Clipping stops as soon as fewer than three vertices remain.
+	// Returns true when the resulting polygon still has at least three vertices.
+	bool Clipper::clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw)
+	{
+		if(clipFlagsOr & CLIP_FRUSTUM)
+		{
+			// The near plane is always attempted first; every later plane is only
+			// applied while the polygon is still non-degenerate.
+			if(clipFlagsOr & CLIP_NEAR)
+			{
+				clipNear(polygon);
+			}
+
+			if(polygon.n >= 3 && (clipFlagsOr & CLIP_FAR))
+			{
+				clipFar(polygon);
+			}
+
+			if(polygon.n >= 3 && (clipFlagsOr & CLIP_LEFT))
+			{
+				clipLeft(polygon);
+			}
+
+			if(polygon.n >= 3 && (clipFlagsOr & CLIP_RIGHT))
+			{
+				clipRight(polygon);
+			}
+
+			if(polygon.n >= 3 && (clipFlagsOr & CLIP_TOP))
+			{
+				clipTop(polygon);
+			}
+
+			if(polygon.n >= 3 && (clipFlagsOr & CLIP_BOTTOM))
+			{
+				clipBottom(polygon);
+			}
+		}
+
+		if(clipFlagsOr & CLIP_USER)
+		{
+			int clipFlags = draw.clipFlags;
+			DrawData &data = *draw.data;
+
+			// CLIP_PLANE0 .. CLIP_PLANE5 occupy six consecutive bits.
+			for(int plane = 0; plane < 6 && polygon.n >= 3; plane++)
+			{
+				if(clipFlags & (CLIP_PLANE0 << plane))
+				{
+					clipPlane(polygon, data.clipPlane[plane]);
+				}
+			}
+		}
+
+		return polygon.n >= 3;
+	}
+
+	// Clips the polygon against the near plane (z >= n * w is inside).
+	// Reads the current vertex list, writes the clipped list to the next one, and
+	// stores newly created vertices in the polygon's B buffer.
+	void Clipper::clipNear(Polygon &polygon)
+	{
+		const float4 **in = polygon.P[polygon.i];
+		const float4 **out = polygon.P[polygon.i + 1];
+		const int count = polygon.n;
+
+		int emitted = 0;
+
+		for(int cur = 0; cur < count; cur++)
+		{
+			const int next = (cur + 1 == count) ? 0 : cur + 1;   // Wrap around to close the loop
+
+			// Signed distances to the plane; non-negative means inside.
+			const float dCur = in[cur]->z - n * in[cur]->w;
+			const float dNext = in[next]->z - n * in[next]->w;
+
+			if(dCur >= 0)
+			{
+				out[emitted++] = in[cur];
+
+				if(dNext < 0)   // Edge leaves the visible side
+				{
+					float4 &cut = polygon.B[polygon.b++];
+					clipEdge(cut, *in[cur], *in[next], dCur, dNext);
+					out[emitted++] = &cut;
+				}
+			}
+			else if(dNext > 0)   // Edge enters the visible side
+			{
+				float4 &cut = polygon.B[polygon.b++];
+				clipEdge(cut, *in[next], *in[cur], dNext, dCur);
+				out[emitted++] = &cut;
+			}
+		}
+
+		polygon.n = emitted;
+		polygon.i += 1;
+	}
+
+	// Clips the polygon against the far plane (z <= w is inside).
+	// Reads the current vertex list, writes the clipped list to the next one, and
+	// stores newly created vertices in the polygon's B buffer.
+	void Clipper::clipFar(Polygon &polygon)
+	{
+		const float4 **in = polygon.P[polygon.i];
+		const float4 **out = polygon.P[polygon.i + 1];
+		const int count = polygon.n;
+
+		int emitted = 0;
+
+		for(int cur = 0; cur < count; cur++)
+		{
+			const int next = (cur + 1 == count) ? 0 : cur + 1;   // Wrap around to close the loop
+
+			// Signed distances to the plane; non-negative means inside.
+			const float dCur = in[cur]->w - in[cur]->z;
+			const float dNext = in[next]->w - in[next]->z;
+
+			if(dCur >= 0)
+			{
+				out[emitted++] = in[cur];
+
+				if(dNext < 0)   // Edge leaves the visible side
+				{
+					float4 &cut = polygon.B[polygon.b++];
+					clipEdge(cut, *in[cur], *in[next], dCur, dNext);
+					out[emitted++] = &cut;
+				}
+			}
+			else if(dNext > 0)   // Edge enters the visible side
+			{
+				float4 &cut = polygon.B[polygon.b++];
+				clipEdge(cut, *in[next], *in[cur], dNext, dCur);
+				out[emitted++] = &cut;
+			}
+		}
+
+		polygon.n = emitted;
+		polygon.i += 1;
+	}
+
+	// Clips the polygon against the left plane (x >= -w is inside).
+	// Reads the current vertex list, writes the clipped list to the next one, and
+	// stores newly created vertices in the polygon's B buffer.
+	void Clipper::clipLeft(Polygon &polygon)
+	{
+		const float4 **in = polygon.P[polygon.i];
+		const float4 **out = polygon.P[polygon.i + 1];
+		const int count = polygon.n;
+
+		int emitted = 0;
+
+		for(int cur = 0; cur < count; cur++)
+		{
+			const int next = (cur + 1 == count) ? 0 : cur + 1;   // Wrap around to close the loop
+
+			// Signed distances to the plane; non-negative means inside.
+			const float dCur = in[cur]->w + in[cur]->x;
+			const float dNext = in[next]->w + in[next]->x;
+
+			if(dCur >= 0)
+			{
+				out[emitted++] = in[cur];
+
+				if(dNext < 0)   // Edge leaves the visible side
+				{
+					float4 &cut = polygon.B[polygon.b++];
+					clipEdge(cut, *in[cur], *in[next], dCur, dNext);
+					out[emitted++] = &cut;
+				}
+			}
+			else if(dNext > 0)   // Edge enters the visible side
+			{
+				float4 &cut = polygon.B[polygon.b++];
+				clipEdge(cut, *in[next], *in[cur], dNext, dCur);
+				out[emitted++] = &cut;
+			}
+		}
+
+		polygon.n = emitted;
+		polygon.i += 1;
+	}
+
+	// Clips the polygon against the right plane (x <= w is inside).
+	// Reads the current vertex list, writes the clipped list to the next one, and
+	// stores newly created vertices in the polygon's B buffer.
+	void Clipper::clipRight(Polygon &polygon)
+	{
+		const float4 **in = polygon.P[polygon.i];
+		const float4 **out = polygon.P[polygon.i + 1];
+		const int count = polygon.n;
+
+		int emitted = 0;
+
+		for(int cur = 0; cur < count; cur++)
+		{
+			const int next = (cur + 1 == count) ? 0 : cur + 1;   // Wrap around to close the loop
+
+			// Signed distances to the plane; non-negative means inside.
+			const float dCur = in[cur]->w - in[cur]->x;
+			const float dNext = in[next]->w - in[next]->x;
+
+			if(dCur >= 0)
+			{
+				out[emitted++] = in[cur];
+
+				if(dNext < 0)   // Edge leaves the visible side
+				{
+					float4 &cut = polygon.B[polygon.b++];
+					clipEdge(cut, *in[cur], *in[next], dCur, dNext);
+					out[emitted++] = &cut;
+				}
+			}
+			else if(dNext > 0)   // Edge enters the visible side
+			{
+				float4 &cut = polygon.B[polygon.b++];
+				clipEdge(cut, *in[next], *in[cur], dNext, dCur);
+				out[emitted++] = &cut;
+			}
+		}
+
+		polygon.n = emitted;
+		polygon.i += 1;
+	}
+
+	// Clips the polygon against the top plane (y <= w is inside).
+	// Reads the current vertex list, writes the clipped list to the next one, and
+	// stores newly created vertices in the polygon's B buffer.
+	void Clipper::clipTop(Polygon &polygon)
+	{
+		const float4 **in = polygon.P[polygon.i];
+		const float4 **out = polygon.P[polygon.i + 1];
+		const int count = polygon.n;
+
+		int emitted = 0;
+
+		for(int cur = 0; cur < count; cur++)
+		{
+			const int next = (cur + 1 == count) ? 0 : cur + 1;   // Wrap around to close the loop
+
+			// Signed distances to the plane; non-negative means inside.
+			const float dCur = in[cur]->w - in[cur]->y;
+			const float dNext = in[next]->w - in[next]->y;
+
+			if(dCur >= 0)
+			{
+				out[emitted++] = in[cur];
+
+				if(dNext < 0)   // Edge leaves the visible side
+				{
+					float4 &cut = polygon.B[polygon.b++];
+					clipEdge(cut, *in[cur], *in[next], dCur, dNext);
+					out[emitted++] = &cut;
+				}
+			}
+			else if(dNext > 0)   // Edge enters the visible side
+			{
+				float4 &cut = polygon.B[polygon.b++];
+				clipEdge(cut, *in[next], *in[cur], dNext, dCur);
+				out[emitted++] = &cut;
+			}
+		}
+
+		polygon.n = emitted;
+		polygon.i += 1;
+	}
+
+	// Clips the polygon against the bottom plane (y >= -w is inside).
+	// Reads the current vertex list, writes the clipped list to the next one, and
+	// stores newly created vertices in the polygon's B buffer.
+	void Clipper::clipBottom(Polygon &polygon)
+	{
+		const float4 **in = polygon.P[polygon.i];
+		const float4 **out = polygon.P[polygon.i + 1];
+		const int count = polygon.n;
+
+		int emitted = 0;
+
+		for(int cur = 0; cur < count; cur++)
+		{
+			const int next = (cur + 1 == count) ? 0 : cur + 1;   // Wrap around to close the loop
+
+			// Signed distances to the plane; non-negative means inside.
+			const float dCur = in[cur]->w + in[cur]->y;
+			const float dNext = in[next]->w + in[next]->y;
+
+			if(dCur >= 0)
+			{
+				out[emitted++] = in[cur];
+
+				if(dNext < 0)   // Edge leaves the visible side
+				{
+					float4 &cut = polygon.B[polygon.b++];
+					clipEdge(cut, *in[cur], *in[next], dCur, dNext);
+					out[emitted++] = &cut;
+				}
+			}
+			else if(dNext > 0)   // Edge enters the visible side
+			{
+				float4 &cut = polygon.B[polygon.b++];
+				clipEdge(cut, *in[next], *in[cur], dNext, dCur);
+				out[emitted++] = &cut;
+			}
+		}
+
+		polygon.n = emitted;
+		polygon.i += 1;
+	}
+
+	// Clips the polygon against an arbitrary plane; a vertex is inside when
+	// A * x + B * y + C * z + D * w >= 0.
+	// Reads the current vertex list, writes the clipped list to the next one, and
+	// stores newly created vertices in the polygon's B buffer.
+	void Clipper::clipPlane(Polygon &polygon, const Plane &p)
+	{
+		const float4 **in = polygon.P[polygon.i];
+		const float4 **out = polygon.P[polygon.i + 1];
+		const int count = polygon.n;
+
+		int emitted = 0;
+
+		for(int cur = 0; cur < count; cur++)
+		{
+			const int next = (cur + 1 == count) ? 0 : cur + 1;   // Wrap around to close the loop
+
+			// Signed distances to the plane; non-negative means inside.
+			const float dCur = p.A * in[cur]->x + p.B * in[cur]->y + p.C * in[cur]->z + p.D * in[cur]->w;
+			const float dNext = p.A * in[next]->x + p.B * in[next]->y + p.C * in[next]->z + p.D * in[next]->w;
+
+			if(dCur >= 0)
+			{
+				out[emitted++] = in[cur];
+
+				if(dNext < 0)   // Edge leaves the visible side
+				{
+					float4 &cut = polygon.B[polygon.b++];
+					clipEdge(cut, *in[cur], *in[next], dCur, dNext);
+					out[emitted++] = &cut;
+				}
+			}
+			else if(dNext > 0)   // Edge enters the visible side
+			{
+				float4 &cut = polygon.B[polygon.b++];
+				clipEdge(cut, *in[next], *in[cur], dNext, dCur);
+				out[emitted++] = &cut;
+			}
+		}
+
+		polygon.n = emitted;
+		polygon.i += 1;
+	}
+
+	// Computes the point where the edge Vi-Vj crosses a clip plane.
+	// di and dj are the signed plane distances of Vi and Vj; the result is the
+	// linear combination (dj * Vi - di * Vj) / (dj - di), at which the interpolated
+	// distance is zero. The callers only invoke this when di and dj differ in sign.
+	inline void Clipper::clipEdge(float4 &Vo, const float4 &Vi, const float4 &Vj, float di, float dj) const
+	{
+		const float invSpan = 1.0f / (dj - di);
+
+		Vo.x = (dj * Vi.x - di * Vj.x) * invSpan;
+		Vo.y = (dj * Vi.y - di * Vj.y) * invSpan;
+		Vo.z = (dj * Vi.z - di * Vj.z) * invSpan;
+		Vo.w = (dj * Vi.w - di * Vj.w) * invSpan;
+	}
+}
diff --git a/src/Device/Clipper.hpp b/src/Device/Clipper.hpp
new file mode 100644
index 0000000..057153a
--- /dev/null
+++ b/src/Device/Clipper.hpp
@@ -0,0 +1,77 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Clipper_hpp
+#define sw_Clipper_hpp
+
+#include "Plane.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	struct Polygon;
+	struct DrawCall;
+	struct DrawData;
+
+	// Clips polygons in homogeneous clip space against the view frustum and
+	// against user-defined clip planes.
+	class Clipper
+	{
+	public:
+		enum ClipFlags
+		{
+			// One bit per frustum plane, set when a vertex is outside that plane.
+			CLIP_RIGHT  = 0x0001,
+			CLIP_TOP    = 0x0002,
+			CLIP_FAR    = 0x0004,
+			CLIP_LEFT   = 0x0008,
+			CLIP_BOTTOM = 0x0010,
+			CLIP_NEAR   = 0x0020,
+
+			// Mask covering all six frustum planes (0x003F).
+			CLIP_FRUSTUM = CLIP_RIGHT | CLIP_TOP | CLIP_FAR | CLIP_LEFT | CLIP_BOTTOM | CLIP_NEAR,
+
+			CLIP_FINITE = 0x0080,   // All position coordinates are finite
+
+			// One bit per user-defined clipping plane.
+			CLIP_PLANE0 = 0x0100,
+			CLIP_PLANE1 = 0x0200,
+			CLIP_PLANE2 = 0x0400,
+			CLIP_PLANE3 = 0x0800,
+			CLIP_PLANE4 = 0x1000,
+			CLIP_PLANE5 = 0x2000,
+
+			// Mask covering all six user planes (0x3F00).
+			CLIP_USER = CLIP_PLANE0 | CLIP_PLANE1 | CLIP_PLANE2 | CLIP_PLANE3 | CLIP_PLANE4 | CLIP_PLANE5
+		};
+
+		// symmetricNormalizedDepth selects the near plane: z >= -w when true, z >= 0 otherwise.
+		Clipper(bool symmetricNormalizedDepth);
+
+		~Clipper();
+
+		// Returns the frustum ClipFlags for a clip-space position, plus CLIP_FINITE.
+		unsigned int computeClipFlags(const float4 &v);
+
+		// Clips 'polygon' against the flagged planes; returns whether at least
+		// three vertices remain.
+		bool clip(Polygon &polygon, int clipFlagsOr, const DrawCall &draw);
+
+	private:
+		// Each of these clips the polygon against a single plane.
+		void clipNear(Polygon &polygon);
+		void clipFar(Polygon &polygon);
+		void clipLeft(Polygon &polygon);
+		void clipRight(Polygon &polygon);
+		void clipTop(Polygon &polygon);
+		void clipBottom(Polygon &polygon);
+		void clipPlane(Polygon &polygon, const Plane &plane);
+
+		// Computes the vertex where the edge Vi-Vj crosses the plane, given the
+		// signed plane distances di and dj of its end points.
+		void clipEdge(float4 &Vo, const float4 &Vi, const float4 &Vj, float di, float dj) const;
+
+		float n;   // Near clip plane distance
+	};
+}
+
+#endif   // sw_Clipper_hpp
diff --git a/src/Device/Color.cpp b/src/Device/Color.cpp
new file mode 100644
index 0000000..9ad6767
--- /dev/null
+++ b/src/Device/Color.cpp
@@ -0,0 +1,19 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Color.hpp"
+
+namespace sw
+{
+}
diff --git a/src/Device/Color.hpp b/src/Device/Color.hpp
new file mode 100644
index 0000000..7afe61f
--- /dev/null
+++ b/src/Device/Color.hpp
@@ -0,0 +1,472 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Color_hpp
+#define sw_Color_hpp
+
+#include "Common/Types.hpp"
+#include "Common/Math.hpp"
+
+namespace sw
+{
+	template<class T>
+	struct Color   // Four-channel (r, g, b, a) color whose channels are stored as T
+	{
+		Color();
+	
+		Color(const Color<byte> &c);
+		Color(const Color<short> &c);
+		Color(const Color<float> &c);
+		
+		Color(int c);   // Packed 0xAARRGGBB
+		Color(unsigned short c);   // Packed 5:6:5 RGB
+		Color(unsigned long c);   // Packed 0xAARRGGBB
+		Color(unsigned int c);   // Packed 0xAARRGGBB
+		
+		Color(T r, T g, T b, T a = 1);
+
+		operator unsigned int() const;   // Packs the channels as 0xAARRGGBB
+
+		T &operator[](int i);   // 0 = r, 1 = g, 2 = b, 3 = a; no bounds check
+		const T &operator[](int i) const;
+
+		Color<T> operator+() const;
+		Color<T> operator-() const;
+
+		Color<T>& operator=(const Color<T>& c);
+
+		Color<T> &operator+=(const Color<T> &c);
+		Color<T> &operator*=(float l);
+
+		static Color<T> gradient(const Color<T> &c1, const Color<T>  &c2, float d);   // (c2 - c1) / d per channel
+		static Color<T> shade(const Color<T> &c1, const Color<T>  &c2, float d);   // c1 + d * (c2 - c1) per channel
+
+		template<class S>
+		friend Color<S> operator+(const Color<S> &c1, const Color<S> &c2);
+		template<class S>
+		friend Color<S> operator-(const Color<S> &c1, const Color<S> &c2);
+
+		template<class S>
+		friend Color<S> operator*(float l, const Color<S> &c);
+		template<class S>
+		friend Color<S> operator*(const Color<S> &c1, const Color<S> &c2);
+		template<class S>
+		friend Color<S> operator/(const Color<S> &c, float l);
+
+		T r;
+		T g;
+		T b;
+		T a;
+	};
+}
+
+#include "Common/Math.hpp"
+
+namespace sw
+{
+	template<class T>
+	inline Color<T>::Color()   // Empty body: r, g, b and a are not assigned here
+	{
+	}
+
+	template<>   // Same-format copy: channels are taken over unchanged
+	inline Color<byte>::Color(const Color<byte> &c)
+		: r(c.r),
+		  g(c.g),
+		  b(c.b),
+		  a(c.a)
+	{
+	}
+
+	template<>   // Each short channel is shifted right by 4 and clamped to [0, 255]
+	inline Color<byte>::Color(const Color<short> &c)
+		: r(clamp(c.r >> 4, 0, 255)),
+		  g(clamp(c.g >> 4, 0, 255)),
+		  b(clamp(c.b >> 4, 0, 255)),
+		  a(clamp(c.a >> 4, 0, 255))
+	{
+	}
+
+	template<>   // Each float channel is scaled by 256, clamped to [0, 255] and passed through ifloor()
+	inline Color<byte>::Color(const Color<float> &c)
+		: r(ifloor(clamp(c.r * 256.0f, 0.0f, 255.0f))),
+		  g(ifloor(clamp(c.g * 256.0f, 0.0f, 255.0f))),
+		  b(ifloor(clamp(c.b * 256.0f, 0.0f, 255.0f))),
+		  a(ifloor(clamp(c.a * 256.0f, 0.0f, 255.0f)))
+	{
+	}
+
+	template<>   // Same-format copy: channels are taken over unchanged
+	inline Color<short>::Color(const Color<short> &c)
+		: r(c.r),
+		  g(c.g),
+		  b(c.b),
+		  a(c.a)
+	{
+	}
+
+	template<>   // Each byte channel is shifted left by 4 (255 becomes 4080)
+	inline Color<short>::Color(const Color<byte> &c)
+		: r(c.r << 4),
+		  g(c.g << 4),
+		  b(c.b << 4),
+		  a(c.a << 4)
+	{
+	}
+
+	template<>   // Same-format copy: channels are taken over unchanged
+	inline Color<float>::Color(const Color<float> &c)
+		: r(c.r),
+		  g(c.g),
+		  b(c.b),
+		  a(c.a)
+	{
+	}
+
+	template<>   // Each float channel is scaled by 4095, clamped to [-4096, 4095] and passed through iround()
+	inline Color<short>::Color(const Color<float> &c)
+		: r(iround(clamp(c.r * 4095.0f, -4096.0f, 4095.0f))),
+		  g(iround(clamp(c.g * 4095.0f, -4096.0f, 4095.0f))),
+		  b(iround(clamp(c.b * 4095.0f, -4096.0f, 4095.0f))),
+		  a(iround(clamp(c.a * 4095.0f, -4096.0f, 4095.0f)))
+	{
+	}
+
+	template<>   // Each byte channel is divided by 255, so 255 maps to 1.0f
+	inline Color<float>::Color(const Color<byte> &c)
+		: r(c.r / 255.0f),
+		  g(c.g / 255.0f),
+		  b(c.b / 255.0f),
+		  a(c.a / 255.0f)
+	{
+	}
+
+	template<>   // Each short channel is divided by 4095, so 4095 maps to 1.0f
+	inline Color<float>::Color(const Color<short> &c)
+		: r(c.r / 4095.0f),
+		  g(c.g / 4095.0f),
+		  b(c.b / 4095.0f),
+		  a(c.a / 4095.0f)
+	{
+	}
+
+	template<>   // Unpacks 5:6:5 RGB; each masked field is divided by its own mask, alpha is set to 1
+	inline Color<float>::Color(unsigned short c)
+		: r((float)(c & 0xF800) / (float)0xF800),
+		  g((float)(c & 0x07E0) / (float)0x07E0),
+		  b((float)(c & 0x001F) / (float)0x001F),
+		  a(1)
+	{
+	}
+
+	template<>
+	inline Color<short>::Color(unsigned short c)
+		// 4.12 fixed-point format; each 5:6:5 field is widened to 12 bits by repeating its high bits
+		: r(((c & 0xF800) >> 4) + ((c & 0xF800) >> 9) + ((c & 0xF800) >> 14)),
+		  g(((c & 0x07E0) << 1) + ((c & 0x07E0) >> 5)),
+		  b(((c & 0x001F) << 7) + ((c & 0x001F) << 2) + ((c & 0x001F) >> 3)),
+		  a(0x1000)
+	{
+	}
+
+	template<>   // Unpacks 5:6:5 RGB to 8 bits per channel by repeating each field's high bits; alpha is 0xFF
+	inline Color<byte>::Color(unsigned short c)
+		: r(static_cast<byte>(((c & 0xF800) >> 8) + ((c & 0xE000) >> 13))),
+		  g(static_cast<byte>(((c & 0x07E0) >> 3) + ((c & 0x0600) >> 9))),
+		  b(static_cast<byte>(((c & 0x001F) << 3) + ((c & 0x001C) >> 2))),
+		  a(0xFF)
+	{
+	}
+
+	template<>   // Unpacks 0xAARRGGBB; each 8-bit field is multiplied by 1/255
+	inline Color<float>::Color(int c)
+	{
+		const float inv255 = 1.0f / 255.0f;
+
+		a = inv255 * static_cast<float>((c & 0xFF000000) >> 24);
+		r = inv255 * static_cast<float>((c & 0x00FF0000) >> 16);
+		g = inv255 * static_cast<float>((c & 0x0000FF00) >> 8);
+		b = inv255 * static_cast<float>((c & 0x000000FF) >> 0);
+	}
+
+	template<>
+	inline Color<short>::Color(int c)
+		: r(static_cast<short>((c & 0x00FF0000) >> 12)),
+		  g(static_cast<short>((c & 0x0000FF00) >> 4)),
+		  b(static_cast<short>((c & 0x000000FF) << 4)),
+		  a(static_cast<short>((c & 0xFF000000) >> 20))
+	{
+		// 4.12 fixed-point format: every 8-bit field of 0xAARRGGBB lands in bits 4..11
+	}
+
+	template<>   // Unpacks 0xAARRGGBB, one byte per channel
+	inline Color<byte>::Color(int c)
+		: r(static_cast<byte>((c & 0x00FF0000) >> 16)),
+		  g(static_cast<byte>((c & 0x0000FF00) >> 8)),
+		  b(static_cast<byte>((c & 0x000000FF) >> 0)),
+		  a(static_cast<byte>((c & 0xFF000000) >> 24))
+	{
+	}
+
+	template<>   // Unpacks 0xAARRGGBB; each 8-bit field is multiplied by 1/255
+	inline Color<float>::Color(unsigned int c)
+	{
+		const float inv255 = 1.0f / 255.0f;
+
+		a = inv255 * static_cast<float>((c & 0xFF000000) >> 24);
+		r = inv255 * static_cast<float>((c & 0x00FF0000) >> 16);
+		g = inv255 * static_cast<float>((c & 0x0000FF00) >> 8);
+		b = inv255 * static_cast<float>((c & 0x000000FF) >> 0);
+	}
+
+	template<>
+	inline Color<short>::Color(unsigned int c)
+		: r(static_cast<short>((c & 0x00FF0000) >> 12)),
+		  g(static_cast<short>((c & 0x0000FF00) >> 4)),
+		  b(static_cast<short>((c & 0x000000FF) << 4)),
+		  a(static_cast<short>((c & 0xFF000000) >> 20))
+	{
+		// 4.12 fixed-point format: every 8-bit field of 0xAARRGGBB lands in bits 4..11
+	}
+
+	template<>   // Unpacks 0xAARRGGBB, one byte per channel
+	inline Color<byte>::Color(unsigned int c)
+		: r(static_cast<byte>((c & 0x00FF0000) >> 16)),
+		  g(static_cast<byte>((c & 0x0000FF00) >> 8)),
+		  b(static_cast<byte>((c & 0x000000FF) >> 0)),
+		  a(static_cast<byte>((c & 0xFF000000) >> 24))
+	{
+	}
+
+	template<>   // Unpacks the low 32 bits as 0xAARRGGBB; each 8-bit field is multiplied by 1/255
+	inline Color<float>::Color(unsigned long c)
+	{
+		const float inv255 = 1.0f / 255.0f;
+
+		a = inv255 * static_cast<float>((c & 0xFF000000) >> 24);
+		r = inv255 * static_cast<float>((c & 0x00FF0000) >> 16);
+		g = inv255 * static_cast<float>((c & 0x0000FF00) >> 8);
+		b = inv255 * static_cast<float>((c & 0x000000FF) >> 0);
+	}
+
+	template<>
+	inline Color<short>::Color(unsigned long c)
+		: r(static_cast<short>((c & 0x00FF0000) >> 12)),
+		  g(static_cast<short>((c & 0x0000FF00) >> 4)),
+		  b(static_cast<short>((c & 0x000000FF) << 4)),
+		  a(static_cast<short>((c & 0xFF000000) >> 20))
+	{
+		// 4.12 fixed-point format: every 8-bit field of 0xAARRGGBB lands in bits 4..11
+	}
+
+	template<>   // Unpacks the low 32 bits as 0xAARRGGBB, one byte per channel
+	inline Color<byte>::Color(unsigned long c)
+		: r(static_cast<byte>((c & 0x00FF0000) >> 16)),
+		  g(static_cast<byte>((c & 0x0000FF00) >> 8)),
+		  b(static_cast<byte>((c & 0x000000FF) >> 0)),
+		  a(static_cast<byte>((c & 0xFF000000) >> 24))
+	{
+	}
+
+	template<class T>   // Stores the four channels exactly as given
+	inline Color<T>::Color(T r_, T g_, T b_, T a_)
+		: r(r_),
+		  g(g_),
+		  b(b_),
+		  a(a_)
+	{
+	}
+
+	template<>   // Packs the color as 0xAARRGGBB; each channel is scaled by 255 and truncated to 8 bits
+	inline Color<float>::operator unsigned int() const
+	{   // Clamp to [0, 255] before the cast: converting a negative float to unsigned int is undefined behavior
+		return ((unsigned int)clamp(b * 255.0f, 0.0f, 255.0f) << 0) |
+		       ((unsigned int)clamp(g * 255.0f, 0.0f, 255.0f) << 8) |
+		       ((unsigned int)clamp(r * 255.0f, 0.0f, 255.0f) << 16) |
+		       ((unsigned int)clamp(a * 255.0f, 0.0f, 255.0f) << 24);
+	}
+
+	template<>   // Packs the color as 0xAARRGGBB; each channel is reduced to 8 bits by dropping its low 4 bits
+	inline Color<short>::operator unsigned int() const
+	{   // Clamp to [0, 255]: a negative channel would otherwise turn into a huge unsigned value and spill into the other fields
+		return ((unsigned int)clamp(b >> 4, 0, 255) << 0) |
+		       ((unsigned int)clamp(g >> 4, 0, 255) << 8) |
+		       ((unsigned int)clamp(r >> 4, 0, 255) << 16) |
+		       ((unsigned int)clamp(a >> 4, 0, 255) << 24);
+	}
+
+	template<>   // Packs the four bytes as 0xAARRGGBB
+	inline Color<byte>::operator unsigned int() const
+	{   // Widen to unsigned int before shifting so that a >= 0x80 is never shifted into the sign bit of a promoted int
+		return ((unsigned int)b << 0) |
+		       ((unsigned int)g << 8) |
+		       ((unsigned int)r << 16) |
+		       ((unsigned int)a << 24);
+	}
+
+	template<class T>   // Channel access by index, counted from r; relies on r, g, b, a being adjacent and does no bounds check
+	inline T &Color<T>::operator[](int i)
+	{
+		return *(&r + i);
+	}
+
+	template<class T>   // Read-only channel access by index, counted from r; no bounds check
+	inline const T &Color<T>::operator[](int i) const
+	{
+		return *(&r + i);
+	}
+
+	template<class T>
+	inline Color<T> Color<T>::operator+() const   // Unary plus: returns a copy of this color
+	{
+		return *this;
+	}
+
+	template<class T>
+	inline Color<T> Color<T>::operator-() const   // Negates every channel, alpha included
+	{
+		return Color(-r, -g, -b, -a);
+	}
+
+	template<class T>
+	inline Color<T> &Color<T>::operator=(const Color& c)   // Channel-by-channel copy; returns *this for chaining
+	{
+		r = c.r;
+		g = c.g;
+		b = c.b;
+		a = c.a;
+
+		return *this;
+	}
+
+	template<class T>
+	inline Color<T> &Color<T>::operator+=(const Color &c)   // Adds c to this color channel by channel, alpha included; no saturation is applied here
+	{
+		r += c.r;
+		g += c.g;
+		b += c.b;
+		a += c.a;
+
+		return *this;
+	}
+
+	template<class T>   // Scales in place by delegating to the free operator*(float, Color)
+	inline Color<T> &Color<T>::operator*=(float l)
+	{
+		Color<T> &self = *this;
+		self = l * self;
+		return self;
+	}
+
+	template<class T>
+	inline Color<T> operator+(const Color<T> &c1, const Color<T> &c2)
+	{
+		// Channel-wise sum; the constructor converts each result back to T
+		Color<T> sum(c1.r + c2.r, c1.g + c2.g, c1.b + c2.b, c1.a + c2.a);
+
+		return sum;
+	}
+
+	template<class T>
+	inline Color<T> operator-(const Color<T> &c1, const Color<T> &c2)
+	{
+		// Channel-wise difference; the constructor converts each result back to T
+		Color<T> difference(c1.r - c2.r, c1.g - c2.g, c1.b - c2.b, c1.a - c2.a);
+
+		return difference;
+	}
+
+	template<class T>
+	inline Color<T> operator*(float l, const Color<T> &c)
+	{
+		// Every channel, alpha included, is scaled in floating point and
+		// then converted back to T
+		return Color<T>(static_cast<T>(l * c.r),
+		                static_cast<T>(l * c.g),
+		                static_cast<T>(l * c.b),
+		                static_cast<T>(l * c.a));
+	}
+
+	template<class T>
+	inline Color<T> operator*(const Color<T> &c1, const Color<T> &c2)
+	{
+		// Generic channel-wise product with no rescaling; the short and byte
+		// specializations shift the product back into range instead
+		return Color<T>(c1.r * c2.r,
+		                c1.g * c2.g,
+		                c1.b * c2.b,
+		                c1.a * c2.a);
+	}
+
+	template<>
+	inline Color<short> operator*(const Color<short> &c1, const Color<short> &c2)
+	{
+		// Fixed-point product: the int product of two channels is shifted
+		// right by 12 before being narrowed back to short
+		return Color<short>((c1.r * c2.r) >> 12,
+		                    (c1.g * c2.g) >> 12,
+		                    (c1.b * c2.b) >> 12,
+		                    (c1.a * c2.a) >> 12);
+	}
+
+	template<>
+	inline Color<byte> operator*(const Color<byte> &c1, const Color<byte> &c2)
+	{
+		// The int product of two channels is shifted right by 8 before being
+		// narrowed back to byte (so 255 * 255 yields 254)
+		return Color<byte>((c1.r * c2.r) >> 8,
+		                   (c1.g * c2.g) >> 8,
+		                   (c1.b * c2.b) >> 8,
+		                   (c1.a * c2.a) >> 8);
+	}
+
+	template<class T>
+	inline Color<T> operator/(const Color<T> &c, float l)
+	{
+		// Divide once, then multiply each channel (alpha included) by the
+		// reciprocal and convert back to T. No check is made for l == 0.
+		const float reciprocal = 1.0f / l;
+
+		return Color<T>(static_cast<T>(reciprocal * c.r),
+		                static_cast<T>(reciprocal * c.g),
+		                static_cast<T>(reciprocal * c.b),
+		                static_cast<T>(reciprocal * c.a));
+	}
+
+	template<class T>
+	inline Color<T> Color<T>::gradient(const Color<T> &c1, const Color<T> &c2, float d)
+	{
+		// Per-channel slope from c1 to c2 over a distance of d, i.e.
+		// (c2 - c1) / d. No check is made for d == 0.
+		const float scale = 1.0f / d;
+
+		return Color<T>((c2.r - c1.r) * scale,
+		                (c2.g - c1.g) * scale,
+		                (c2.b - c1.b) * scale,
+		                (c2.a - c1.a) * scale);
+	}
+
+	template<class T>
+	inline Color<T> Color<T>::shade(const Color<T> &c1, const Color<T>  &c2, float d)
+	{
+		// Linear interpolation per channel: d == 0 gives c1 and d == 1 gives
+		// c2; the weighted difference is converted to T before being added
+		return Color<T>(c1.r + static_cast<T>(d * (c2.r - c1.r)),
+		                c1.g + static_cast<T>(d * (c2.g - c1.g)),
+		                c1.b + static_cast<T>(d * (c2.b - c1.b)),
+		                c1.a + static_cast<T>(d * (c2.a - c1.a)));
+	}
+}
+
+#endif   // sw_Color_hpp
diff --git a/src/Device/Config.cpp b/src/Device/Config.cpp
new file mode 100644
index 0000000..7cb309a
--- /dev/null
+++ b/src/Device/Config.cpp
@@ -0,0 +1,82 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Config.hpp"
+
+#include "Common/Thread.hpp"
+#include "Common/Timer.hpp"
+
+namespace sw
+{
+	Profiler profiler;
+
+	Profiler::Profiler()   // Starts with every counter zeroed via reset()
+	{
+		reset();
+	}
+
+	void Profiler::reset()   // Zeroes the frame counters and, when PERF_PROFILE is enabled, all timers and operation counters
+	{
+		framesSec = 0;
+		framesTotal = 0;
+		FPS = 0;
+
+		#if PERF_PROFILE
+			for(int i = 0; i < PERF_TIMERS; i++)
+			{
+				cycles[i] = 0;
+			}
+
+			ropOperations = 0;
+			ropOperationsTotal = 0;
+			ropOperationsFrame = 0;
+
+			texOperations = 0;
+			texOperationsTotal = 0;
+			texOperationsFrame = 0;
+
+			compressedTex = 0;
+			compressedTexTotal = 0;
+			compressedTexFrame = 0;
+		#endif
+	}
+
+	void Profiler::nextFrame()   // Counts one frame and refreshes FPS once more than a second has elapsed since the last refresh
+	{
+		#if PERF_PROFILE
+			ropOperationsFrame = sw::atomicExchange(&ropOperations, 0);   // Running counters are swapped with 0 (presumably returning the old value; confirm in Common/Thread.hpp)
+			texOperationsFrame = sw::atomicExchange(&texOperations, 0);
+			compressedTexFrame = sw::atomicExchange(&compressedTex, 0);
+
+			ropOperationsTotal += ropOperationsFrame;
+			texOperationsTotal += texOperationsFrame;
+			compressedTexTotal += compressedTexFrame;
+		#endif
+
+		static double fpsTime = sw::Timer::seconds();   // Initialized on the first call; function-local static, so shared by every Profiler instance
+
+		double time = sw::Timer::seconds();
+		double delta = time - fpsTime;
+		framesSec++;
+
+		if(delta > 1.0)
+		{
+			FPS = framesSec / delta;   // Average over the elapsed interval, which may exceed one second
+
+			fpsTime = time;
+			framesTotal += framesSec;   // framesTotal only advances when an interval completes
+			framesSec = 0;
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/Device/Config.hpp b/src/Device/Config.hpp
new file mode 100644
index 0000000..017e38b
--- /dev/null
+++ b/src/Device/Config.hpp
@@ -0,0 +1,103 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Config_hpp
+#define sw_Config_hpp
+
+#include "Common/Types.hpp"
+
+#define PERF_HUD 0       // Display time spent on vertex, setup and pixel processing for each thread
+#define PERF_PROFILE 0   // Profile various pipeline stages and display the timing in SwiftConfig
+
+#define ASTC_SUPPORT 0
+
+// Worker thread count when not set by SwiftConfig
+// 0 = process affinity count (recommended)
+// 1 = rendering on main thread (no worker threads), useful for debugging
+#ifndef DEFAULT_THREAD_COUNT
+#define DEFAULT_THREAD_COUNT 0
+#endif
+
+namespace sw
+{
+	enum   // Indices of the profiling timers
+	{
+		PERF_PIXEL,
+		PERF_PIPE,
+		PERF_INTERP,
+		PERF_SHADER,
+		PERF_TEX,
+		PERF_ROP,
+
+		PERF_TIMERS   // Number of timers above; sizes Profiler::cycles
+	};
+
+	struct Profiler   // Frame-rate counters, plus per-stage timers and operation counters when PERF_PROFILE is enabled
+	{
+		Profiler();
+
+		void reset();
+		void nextFrame();
+
+		int framesSec;   // Frames counted since FPS was last refreshed
+		int framesTotal;   // Frames accumulated over completed FPS intervals
+		double FPS;
+
+		#if PERF_PROFILE
+		double cycles[PERF_TIMERS];   // One entry per PERF_* timer index
+
+		int64_t ropOperations;   // Running count; moved into ropOperationsFrame by nextFrame()
+		int64_t ropOperationsTotal;
+		int64_t ropOperationsFrame;
+
+		int64_t texOperations;   // Running count; moved into texOperationsFrame by nextFrame()
+		int64_t texOperationsTotal;
+		int64_t texOperationsFrame;
+
+		int64_t compressedTex;   // Running count; moved into compressedTexFrame by nextFrame()
+		int64_t compressedTexTotal;
+		int64_t compressedTexFrame;
+		#endif
+	};
+
+	extern Profiler profiler;
+
+	enum   // Compile-time limits shared across the renderer
+	{
+		OUTLINE_RESOLUTION = 8192,   // Maximum vertical resolution of the render target
+		MIPMAP_LEVELS = 14,
+		TEXTURE_IMAGE_UNITS = 16,
+		VERTEX_TEXTURE_IMAGE_UNITS = 16,
+		TOTAL_IMAGE_UNITS = TEXTURE_IMAGE_UNITS + VERTEX_TEXTURE_IMAGE_UNITS,   // 16 + 16 = 32
+		FRAGMENT_UNIFORM_VECTORS = 264,
+		VERTEX_UNIFORM_VECTORS = 259,
+		MAX_VERTEX_INPUTS = 32,
+		MAX_VERTEX_OUTPUTS = 34,
+		MAX_FRAGMENT_INPUTS = 32,
+		MAX_FRAGMENT_UNIFORM_BLOCKS = 12,
+		MAX_VERTEX_UNIFORM_BLOCKS = 12,
+		MAX_UNIFORM_BUFFER_BINDINGS = MAX_FRAGMENT_UNIFORM_BLOCKS + MAX_VERTEX_UNIFORM_BLOCKS,   // Limited to 127 by SourceParameter.bufferIndex in Shader.hpp
+		MAX_UNIFORM_BLOCK_SIZE = 16384,
+		MAX_CLIP_PLANES = 6,
+		MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
+		MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 64,
+		MIN_PROGRAM_TEXEL_OFFSET = -8,
+		MAX_PROGRAM_TEXEL_OFFSET = 7,
+		MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
+		RENDERTARGETS = 8,
+		NUM_TEMPORARY_REGISTERS = 4096,
+	};
+}
+
+#endif   // sw_Config_hpp
diff --git a/src/Device/Context.cpp b/src/Device/Context.cpp
new file mode 100644
index 0000000..25c5775
--- /dev/null
+++ b/src/Device/Context.cpp
@@ -0,0 +1,1496 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Context.hpp"
+
+#include "Primitive.hpp"
+#include "Surface.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Shader/VertexShader.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+	extern bool perspectiveCorrection;
+
+	bool halfIntegerCoordinates = false;     // Pixel centers are not at integer coordinates
+	bool symmetricNormalizedDepth = false;   // [-1, 1] instead of [0, 1]
+	bool booleanFaceRegister = false;
+	bool fullPixelPositionRegister = false;
+	bool leadingVertexFirst = false;         // Flat shading uses first vertex, else last
+	bool secondaryColor = false;             // Specular lighting is applied after texturing
+	bool colorsDefaultToZero = false;
+
+	bool forceWindowed = false;
+	bool quadLayoutEnabled = false;
+	bool veryEarlyDepthTest = true;
+	bool complementaryDepthBuffer = false;
+	bool postBlendSRGB = false;
+	bool exactColorRounding = false;
+	TransparencyAntialiasing transparencyAntialiasing = TRANSPARENCY_NONE;
+	bool forceClearRegisters = false;
+
+	Context::Context()   // All default state is assigned by init()
+	{
+		init();
+	}
+
+	Context::~Context()   // Empty body: nothing is released here
+	{
+	}
+
+	void *Context::operator new(size_t bytes)   // Storage comes from allocate(); the size is narrowed to unsigned int
+	{
+		return allocate(static_cast<unsigned int>(bytes));
+	}
+
+	void Context::operator delete(void *pointer, size_t bytes)   // Counterpart of operator new; 'bytes' is unused
+	{
+		deallocate(pointer);
+	}
+
+	bool Context::isDrawPoint(bool fillModeAware) const   // True when the current draw type ends up as points
+	{
+		switch(drawType)
+		{
+		case DRAW_POINTLIST:   // Point primitives are always points
+		case DRAW_INDEXEDPOINTLIST8:
+		case DRAW_INDEXEDPOINTLIST16:
+		case DRAW_INDEXEDPOINTLIST32:
+			return true;
+		case DRAW_LINELIST:   // Line primitives never are
+		case DRAW_LINESTRIP:
+		case DRAW_LINELOOP:
+		case DRAW_INDEXEDLINELIST8:
+		case DRAW_INDEXEDLINESTRIP8:
+		case DRAW_INDEXEDLINELOOP8:
+		case DRAW_INDEXEDLINELIST16:
+		case DRAW_INDEXEDLINESTRIP16:
+		case DRAW_INDEXEDLINELOOP16:
+		case DRAW_INDEXEDLINELIST32:
+		case DRAW_INDEXEDLINESTRIP32:
+		case DRAW_INDEXEDLINELOOP32:
+			break;
+		case DRAW_TRIANGLELIST:   // Triangles count only under FILL_VERTEX, and only if the caller takes the fill mode into account
+		case DRAW_TRIANGLESTRIP:
+		case DRAW_TRIANGLEFAN:
+		case DRAW_INDEXEDTRIANGLELIST8:
+		case DRAW_INDEXEDTRIANGLESTRIP8:
+		case DRAW_INDEXEDTRIANGLEFAN8:
+		case DRAW_INDEXEDTRIANGLELIST16:
+		case DRAW_INDEXEDTRIANGLESTRIP16:
+		case DRAW_INDEXEDTRIANGLEFAN16:
+		case DRAW_INDEXEDTRIANGLELIST32:
+		case DRAW_INDEXEDTRIANGLESTRIP32:
+		case DRAW_INDEXEDTRIANGLEFAN32:
+			return fillModeAware && fillMode == FILL_VERTEX;
+		case DRAW_QUADLIST:
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		return false;
+	}
+
+	bool Context::isDrawLine(bool fillModeAware) const   // True when the current draw type ends up as lines
+	{
+		switch(drawType)
+		{
+		case DRAW_POINTLIST:   // Point primitives never are
+		case DRAW_INDEXEDPOINTLIST8:
+		case DRAW_INDEXEDPOINTLIST16:
+		case DRAW_INDEXEDPOINTLIST32:
+			break;
+		case DRAW_LINELIST:   // Line primitives are always lines
+		case DRAW_LINESTRIP:
+		case DRAW_LINELOOP:
+		case DRAW_INDEXEDLINELIST8:
+		case DRAW_INDEXEDLINESTRIP8:
+		case DRAW_INDEXEDLINELOOP8:
+		case DRAW_INDEXEDLINELIST16:
+		case DRAW_INDEXEDLINESTRIP16:
+		case DRAW_INDEXEDLINELOOP16:
+		case DRAW_INDEXEDLINELIST32:
+		case DRAW_INDEXEDLINESTRIP32:
+		case DRAW_INDEXEDLINELOOP32:
+			return true;
+		case DRAW_TRIANGLELIST:   // Triangles count only under FILL_WIREFRAME, and only if the caller takes the fill mode into account
+		case DRAW_TRIANGLESTRIP:
+		case DRAW_TRIANGLEFAN:
+		case DRAW_INDEXEDTRIANGLELIST8:
+		case DRAW_INDEXEDTRIANGLESTRIP8:
+		case DRAW_INDEXEDTRIANGLEFAN8:
+		case DRAW_INDEXEDTRIANGLELIST16:
+		case DRAW_INDEXEDTRIANGLESTRIP16:
+		case DRAW_INDEXEDTRIANGLEFAN16:
+		case DRAW_INDEXEDTRIANGLELIST32:
+		case DRAW_INDEXEDTRIANGLESTRIP32:
+		case DRAW_INDEXEDTRIANGLEFAN32:
+			return fillModeAware && fillMode == FILL_WIREFRAME;
+		case DRAW_QUADLIST:
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		return false;
+	}
+
+	bool Context::isDrawTriangle(bool fillModeAware) const   // True when the current draw type ends up as filled triangles
+	{
+		switch(drawType)
+		{
+		case DRAW_POINTLIST:   // Point primitives never are
+		case DRAW_INDEXEDPOINTLIST8:
+		case DRAW_INDEXEDPOINTLIST16:
+		case DRAW_INDEXEDPOINTLIST32:
+			return false;
+		case DRAW_LINELIST:   // Neither are line primitives
+		case DRAW_LINESTRIP:
+		case DRAW_LINELOOP:
+		case DRAW_INDEXEDLINELIST8:
+		case DRAW_INDEXEDLINESTRIP8:
+		case DRAW_INDEXEDLINELOOP8:
+		case DRAW_INDEXEDLINELIST16:
+		case DRAW_INDEXEDLINESTRIP16:
+		case DRAW_INDEXEDLINELOOP16:
+		case DRAW_INDEXEDLINELIST32:
+		case DRAW_INDEXEDLINESTRIP32:
+		case DRAW_INDEXEDLINELOOP32:
+			return false;
+		case DRAW_TRIANGLELIST:   // Triangles count unless the caller takes the fill mode into account and it is not FILL_SOLID
+		case DRAW_TRIANGLESTRIP:
+		case DRAW_TRIANGLEFAN:
+		case DRAW_INDEXEDTRIANGLELIST8:
+		case DRAW_INDEXEDTRIANGLESTRIP8:
+		case DRAW_INDEXEDTRIANGLEFAN8:
+		case DRAW_INDEXEDTRIANGLELIST16:
+		case DRAW_INDEXEDTRIANGLESTRIP16:
+		case DRAW_INDEXEDTRIANGLEFAN16:
+		case DRAW_INDEXEDTRIANGLELIST32:
+		case DRAW_INDEXEDTRIANGLESTRIP32:
+		case DRAW_INDEXEDTRIANGLEFAN32:
+			return !fillModeAware || fillMode == FILL_SOLID;
+		case DRAW_QUADLIST:
+			// Quads are broken up into triangles, so the same rule applies
+			return !fillModeAware || fillMode == FILL_SOLID;
+		default:
+			ASSERT(false);
+		}
+
+		return true;
+	}
+
+	void Context::init()   // Assigns the default values for the texture stage, lighting, depth/stencil, blending, fog and shader state set below
+	{
+		for(int i = 0; i < 8; i++)
+		{
+			textureStage[i].init(i, &sampler[i], (i >= 1) ? &textureStage[i - 1] : 0);   // Each stage is handed its own sampler and a pointer to the preceding stage (none for stage 0)
+		}
+
+		// Set vertex streams to null stream
+		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+		{
+			input[i].defaults();
+		}
+
+		fogStart = 0.0f;
+		fogEnd = 1.0f;
+
+		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++) textureWrap[i] = 0;
+		for(int i = 0; i < 8; i++) texGen[i] = TEXGEN_PASSTHRU;
+		for(int i = 0; i < 8; i++) textureTransformCount[i] = 0;
+		for(int i = 0; i < 8; i++) textureTransformProject[i] = false;
+		textureWrapActive = false;
+		localViewer = true;
+		normalizeNormals = false;
+
+		for(int i = 0; i < RENDERTARGETS; ++i)
+		{
+			renderTarget[i] = nullptr;
+		}
+		depthBuffer = nullptr;
+		stencilBuffer = nullptr;
+
+		stencilEnable = false;
+		stencilCompareMode = STENCIL_ALWAYS;
+		stencilReference = 0;
+		stencilMask = 0xFFFFFFFF;
+		stencilFailOperation = OPERATION_KEEP;
+		stencilPassOperation = OPERATION_KEEP;
+		stencilZFailOperation = OPERATION_KEEP;
+		stencilWriteMask = 0xFFFFFFFF;
+
+		twoSidedStencil = false;
+		stencilCompareModeCCW = STENCIL_ALWAYS;   // The *CCW members get the same defaults as their counterparts above
+		stencilReferenceCCW = 0;
+		stencilMaskCCW = 0xFFFFFFFF;
+		stencilFailOperationCCW = OPERATION_KEEP;
+		stencilPassOperationCCW = OPERATION_KEEP;
+		stencilZFailOperationCCW = OPERATION_KEEP;
+		stencilWriteMaskCCW = 0xFFFFFFFF;
+
+		setGlobalMipmapBias(0);   // Stores exp2(0 + 0.5) in 'bias'
+
+		lightingEnable = true;
+		specularEnable = false;
+		for(int i = 0; i < 8; i++) lightEnable[i] = false;
+		for(int i = 0; i < 8; i++) worldLightPosition[i] = 0;
+
+		alphaCompareMode = ALPHA_ALWAYS;
+		alphaTestEnable = false;
+		fillMode = FILL_SOLID;
+		shadingMode = SHADING_GOURAUD;
+
+		rasterizerDiscard = false;
+
+		depthCompareMode = DEPTH_LESS;
+		depthBufferEnable = true;
+		depthWriteEnable = true;
+
+		alphaBlendEnable = false;
+		sourceBlendFactorState = BLEND_ONE;
+		destBlendFactorState = BLEND_ZERO;
+		blendOperationState = BLENDOP_ADD;
+
+		separateAlphaBlendEnable = false;
+		sourceBlendFactorStateAlpha = BLEND_ONE;
+		destBlendFactorStateAlpha = BLEND_ZERO;
+		blendOperationStateAlpha = BLENDOP_ADD;
+
+		cullMode = CULL_CLOCKWISE;
+		frontFacingCCW = true;
+		alphaReference = 0.0f;
+
+		depthBias = 0.0f;
+		slopeDepthBias = 0.0f;
+
+		for(int i = 0; i < RENDERTARGETS; i++)
+		{
+			colorWriteMask[i] = 0x0000000F;   // Low four bits set (presumably one per color channel; confirm against the users of colorWriteMask)
+		}
+
+		ambientMaterialSource = MATERIAL_MATERIAL;
+		diffuseMaterialSource = MATERIAL_COLOR1;
+		specularMaterialSource = MATERIAL_COLOR2;
+		emissiveMaterialSource = MATERIAL_MATERIAL;
+		colorVertexEnable = true;
+
+		fogEnable = false;
+		pixelFogMode = FOG_NONE;
+		vertexFogMode = FOG_NONE;
+		wBasedFog = false;
+		rangeFogEnable = false;
+
+		indexedVertexBlendEnable = false;
+		vertexBlendMatrixCount = 0;
+
+		pixelShader = 0;   // No shaders bound; the *Active() queries below treat a null vertexShader as "no vertex shader"
+		vertexShader = 0;
+
+		instanceID = 0;
+
+		occlusionEnabled = false;
+		transformFeedbackQueryEnabled = false;
+		transformFeedbackEnabled = 0;
+
+		pointSpriteEnable = false;
+		pointScaleEnable = false;
+		lineWidth = 1.0f;
+
+		writeSRGB = false;
+		sampleMask = 0xFFFFFFFF;
+
+		colorLogicOpEnabled = false;
+		logicalOperation = LOGICALOP_COPY;
+	}
+
+	const float &Context::exp2Bias()   // Returns the value stored by setGlobalMipmapBias(), i.e. exp2(bias + 0.5f), by reference
+	{
+		return bias;
+	}
+
+	const Point &Context::getLightPosition(int light)   // 'light' indexes worldLightPosition directly; no bounds check
+	{
+		return worldLightPosition[light];
+	}
+
+	void Context::setGlobalMipmapBias(float bias)   // Stores 2^(bias + 0.5) in the member, not the raw bias value
+	{
+		this->bias = exp2(bias + 0.5f);
+	}
+
+	void Context::setLightingEnable(bool lightingEnable)   // Plain store; the parameter shadows the member
+	{
+		Context::lightingEnable = lightingEnable;
+	}
+
+	void Context::setSpecularEnable(bool specularEnable)   // Plain store; the parameter shadows the member
+	{
+		this->specularEnable = specularEnable;
+	}
+
+	void Context::setLightEnable(int light, bool lightEnable)   // 'light' indexes the array directly; no bounds check
+	{
+		this->lightEnable[light] = lightEnable;
+	}
+
+	void Context::setLightPosition(int light, Point worldLightPosition)   // 'light' indexes the array directly; no bounds check
+	{
+		this->worldLightPosition[light] = worldLightPosition;
+	}
+
+	void Context::setAmbientMaterialSource(MaterialSource ambientMaterialSource)   // Plain store; the parameter shadows the member
+	{
+		this->ambientMaterialSource = ambientMaterialSource;
+	}
+
+	void Context::setDiffuseMaterialSource(MaterialSource diffuseMaterialSource)   // Plain store; the parameter shadows the member
+	{
+		this->diffuseMaterialSource = diffuseMaterialSource;
+	}
+
+	void Context::setSpecularMaterialSource(MaterialSource specularMaterialSource)   // Plain store; the parameter shadows the member
+	{
+		this->specularMaterialSource = specularMaterialSource;
+	}
+
+	void Context::setEmissiveMaterialSource(MaterialSource emissiveMaterialSource)   // Plain store; the parameter shadows the member
+	{
+		this->emissiveMaterialSource = emissiveMaterialSource;
+	}
+
+	void Context::setPointSpriteEnable(bool pointSpriteEnable)   // Plain store; the parameter shadows the member
+	{
+		this->pointSpriteEnable = pointSpriteEnable;
+	}
+
+	void Context::setPointScaleEnable(bool pointScaleEnable)   // Plain store; the parameter shadows the member
+	{
+		this->pointScaleEnable = pointScaleEnable;
+	}
+
+	bool Context::setDepthBufferEnable(bool depthBufferEnable)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (this->depthBufferEnable != depthBufferEnable);
+		this->depthBufferEnable = depthBufferEnable;
+		return changed;
+	}
+
+	bool Context::setAlphaBlendEnable(bool alphaBlendEnable)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (this->alphaBlendEnable != alphaBlendEnable);
+		this->alphaBlendEnable = alphaBlendEnable;
+		return changed;
+	}
+
+	bool Context::setSourceBlendFactor(BlendFactor sourceBlendFactor)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (sourceBlendFactorState != sourceBlendFactor);
+		sourceBlendFactorState = sourceBlendFactor;
+		return changed;
+	}
+
+	bool Context::setDestBlendFactor(BlendFactor destBlendFactor)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (destBlendFactorState != destBlendFactor);
+		destBlendFactorState = destBlendFactor;
+		return changed;
+	}
+
+	bool Context::setBlendOperation(BlendOperation blendOperation)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (blendOperationState != blendOperation);
+		blendOperationState = blendOperation;
+		return changed;
+	}
+
+	bool Context::setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (this->separateAlphaBlendEnable != separateAlphaBlendEnable);
+		this->separateAlphaBlendEnable = separateAlphaBlendEnable;
+		return changed;
+	}
+
+	bool Context::setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (sourceBlendFactorStateAlpha != sourceBlendFactorAlpha);
+		sourceBlendFactorStateAlpha = sourceBlendFactorAlpha;
+		return changed;
+	}
+
+	bool Context::setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (destBlendFactorStateAlpha != destBlendFactorAlpha);
+		destBlendFactorStateAlpha = destBlendFactorAlpha;
+		return changed;
+	}
+
+	bool Context::setBlendOperationAlpha(BlendOperation blendOperationAlpha)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (blendOperationStateAlpha != blendOperationAlpha);
+		blendOperationStateAlpha = blendOperationAlpha;
+		return changed;
+	}
+
+	bool Context::setColorWriteMask(int index, int colorWriteMask)   // Returns true when the mask at 'index' changed; no bounds check on 'index'
+	{
+		const bool changed = (this->colorWriteMask[index] != colorWriteMask);
+		this->colorWriteMask[index] = colorWriteMask;
+		return changed;
+	}
+
+	bool Context::setWriteSRGB(bool sRGB)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (writeSRGB != sRGB);
+		writeSRGB = sRGB;
+		return changed;
+	}
+
+	bool Context::setColorLogicOpEnabled(bool enabled)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (colorLogicOpEnabled != enabled);
+		colorLogicOpEnabled = enabled;
+		return changed;
+	}
+
+	bool Context::setLogicalOperation(LogicalOperation logicalOperation)   // Returns true when the stored value actually changed
+	{
+		const bool changed = (this->logicalOperation != logicalOperation);
+		this->logicalOperation = logicalOperation;
+		return changed;
+	}
+
+	void Context::setColorVertexEnable(bool colorVertexEnable)   // Plain store; the parameter shadows the member
+	{
+		this->colorVertexEnable = colorVertexEnable;
+	}
+
+	bool Context::fogActive()
+	{
+		// Requires colorUsed() and a pixel shader model below 0x0300;
+		// only then does the fogEnable flag decide
+		return colorUsed() &&
+		       pixelShaderModel() < 0x0300 &&
+		       fogEnable;
+	}
+
+	bool Context::pointSizeActive()
+	{
+		// Never active with a vertex shader bound. Otherwise the draw must
+		// produce points (fill mode included) and either input[PointSize]
+		// tests true or point scaling applies to non-pre-transformed vertices
+		return !vertexShader &&
+		       isDrawPoint(true) &&
+		       (input[PointSize] || (!preTransformed && pointScaleActive()));
+	}
+
+	FogMode Context::pixelFogActive()   // The configured pixel fog mode, or FOG_NONE while fog is inactive
+	{
+		if(!fogActive())
+		{
+			return FOG_NONE;
+		}
+
+		return pixelFogMode;
+	}
+
+	bool Context::depthWriteActive()
+	{
+		// Depth writes need an active depth buffer in addition to the
+		// depthWriteEnable flag
+		return depthBufferActive() && depthWriteEnable;
+	}
+
+	bool Context::alphaTestActive()
+	{
+		if(transparencyAntialiasing != TRANSPARENCY_NONE) return true;   // Reported active regardless of alphaTestEnable
+
+		const bool redundant = alphaCompareMode == ALPHA_ALWAYS ||   // These two configurations are reported as inactive
+		                       (alphaReference == 0.0f && alphaCompareMode == ALPHA_GREATEREQUAL);
+
+		return alphaTestEnable && !redundant;
+	}
+
+	bool Context::depthBufferActive()   // Needs both a non-null depthBuffer and the depthBufferEnable flag
+	{
+		return depthBuffer && depthBufferEnable;
+	}
+
+	bool Context::stencilActive()   // Needs both a non-null stencilBuffer and the stencilEnable flag
+	{
+		return stencilBuffer && stencilEnable;
+	}
+
+	bool Context::vertexLightingActive()
+	{
+		// Reported only when no vertex shader is bound, the lightingEnable
+		// flag is set,
+		// and the vertices are not pre-transformed
+		return !vertexShader &&
+		       lightingEnable &&
+		       !preTransformed;
+	}
+
+	bool Context::texCoordActive(int coordinate, int component)   // True when the given component of a texture coordinate set is both available (hasTexture) and consumed (usesTexture)
+	{
+		bool hasTexture = pointSpriteActive();   // Starts out true whenever pointSpriteActive() holds
+
+		if(vertexShader)
+		{
+			if(!preTransformed)
+			{
+				if(vertexShader->getOutput(T0 + coordinate, component).usage == Shader::USAGE_TEXCOORD)   // The shader must declare this output component as a texture coordinate
+				{
+					hasTexture = true;
+				}
+			}
+			else
+			{
+				hasTexture = true;   // FIXME: Check vertex buffer streams
+			}
+		}
+		else
+		{
+			switch(texGen[coordinate])   // No vertex shader: availability depends on the texture coordinate generation mode
+			{
+			case TEXGEN_NONE:
+				hasTexture = true;
+				break;
+			case TEXGEN_PASSTHRU:
+				hasTexture = hasTexture || (component < input[TexCoord0 + textureStage[coordinate].texCoordIndex].count);   // Limited by the component count of the stream selected by the stage's texCoordIndex
+				break;
+			case TEXGEN_NORMAL:
+				hasTexture = hasTexture || (component <= 2);   // Components 0..2 only
+				break;
+			case TEXGEN_POSITION:
+				hasTexture = hasTexture || (component <= 2);
+				break;
+			case TEXGEN_REFLECTION:
+				hasTexture = hasTexture || (component <= 2);
+				break;
+			case TEXGEN_SPHEREMAP:
+				hasTexture = hasTexture || (component <= 1);   // Components 0..1 only
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+
+		bool project = isProjectionComponent(coordinate, component);   // A projection component counts as used even if nothing samples it directly
+		bool usesTexture = false;
+
+		if(pixelShader)
+		{
+			usesTexture = pixelShader->usesTexture(coordinate, component) || project;
+		}
+		else
+		{
+			usesTexture = textureStage[coordinate].usesTexture() || project;   // Without a pixel shader the texture stage decides
+		}
+
+		return hasTexture && usesTexture;
+	}
+
+	bool Context::texCoordActive(int coordinate)   // True when any of the four components of the set is active
+	{
+		for(int component = 0; component < 4; component++)   // Stops at the first active component, in order 0..3
+			if(texCoordActive(coordinate, component)) return true;
+
+		return false;
+	}
+
+	// Applies only for pixel shader model 1.3 or lower (0x0103), a coordinate
+	// index below 8 and textureTransformProject set for that coordinate. The
+	// transform count then selects the single component reported as the
+	// projection component: count 2 -> 1, count 3 -> 2, count 4 or 0 -> 3.
+	bool Context::isProjectionComponent(unsigned int coordinate, int component)
+	{
+		if(pixelShaderModel() > 0x0103) return false;
+		if(coordinate >= 8) return false;   // The transform arrays hold eight entries
+		if(!textureTransformProject[coordinate]) return false;
+
+		switch(textureTransformCount[coordinate])
+		{
+		case 2:
+			return component == 1;
+		case 3:
+			return component == 2;
+		case 0:
+		case 4:
+			return component == 3;
+		default:
+			return false;
+		}
+	}
+
+	// Specular vertex lighting needs active vertex lighting, the specular
+	// enable state and an active normal input, checked in that order.
+	bool Context::vertexSpecularActive()
+	{
+		if(!vertexLightingActive() || !specularEnable)
+		{
+			return false;
+		}
+
+		return vertexNormalActive();
+	}
+
+	// Without a vertex shader, reports the Normal input stream converted to
+	// bool; with a vertex shader set the result is always false.
+	bool Context::vertexNormalActive()
+	{
+		bool normalStream = false;
+
+		if(!vertexShader)
+		{
+			normalStream = input[Normal];
+		}
+
+		return normalStream;
+	}
+
+	// Light i counts only without a vertex shader, and then only when both
+	// lightingEnable and that light's own enable flag are set.
+	bool Context::vertexLightActive(int i)
+	{
+		const bool usable = !vertexShader && lightingEnable;
+
+		// lightEnable[i] is read last, and only when the rest already holds.
+		return usable && lightEnable[i];
+	}
+
+	// Resolves the diffuse material source that is actually in effect. The
+	// result is MATERIAL_MATERIAL when a vertex shader is set, when
+	// colorVertexEnable is off, or when the selected color input (Color0 for
+	// MATERIAL_COLOR1, Color1 for MATERIAL_COLOR2) is absent.
+	MaterialSource Context::vertexDiffuseMaterialSourceActive()
+	{
+		if(vertexShader || !colorVertexEnable)
+		{
+			return MATERIAL_MATERIAL;
+		}
+
+		switch(diffuseMaterialSource)
+		{
+		case MATERIAL_COLOR1:
+			return input[Color0] ? MATERIAL_COLOR1 : MATERIAL_MATERIAL;
+		case MATERIAL_COLOR2:
+			return input[Color1] ? MATERIAL_COLOR2 : MATERIAL_MATERIAL;
+		default:
+			return diffuseMaterialSource;
+		}
+	}
+
+	// Resolves the specular material source that is actually in effect. The
+	// result is MATERIAL_MATERIAL when a vertex shader is set, when
+	// colorVertexEnable is off, or when the selected color input (Color0 for
+	// MATERIAL_COLOR1, Color1 for MATERIAL_COLOR2) is absent.
+	MaterialSource Context::vertexSpecularMaterialSourceActive()
+	{
+		if(vertexShader || !colorVertexEnable)
+		{
+			return MATERIAL_MATERIAL;
+		}
+
+		switch(specularMaterialSource)
+		{
+		case MATERIAL_COLOR1:
+			return input[Color0] ? MATERIAL_COLOR1 : MATERIAL_MATERIAL;
+		case MATERIAL_COLOR2:
+			return input[Color1] ? MATERIAL_COLOR2 : MATERIAL_MATERIAL;
+		default:
+			return specularMaterialSource;
+		}
+	}
+
+	// Resolves the ambient material source that is actually in effect. The
+	// result is MATERIAL_MATERIAL when a vertex shader is set, when
+	// colorVertexEnable is off, or when the selected color input (Color0 for
+	// MATERIAL_COLOR1, Color1 for MATERIAL_COLOR2) is absent.
+	MaterialSource Context::vertexAmbientMaterialSourceActive()
+	{
+		if(vertexShader || !colorVertexEnable)
+		{
+			return MATERIAL_MATERIAL;
+		}
+
+		switch(ambientMaterialSource)
+		{
+		case MATERIAL_COLOR1:
+			return input[Color0] ? MATERIAL_COLOR1 : MATERIAL_MATERIAL;
+		case MATERIAL_COLOR2:
+			return input[Color1] ? MATERIAL_COLOR2 : MATERIAL_MATERIAL;
+		default:
+			return ambientMaterialSource;
+		}
+	}
+
+	// Resolves the emissive material source that is actually in effect. The
+	// result is MATERIAL_MATERIAL when a vertex shader is set, when
+	// colorVertexEnable is off, or when the selected color input (Color0 for
+	// MATERIAL_COLOR1, Color1 for MATERIAL_COLOR2) is absent.
+	MaterialSource Context::vertexEmissiveMaterialSourceActive()
+	{
+		if(vertexShader || !colorVertexEnable)
+		{
+			return MATERIAL_MATERIAL;
+		}
+
+		switch(emissiveMaterialSource)
+		{
+		case MATERIAL_COLOR1:
+			return input[Color0] ? MATERIAL_COLOR1 : MATERIAL_MATERIAL;
+		case MATERIAL_COLOR2:
+			return input[Color1] ? MATERIAL_COLOR2 : MATERIAL_MATERIAL;
+		default:
+			return emissiveMaterialSource;
+		}
+	}
+
+	// Point sprites apply when points are being drawn (queried with fill mode
+	// awareness) and pointSpriteEnable is set.
+	bool Context::pointSpriteActive()
+	{
+		if(!isDrawPoint(true)) return false;
+
+		return pointSpriteEnable;
+	}
+
+	// Point scaling applies only without a vertex shader, when points are being
+	// drawn (queried with fill mode awareness) and pointScaleEnable is set.
+	bool Context::pointScaleActive()
+	{
+		const bool noVertexShader = !vertexShader;
+
+		// isDrawPoint() is queried only when no vertex shader is set.
+		return noVertexShader && isDrawPoint(true) && pointScaleEnable;
+	}
+
+	// Blending is reported active when it is enabled, color output is used, and
+	// the color equation or the (separately configured) alpha equation is
+	// anything other than a source copy with a source factor of one.
+	bool Context::alphaBlendActive()
+	{
+		if(!alphaBlendEnable || !colorUsed())
+		{
+			return false;
+		}
+
+		const bool colorBlend = blendOperation() != BLENDOP_SOURCE || sourceBlendFactor() != BLEND_ONE;
+		bool alphaBlend = colorBlend;   // Alpha follows color unless configured separately
+
+		if(separateAlphaBlendEnable)
+		{
+			alphaBlend = blendOperationAlpha() != BLENDOP_SOURCE || sourceBlendFactorAlpha() != BLEND_ONE;
+		}
+
+		return colorBlend || alphaBlend;
+	}
+
+	// The configured logical operation, or LOGICALOP_COPY while color logic
+	// operations are disabled.
+	LogicalOperation Context::colorLogicOp()
+	{
+		if(!colorLogicOpEnabled)
+		{
+			return LOGICALOP_COPY;
+		}
+
+		return logicalOperation;
+	}
+
+	// Effective source factor for color blending: BLEND_ONE while blending is
+	// disabled or the operation is MIN/MAX, otherwise the stored state.
+	BlendFactor Context::sourceBlendFactor()
+	{
+		if(!alphaBlendEnable) return BLEND_ONE;
+
+		const BlendOperation op = blendOperationState;
+
+		if(op == BLENDOP_MIN || op == BLENDOP_MAX)
+		{
+			return BLEND_ONE;
+		}
+
+		if(op != BLENDOP_ADD && op != BLENDOP_SUB && op != BLENDOP_INVSUB)
+		{
+			ASSERT(false);   // Not a settable blend operation
+		}
+
+		return sourceBlendFactorState;
+	}
+
+	// Effective destination factor for color blending: BLEND_ZERO while
+	// blending is disabled, BLEND_ONE for MIN/MAX, otherwise the stored state.
+	BlendFactor Context::destBlendFactor()
+	{
+		if(!alphaBlendEnable) return BLEND_ZERO;
+
+		const BlendOperation op = blendOperationState;
+
+		if(op == BLENDOP_MIN || op == BLENDOP_MAX)
+		{
+			return BLEND_ONE;
+		}
+
+		if(op != BLENDOP_ADD && op != BLENDOP_SUB && op != BLENDOP_INVSUB)
+		{
+			ASSERT(false);   // Not a settable blend operation
+		}
+
+		return destBlendFactorState;
+	}
+
+	// Reduces the configured color blend equation to the simplest equivalent
+	// operation, based on the effective source and destination factors:
+	//  - blending disabled: BLENDOP_SOURCE;
+	//  - MIN and MAX: returned unchanged;
+	//  - ADD:    src 0 -> (dst 0 ? NULL : DEST), else (dst 0 ? SOURCE : ADD);
+	//  - SUB:    src 0 -> NULL,                  else (dst 0 ? SOURCE : SUB);
+	//  - INVSUB: src 0 -> (dst 0 ? NULL : DEST), else (dst 0 ? NULL : INVSUB).
+	// Here "0" stands for BLEND_ZERO. Any other state asserts and is returned
+	// as is.
+	BlendOperation Context::blendOperation()
+	{
+		if(!alphaBlendEnable) return BLENDOP_SOURCE;
+
+		const BlendOperation op = blendOperationState;
+
+		if(op == BLENDOP_MIN || op == BLENDOP_MAX)
+		{
+			return op;
+		}
+
+		if(op != BLENDOP_ADD && op != BLENDOP_SUB && op != BLENDOP_INVSUB)
+		{
+			ASSERT(false);
+
+			return blendOperationState;
+		}
+
+		const bool noSource = (sourceBlendFactor() == BLEND_ZERO);
+
+		if(op == BLENDOP_SUB)
+		{
+			if(noSource) return BLENDOP_NULL;   // Negative, clamped to zero
+
+			return (destBlendFactor() == BLEND_ZERO) ? BLENDOP_SOURCE : BLENDOP_SUB;
+		}
+
+		// ADD and INVSUB share the outcome when the source factor is zero.
+		if(noSource)
+		{
+			return (destBlendFactor() == BLEND_ZERO) ? BLENDOP_NULL : BLENDOP_DEST;
+		}
+
+		if(destBlendFactor() != BLEND_ZERO)
+		{
+			return op;   // Both terms contribute; keep ADD or INVSUB
+		}
+
+		// Zero destination factor: ADD keeps the source, INVSUB would be
+		// negative and is clamped to zero.
+		return (op == BLENDOP_ADD) ? BLENDOP_SOURCE : BLENDOP_NULL;
+	}
+
+	// Effective source factor for the alpha channel. Without separate alpha
+	// blending this is the color result; otherwise BLEND_ONE for MIN/MAX and
+	// the stored alpha state for ADD, SUB and INVSUB.
+	BlendFactor Context::sourceBlendFactorAlpha()
+	{
+		if(!separateAlphaBlendEnable) return sourceBlendFactor();
+
+		switch(blendOperationStateAlpha)
+		{
+		case BLENDOP_MIN:
+		case BLENDOP_MAX:
+			return BLEND_ONE;
+		case BLENDOP_ADD:
+		case BLENDOP_SUB:
+		case BLENDOP_INVSUB:
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return sourceBlendFactorStateAlpha;
+	}
+
+	// Effective destination factor for the alpha channel. Without separate
+	// alpha blending this is the color result; otherwise BLEND_ONE for MIN/MAX
+	// and the stored alpha state for ADD, SUB and INVSUB.
+	BlendFactor Context::destBlendFactorAlpha()
+	{
+		if(!separateAlphaBlendEnable) return destBlendFactor();
+
+		switch(blendOperationStateAlpha)
+		{
+		case BLENDOP_MIN:
+		case BLENDOP_MAX:
+			return BLEND_ONE;
+		case BLENDOP_ADD:
+		case BLENDOP_SUB:
+		case BLENDOP_INVSUB:
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return destBlendFactorStateAlpha;
+	}
+
+	// Alpha-channel counterpart of blendOperation(). Without separate alpha
+	// blending the color result is returned. Otherwise the alpha equation is
+	// reduced using the effective alpha factors ("0" stands for BLEND_ZERO):
+	//  - MIN and MAX: returned unchanged;
+	//  - ADD:    src 0 -> (dst 0 ? NULL : DEST), else (dst 0 ? SOURCE : ADD);
+	//  - SUB:    src 0 -> NULL,                  else (dst 0 ? SOURCE : SUB);
+	//  - INVSUB: src 0 -> (dst 0 ? NULL : DEST), else (dst 0 ? NULL : INVSUB).
+	// Any other state asserts and is returned as is.
+	BlendOperation Context::blendOperationAlpha()
+	{
+		if(!separateAlphaBlendEnable) return blendOperation();
+
+		const BlendOperation op = blendOperationStateAlpha;
+
+		if(op == BLENDOP_MIN || op == BLENDOP_MAX)
+		{
+			return op;
+		}
+
+		if(op != BLENDOP_ADD && op != BLENDOP_SUB && op != BLENDOP_INVSUB)
+		{
+			ASSERT(false);
+
+			return blendOperationStateAlpha;
+		}
+
+		const bool noSource = (sourceBlendFactorAlpha() == BLEND_ZERO);
+
+		if(op == BLENDOP_SUB)
+		{
+			if(noSource) return BLENDOP_NULL;   // Negative, clamped to zero
+
+			return (destBlendFactorAlpha() == BLEND_ZERO) ? BLENDOP_SOURCE : BLENDOP_SUB;
+		}
+
+		// ADD and INVSUB share the outcome when the source factor is zero.
+		if(noSource)
+		{
+			return (destBlendFactorAlpha() == BLEND_ZERO) ? BLENDOP_NULL : BLENDOP_DEST;
+		}
+
+		if(destBlendFactorAlpha() != BLEND_ZERO)
+		{
+			return op;   // Both terms contribute; keep ADD or INVSUB
+		}
+
+		// Zero destination factor: ADD keeps the source, INVSUB would be
+		// negative and is clamped to zero.
+		return (op == BLENDOP_ADD) ? BLENDOP_SOURCE : BLENDOP_NULL;
+	}
+
+	// indexedVertexBlendEnable, forced to false while a vertex shader is set.
+	bool Context::indexedVertexBlendActive()
+	{
+		return !vertexShader && indexedVertexBlendEnable;
+	}
+
+	// vertexBlendMatrixCount, forced to zero while a vertex shader is set.
+	int Context::vertexBlendMatrixCountActive()
+	{
+		return vertexShader ? 0 : vertexBlendMatrixCount;
+	}
+
+	// The localViewer state, forced to false while a vertex shader is set.
+	bool Context::localViewerActive()
+	{
+		return !vertexShader && localViewer;
+	}
+
+	// The normalizeNormals state, forced to false while a vertex shader is set.
+	bool Context::normalizeNormalsActive()
+	{
+		return !vertexShader && normalizeNormals;
+	}
+
+	// The vertex fog mode in effect: FOG_NONE while a vertex shader is set or
+	// fog is inactive, otherwise the vertexFogMode state.
+	FogMode Context::vertexFogModeActive()
+	{
+		// fogActive() is queried only when no vertex shader is set.
+		const bool vertexFog = !vertexShader && fogActive();
+
+		return vertexFog ? vertexFogMode : FOG_NONE;
+	}
+
+	// Range fog applies only without a vertex shader, with fog active and the
+	// rangeFogEnable state set; the conditions are checked in that order.
+	bool Context::rangeFogActive()
+	{
+		return !vertexShader && fogActive() && rangeFogEnable;
+	}
+
+	// The texGen mode in effect for a stage: TEXGEN_PASSTHRU while a vertex
+	// shader is set or the stage's texture coordinates are inactive,
+	// otherwise the stored texGen state.
+	TexGen Context::texGenActive(int stage)
+	{
+		// texCoordActive() is queried only when no vertex shader is set.
+		const bool passThrough = vertexShader || !texCoordActive(stage);
+
+		return passThrough ? TEXGEN_PASSTHRU : texGen[stage];
+	}
+
+	// The texture transform count in effect for a stage: zero while a vertex
+	// shader is set or the stage's texture coordinates are inactive,
+	// otherwise the stored textureTransformCount state.
+	int Context::textureTransformCountActive(int stage)
+	{
+		// texCoordActive() is queried only when no vertex shader is set.
+		const bool untransformed = vertexShader || !texCoordActive(stage);
+
+		return untransformed ? 0 : textureTransformCount[stage];
+	}
+
+	// The texture coordinate index in effect for a stage: the stage number
+	// itself while a vertex shader is set or the stage's texture coordinates
+	// are inactive, otherwise the stage's texCoordIndex.
+	int Context::texCoordIndexActive(int stage)
+	{
+		// texCoordActive() is queried only when no vertex shader is set.
+		const bool identity = vertexShader || !texCoordActive(stage);
+
+		return identity ? stage : textureStage[stage].texCoordIndex;
+	}
+
+	// Perspective correction is in effect when color output is used, the
+	// perspectiveCorrection setting is on, and the draw is not a point draw
+	// (queried with fill mode awareness). Conditions are checked in that order.
+	bool Context::perspectiveActive()
+	{
+		return colorUsed() && perspectiveCorrection && !isDrawPoint(true);
+	}
+
+	// True as soon as any of the four diffuse components (0..3) is used.
+	bool Context::diffuseUsed()
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			if(diffuseUsed(component))
+			{
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	// Reports whether the pixel stage consumes the diffuse color. With a pixel
+	// shader the answer comes from the shader for the given component; without
+	// one the texture stages are inspected and the component is not consulted.
+	bool Context::diffuseUsed(int component)
+	{
+		if(!colorUsed()) return false;
+
+		if(pixelShader) return pixelShader->usesDiffuse(component);
+
+		// Directly using the diffuse input color, in any stage ahead of the
+		// first disabled one
+		for(int stage = 0; stage < 8 && !textureStage[stage].isStageDisabled(); stage++)
+		{
+			if(textureStage[stage].usesDiffuse())
+			{
+				return true;
+			}
+		}
+
+		// Using the current color (initialized to diffuse) before it's overwritten
+		for(int stage = 0; stage < 8; stage++)
+		{
+			TextureStage &current = textureStage[stage];
+
+			// Current color contains diffuse before being overwritten
+			if(current.usesCurrent() || current.isStageDisabled())
+			{
+				return true;
+			}
+
+			if(current.writesCurrent())
+			{
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+	// True as soon as any of the four diffuse components (0..3) is active.
+	bool Context::diffuseActive()
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			if(diffuseActive(component))
+			{
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	// A diffuse component is active when the vertex stage provides it and the
+	// pixel stage uses it (see diffuseUsed()).
+	bool Context::diffuseActive(int component)
+	{
+		if(!colorUsed()) return false;
+
+		// Vertex processor provides diffuse component
+		bool provided;
+
+		if(vertexShader)
+		{
+			provided = vertexShader->getOutput(C0, component).active();
+		}
+		else
+		{
+			provided = input[Color0];
+
+			// Lighting can stand in for a missing color stream, but only
+			// when the vertices are not pre-transformed.
+			if(!preTransformed)
+			{
+				provided = provided || lightingEnable;
+			}
+		}
+
+		// Pixel processor requires diffuse component
+		const bool required = diffuseUsed(component);
+
+		return provided && required;
+	}
+
+	// True as soon as any of the four specular components (0..3) is used.
+	bool Context::specularUsed()
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			if(specularUsed(component))
+			{
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	// Reports whether the pixel stage consumes the specular color. With a pixel
+	// shader the answer comes from the shader for the given component; without
+	// one it is specularEnable or any texture stage, ahead of the first
+	// disabled stage, that uses specular.
+	bool Context::specularUsed(int component)
+	{
+		if(!colorUsed()) return false;
+
+		if(pixelShader) return pixelShader->usesSpecular(component);
+
+		bool used = specularEnable;
+
+		for(int stage = 0; stage < 8; stage++)
+		{
+			if(textureStage[stage].isStageDisabled())
+			{
+				break;
+			}
+
+			if(!used)
+			{
+				used = textureStage[stage].usesSpecular();
+			}
+		}
+
+		return used;
+	}
+
+	// True as soon as any of the four specular components (0..3) is active.
+	bool Context::specularActive()
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			if(specularActive(component))
+			{
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	// A specular component is active when the vertex stage provides it and the
+	// pixel stage uses it (see specularUsed()).
+	bool Context::specularActive(int component)
+	{
+		if(!colorUsed()) return false;
+
+		// Vertex processor provides specular component
+		bool provided;
+
+		if(vertexShader)
+		{
+			provided = vertexShader->getOutput(C1, component).active();
+		}
+		else
+		{
+			// Either a Color1 input stream or specular lighting supplies it.
+			provided = input[Color1] || (lightingEnable && specularEnable);
+		}
+
+		// Pixel processor requires specular component
+		const bool required = specularUsed(component);
+
+		return provided && required;
+	}
+
+	// Dispatches on the color index: 0 selects diffuse, any other value
+	// selects specular.
+	bool Context::colorActive(int color, int component)
+	{
+		return (color == 0) ? diffuseActive(component) : specularActive(component);
+	}
+
+	// True as soon as any of the coordinate sets 0..7 has an active texture.
+	bool Context::textureActive()
+	{
+		int coordinate = 0;
+
+		while(coordinate < 8 && !textureActive(coordinate))
+		{
+			coordinate++;
+		}
+
+		return coordinate < 8;
+	}
+
+	// True as soon as any of the four components (0..3) of the coordinate set
+	// has an active texture.
+	bool Context::textureActive(int coordinate)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			if(textureActive(coordinate, component))
+			{
+				return true;
+			}
+		}
+
+		return false;
+	}
+
+	// Reports whether the pixel stage reads the given component of a texture
+	// coordinate set.
+	//
+	// Returns false when color output is unused or the coordinate component
+	// is not active (see texCoordActive()). A projection component always
+	// counts as read. With a pixel shader the shader decides. Otherwise the
+	// texture stage must use a texture and no stage from 'coordinate' down to
+	// 0 may be disabled; components 0 and 1 are then read, component 2 only
+	// for cube or volume textures, and component 3 never.
+	//
+	// The projection test goes through isProjectionComponent() rather than
+	// repeating its logic here: the helper checks the shader model and the
+	// coordinate range before indexing the eight-entry texture transform
+	// arrays, so they are never read with an index of 8 or more.
+	bool Context::textureActive(int coordinate, int component)
+	{
+		if(!colorUsed() || !texCoordActive(coordinate, component))
+		{
+			return false;
+		}
+
+		if(isProjectionComponent(coordinate, component))
+		{
+			return true;
+		}
+
+		if(pixelShader)
+		{
+			return pixelShader->usesTexture(coordinate, component);
+		}
+
+		const bool texture = textureStage[coordinate].usesTexture();
+
+		if(texture)
+		{
+			// A disabled stage at or below this one rules the texture out.
+			for(int i = coordinate; i >= 0; i--)
+			{
+				if(textureStage[i].stageOperation == TextureStage::STAGE_DISABLE)
+				{
+					return false;
+				}
+			}
+		}
+
+		switch(component)
+		{
+		case 0:
+		case 1:
+			return texture;
+		case 2:
+			return texture && (sampler[coordinate].hasCubeTexture() || sampler[coordinate].hasVolumeTexture());
+		default:
+			return false;
+		}
+	}
+
+	// Shader model of the current pixel shader, or 0x0000 when none is set.
+	unsigned short Context::pixelShaderModel() const
+	{
+		if(!pixelShader)
+		{
+			return 0x0000;
+		}
+
+		return pixelShader->getShaderModel();
+	}
+
+	// Shader model of the current vertex shader, or 0x0000 when none is set.
+	unsigned short Context::vertexShaderModel() const
+	{
+		if(!vertexShader)
+		{
+			return 0x0000;
+		}
+
+		return vertexShader->getShaderModel();
+	}
+
+	// Multisample count of render target 0, or 1 when that target is not set.
+	int Context::getMultiSampleCount() const
+	{
+		if(!renderTarget[0])
+		{
+			return 1;
+		}
+
+		return renderTarget[0]->getMultiSampleCount();
+	}
+
+	// Supersample count of render target 0, or 1 when that target is not set.
+	int Context::getSuperSampleCount() const
+	{
+		if(!renderTarget[0])
+		{
+			return 1;
+		}
+
+		return renderTarget[0]->getSuperSampleCount();
+	}
+
+	// Internal format of the indexed render target, or FORMAT_NULL when no
+	// target is set at that index.
+	Format Context::renderTargetInternalFormat(int index)
+	{
+		Surface *target = renderTarget[index];
+
+		return target ? target->getInternalFormat() : FORMAT_NULL;
+	}
+
+	// Bitwise OR of the active color write masks of render targets 0..3.
+	// NOTE(review): only four targets are combined here, while the arrays are
+	// sized by RENDERTARGETS - confirm that this is intended.
+	int Context::colorWriteActive()
+	{
+		int combinedMask = 0;
+
+		for(int index = 0; index < 4; index++)
+		{
+			combinedMask |= colorWriteActive(index);
+		}
+
+		return combinedMask;
+	}
+
+	// Color write mask in effect for one render target. The mask is zero when
+	// the target is missing or has a FORMAT_NULL internal format, and also when
+	// blending reduces to keeping the destination unchanged (operation DEST
+	// with a destination factor of one, for color and for separate alpha).
+	int Context::colorWriteActive(int index)
+	{
+		Surface *target = renderTarget[index];
+
+		if(!target || target->getInternalFormat() == FORMAT_NULL)
+		{
+			return 0;
+		}
+
+		bool keepsDest = (blendOperation() == BLENDOP_DEST) && (destBlendFactor() == BLEND_ONE);
+
+		// The alpha equation only matters when it is configured separately.
+		if(keepsDest && separateAlphaBlendEnable)
+		{
+			keepsDest = (blendOperationAlpha() == BLENDOP_DEST) && (destBlendFactorAlpha() == BLEND_ONE);
+		}
+
+		return keepsDest ? 0 : colorWriteMask[index];
+	}
+
+	// The color result matters when any color channel is written, the alpha
+	// test is active, or the pixel shader contains a kill instruction.
+	bool Context::colorUsed()
+	{
+		if(colorWriteActive()) return true;
+		if(alphaTestActive()) return true;
+
+		return pixelShader && pixelShader->containsKill();
+	}
+}
diff --git a/src/Device/Context.hpp b/src/Device/Context.hpp
new file mode 100644
index 0000000..d9110d8
--- /dev/null
+++ b/src/Device/Context.hpp
@@ -0,0 +1,542 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Context_hpp
+#define sw_Context_hpp
+
+#include "Sampler.hpp"
+#include "TextureStage.hpp"
+#include "Stream.hpp"
+#include "Point.hpp"
+#include "Vertex.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	class Sampler;
+	class Surface;
+	class PixelShader;
+	class VertexShader;
+	struct Triangle;
+	struct Primitive;
+	struct Vertex;
+	class Resource;
+
+	// Default input stream semantic. The enumerators index Context::input;
+	// the texture coordinate sets are consecutive, so set N is TexCoord0 + N.
+	enum In
+	{
+		Position     = 0,
+		BlendWeight  = 1,
+		BlendIndices = 2,
+		Normal       = 3,
+		PointSize    = 4,
+		Color0       = 5,
+		Color1       = 6,
+		TexCoord0    = 7,
+		TexCoord1    = TexCoord0 + 1,
+		TexCoord2    = TexCoord0 + 2,
+		TexCoord3    = TexCoord0 + 3,
+		TexCoord4    = TexCoord0 + 4,
+		TexCoord5    = TexCoord0 + 5,
+		TexCoord6    = TexCoord0 + 6,
+		TexCoord7    = TexCoord0 + 7,
+		PositionT    = 15
+	};
+
+	// Draw call type. The low nibble holds the primitive topology and the
+	// high nibble the index size; each DRAW_INDEXED<topology><bits> name is
+	// the bitwise OR of one value from each group.
+	enum DrawType ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		// These types must stay ordered by vertices per primitive. Also, if these basic types
+		// are modified, verify the value assigned to task->verticesPerPrimitive in Renderer.cpp
+		DRAW_POINTLIST     = 0x00,
+		DRAW_LINELIST      = 0x01,
+		DRAW_LINESTRIP     = 0x02,
+		DRAW_LINELOOP      = 0x03,
+		DRAW_TRIANGLELIST  = 0x04,
+		DRAW_TRIANGLESTRIP = 0x05,
+		DRAW_TRIANGLEFAN   = 0x06,
+		DRAW_QUADLIST      = 0x07,
+
+		// Index size selector
+		DRAW_NONINDEXED = 0x00,
+		DRAW_INDEXED8   = 0x10,
+		DRAW_INDEXED16  = 0x20,
+		DRAW_INDEXED32  = 0x30,
+
+		// 8-bit indices
+		DRAW_INDEXEDPOINTLIST8     = DRAW_INDEXED8 | DRAW_POINTLIST,
+		DRAW_INDEXEDLINELIST8      = DRAW_INDEXED8 | DRAW_LINELIST,
+		DRAW_INDEXEDLINESTRIP8     = DRAW_INDEXED8 | DRAW_LINESTRIP,
+		DRAW_INDEXEDLINELOOP8      = DRAW_INDEXED8 | DRAW_LINELOOP,
+		DRAW_INDEXEDTRIANGLELIST8  = DRAW_INDEXED8 | DRAW_TRIANGLELIST,
+		DRAW_INDEXEDTRIANGLESTRIP8 = DRAW_INDEXED8 | DRAW_TRIANGLESTRIP,
+		DRAW_INDEXEDTRIANGLEFAN8   = DRAW_INDEXED8 | DRAW_TRIANGLEFAN,
+
+		// 16-bit indices
+		DRAW_INDEXEDPOINTLIST16     = DRAW_INDEXED16 | DRAW_POINTLIST,
+		DRAW_INDEXEDLINELIST16      = DRAW_INDEXED16 | DRAW_LINELIST,
+		DRAW_INDEXEDLINESTRIP16     = DRAW_INDEXED16 | DRAW_LINESTRIP,
+		DRAW_INDEXEDLINELOOP16      = DRAW_INDEXED16 | DRAW_LINELOOP,
+		DRAW_INDEXEDTRIANGLELIST16  = DRAW_INDEXED16 | DRAW_TRIANGLELIST,
+		DRAW_INDEXEDTRIANGLESTRIP16 = DRAW_INDEXED16 | DRAW_TRIANGLESTRIP,
+		DRAW_INDEXEDTRIANGLEFAN16   = DRAW_INDEXED16 | DRAW_TRIANGLEFAN,
+
+		// 32-bit indices
+		DRAW_INDEXEDPOINTLIST32     = DRAW_INDEXED32 | DRAW_POINTLIST,
+		DRAW_INDEXEDLINELIST32      = DRAW_INDEXED32 | DRAW_LINELIST,
+		DRAW_INDEXEDLINESTRIP32     = DRAW_INDEXED32 | DRAW_LINESTRIP,
+		DRAW_INDEXEDLINELOOP32      = DRAW_INDEXED32 | DRAW_LINELOOP,
+		DRAW_INDEXEDTRIANGLELIST32  = DRAW_INDEXED32 | DRAW_TRIANGLELIST,
+		DRAW_INDEXEDTRIANGLESTRIP32 = DRAW_INDEXED32 | DRAW_TRIANGLESTRIP,
+		DRAW_INDEXEDTRIANGLEFAN32   = DRAW_INDEXED32 | DRAW_TRIANGLEFAN,
+
+		DRAW_LAST = DRAW_INDEXEDTRIANGLEFAN32   // Highest value in use (0x36)
+	};
+
+	// Values of Context::fillMode. FILL_LAST aliases the final enumerator.
+	enum FillMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		FILL_SOLID     = 0,
+		FILL_WIREFRAME = 1,
+		FILL_VERTEX    = 2,
+
+		FILL_LAST = FILL_VERTEX
+	};
+
+	// Values of Context::shadingMode. SHADING_LAST aliases the final enumerator.
+	enum ShadingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		SHADING_FLAT    = 0,
+		SHADING_GOURAUD = 1,
+
+		SHADING_LAST = SHADING_GOURAUD
+	};
+
+	// Values of Context::depthCompareMode. DEPTH_LAST aliases the final
+	// enumerator.
+	enum DepthCompareMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		DEPTH_ALWAYS       = 0,
+		DEPTH_NEVER        = 1,
+		DEPTH_EQUAL        = 2,
+		DEPTH_NOTEQUAL     = 3,
+		DEPTH_LESS         = 4,
+		DEPTH_LESSEQUAL    = 5,
+		DEPTH_GREATER      = 6,
+		DEPTH_GREATEREQUAL = 7,
+
+		DEPTH_LAST = DEPTH_GREATEREQUAL
+	};
+
+	// Values of Context::stencilCompareMode and stencilCompareModeCCW.
+	// STENCIL_LAST aliases the final enumerator.
+	enum StencilCompareMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		STENCIL_ALWAYS       = 0,
+		STENCIL_NEVER        = 1,
+		STENCIL_EQUAL        = 2,
+		STENCIL_NOTEQUAL     = 3,
+		STENCIL_LESS         = 4,
+		STENCIL_LESSEQUAL    = 5,
+		STENCIL_GREATER      = 6,
+		STENCIL_GREATEREQUAL = 7,
+
+		STENCIL_LAST = STENCIL_GREATEREQUAL
+	};
+
+	// Values of the Context stencil fail / pass / z-fail operation states.
+	// OPERATION_LAST aliases the final enumerator.
+	enum StencilOperation ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		OPERATION_KEEP    = 0,
+		OPERATION_ZERO    = 1,
+		OPERATION_REPLACE = 2,
+		OPERATION_INCRSAT = 3,
+		OPERATION_DECRSAT = 4,
+		OPERATION_INVERT  = 5,
+		OPERATION_INCR    = 6,
+		OPERATION_DECR    = 7,
+
+		OPERATION_LAST = OPERATION_DECR
+	};
+
+	// Values of Context::alphaCompareMode. ALPHA_LAST aliases the final
+	// enumerator.
+	enum AlphaCompareMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		ALPHA_ALWAYS       = 0,
+		ALPHA_NEVER        = 1,
+		ALPHA_EQUAL        = 2,
+		ALPHA_NOTEQUAL     = 3,
+		ALPHA_LESS         = 4,
+		ALPHA_LESSEQUAL    = 5,
+		ALPHA_GREATER      = 6,
+		ALPHA_GREATEREQUAL = 7,
+
+		ALPHA_LAST = ALPHA_GREATEREQUAL
+	};
+
+	// Values of Context::cullMode. CULL_LAST aliases the final enumerator.
+	enum CullMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		CULL_NONE             = 0,
+		CULL_CLOCKWISE        = 1,
+		CULL_COUNTERCLOCKWISE = 2,
+
+		CULL_LAST = CULL_COUNTERCLOCKWISE
+	};
+
+	// Values of the Context source / destination blend factor states (color
+	// and alpha). BLEND_LAST aliases the final enumerator.
+	enum BlendFactor ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		BLEND_ZERO             = 0,
+		BLEND_ONE              = 1,
+		BLEND_SOURCE           = 2,
+		BLEND_INVSOURCE        = 3,
+		BLEND_DEST             = 4,
+		BLEND_INVDEST          = 5,
+		BLEND_SOURCEALPHA      = 6,
+		BLEND_INVSOURCEALPHA   = 7,
+		BLEND_DESTALPHA        = 8,
+		BLEND_INVDESTALPHA     = 9,
+		BLEND_SRCALPHASAT      = 10,
+		BLEND_CONSTANT         = 11,
+		BLEND_INVCONSTANT      = 12,
+		BLEND_CONSTANTALPHA    = 13,
+		BLEND_INVCONSTANTALPHA = 14,
+
+		BLEND_LAST = BLEND_INVCONSTANTALPHA
+	};
+
+	// Blend operations. ADD through MAX are the values handled as stored state
+	// by Context::blendOperation(); SOURCE, DEST and NULL are the reduced forms
+	// that function can return. BLENDOP_LAST aliases the final enumerator.
+	enum BlendOperation ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		BLENDOP_ADD    = 0,
+		BLENDOP_SUB    = 1,
+		BLENDOP_INVSUB = 2,
+		BLENDOP_MIN    = 3,
+		BLENDOP_MAX    = 4,
+
+		BLENDOP_SOURCE = 5,   // Copy source
+		BLENDOP_DEST   = 6,   // Copy dest
+		BLENDOP_NULL   = 7,   // Nullify result
+
+		BLENDOP_LAST = BLENDOP_NULL
+	};
+
+	// Values of Context::logicalOperation. LOGICALOP_LAST aliases the final
+	// enumerator.
+	enum LogicalOperation ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		LOGICALOP_CLEAR         = 0,
+		LOGICALOP_SET           = 1,
+		LOGICALOP_COPY          = 2,
+		LOGICALOP_COPY_INVERTED = 3,
+		LOGICALOP_NOOP          = 4,
+		LOGICALOP_INVERT        = 5,
+		LOGICALOP_AND           = 6,
+		LOGICALOP_NAND          = 7,
+		LOGICALOP_OR            = 8,
+		LOGICALOP_NOR           = 9,
+		LOGICALOP_XOR           = 10,
+		LOGICALOP_EQUIV         = 11,
+		LOGICALOP_AND_REVERSE   = 12,
+		LOGICALOP_AND_INVERTED  = 13,
+		LOGICALOP_OR_REVERSE    = 14,
+		LOGICALOP_OR_INVERTED   = 15,
+
+		LOGICALOP_LAST = LOGICALOP_OR_INVERTED
+	};
+
+	// Values of the Context ambient / diffuse / specular / emissive material
+	// source states. MATERIAL_LAST aliases the final enumerator.
+	enum MaterialSource ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		MATERIAL_MATERIAL = 0,
+		MATERIAL_COLOR1   = 1,
+		MATERIAL_COLOR2   = 2,
+
+		MATERIAL_LAST = MATERIAL_COLOR2
+	};
+
+	// Values of Context::pixelFogMode and Context::vertexFogMode. FOG_LAST
+	// aliases the final enumerator.
+	enum FogMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		FOG_NONE   = 0,
+		FOG_LINEAR = 1,
+		FOG_EXP    = 2,
+		FOG_EXP2   = 3,
+
+		FOG_LAST = FOG_EXP2
+	};
+
+	// Values of the per-stage Context::texGen state. TEXGEN_LAST aliases the
+	// final enumerator.
+	enum TexGen ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		TEXGEN_PASSTHRU   = 0,
+		TEXGEN_NORMAL     = 1,
+		TEXGEN_POSITION   = 2,
+		TEXGEN_REFLECTION = 3,
+		TEXGEN_SPHEREMAP  = 4,
+		TEXGEN_NONE       = 5,
+
+		TEXGEN_LAST = TEXGEN_NONE
+	};
+
+	// TRANSPARENCY_LAST aliases the final enumerator.
+	enum TransparencyAntialiasing ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+	{
+		TRANSPARENCY_NONE              = 0,
+		TRANSPARENCY_ALPHA_TO_COVERAGE = 1,
+
+		TRANSPARENCY_LAST = TRANSPARENCY_ALPHA_TO_COVERAGE
+	};
+
+	// Holds the draw state as public data members, together with query
+	// methods ("...Active()", "...Used()") that combine those members into the
+	// state that is effectively in use for a draw.
+	class Context
+	{
+	public:
+		Context();
+
+		~Context();
+
+		// Class-specific allocation functions
+		void *operator new(size_t bytes);
+		void operator delete(void *pointer, size_t bytes);
+
+		// Primitive class of the current draw type
+		// NOTE(review): fillModeAware presumably lets fillMode influence the answer - confirm in Context.cpp
+		bool isDrawPoint(bool fillModeAware = false) const;
+		bool isDrawLine(bool fillModeAware = false) const;
+		bool isDrawTriangle(bool fillModeAware = false) const;
+
+		void init();
+
+		const float &exp2Bias();   // NOTE: Needs address for JIT
+
+		const Point &getLightPosition(int light);
+
+		void setGlobalMipmapBias(float bias);
+
+		// Set fixed-function vertex pipeline states
+		void setLightingEnable(bool lightingEnable);
+		void setSpecularEnable(bool specularEnable);
+		void setLightEnable(int light, bool lightEnable);
+		void setLightPosition(int light, Point worldLightPosition);
+
+		void setColorVertexEnable(bool colorVertexEnable);
+		void setAmbientMaterialSource(MaterialSource ambientMaterialSource);
+		void setDiffuseMaterialSource(MaterialSource diffuseMaterialSource);
+		void setSpecularMaterialSource(MaterialSource specularMaterialSource);
+		void setEmissiveMaterialSource(MaterialSource emissiveMaterialSource);
+
+		void setPointSpriteEnable(bool pointSpriteEnable);
+		void setPointScaleEnable(bool pointScaleEnable);
+
+		// Set fixed-function pixel pipeline states, return true when modified
+		bool setDepthBufferEnable(bool depthBufferEnable);
+
+		bool setAlphaBlendEnable(bool alphaBlendEnable);
+		bool setSourceBlendFactor(BlendFactor sourceBlendFactor);
+		bool setDestBlendFactor(BlendFactor destBlendFactor);
+		bool setBlendOperation(BlendOperation blendOperation);
+
+		bool setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable);
+		bool setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha);
+		bool setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha);
+		bool setBlendOperationAlpha(BlendOperation blendOperationAlpha);
+
+		bool setColorWriteMask(int index, int colorWriteMask);
+		bool setWriteSRGB(bool sRGB);
+
+		bool setColorLogicOpEnabled(bool colorLogicOpEnabled);
+		bool setLogicalOperation(LogicalOperation logicalOperation);
+
+		// Active fixed-function pixel pipeline states
+		bool fogActive();
+		bool pointSizeActive();
+		FogMode pixelFogActive();
+		bool depthWriteActive();
+		bool alphaTestActive();
+		bool depthBufferActive();   // depthBuffer set and depthBufferEnable
+		bool stencilActive();       // stencilBuffer set and stencilEnable
+
+		bool perspectiveActive();
+
+		// Active fixed-function vertex pipeline states
+		// Most of these report an inactive or neutral value while a vertex shader is set
+		bool vertexLightingActive();
+		bool texCoordActive(int coordinate, int component);
+		bool texCoordActive(int coordinate);   // Any of components 0..3
+		bool isProjectionComponent(unsigned int coordinate, int component);
+		bool vertexSpecularInputActive();
+		bool vertexSpecularActive();
+		bool vertexNormalActive();
+		bool vertexLightActive();
+		bool vertexLightActive(int i);
+		MaterialSource vertexDiffuseMaterialSourceActive();
+		MaterialSource vertexSpecularMaterialSourceActive();
+		MaterialSource vertexAmbientMaterialSourceActive();
+		MaterialSource vertexEmissiveMaterialSourceActive();
+
+		bool pointSpriteActive();
+		bool pointScaleActive();
+
+		// Effective blend state: the stored *State members below, adjusted for
+		// the enable flags and reduced to the simplest equivalent operation
+		bool alphaBlendActive();
+		BlendFactor sourceBlendFactor();
+		BlendFactor destBlendFactor();
+		BlendOperation blendOperation();
+
+		// Alpha-channel counterparts; they return the color results unless
+		// separateAlphaBlendEnable is set
+		BlendFactor sourceBlendFactorAlpha();
+		BlendFactor destBlendFactorAlpha();
+		BlendOperation blendOperationAlpha();
+
+		LogicalOperation colorLogicOp();   // LOGICALOP_COPY while colorLogicOpEnabled is false
+		LogicalOperation indexLogicOp();
+
+		bool indexedVertexBlendActive();
+		int vertexBlendMatrixCountActive();
+		bool localViewerActive();
+		bool normalizeNormalsActive();
+		FogMode vertexFogModeActive();
+		bool rangeFogActive();
+
+		TexGen texGenActive(int stage);
+		int textureTransformCountActive(int stage);
+		int texCoordIndexActive(int stage);
+
+		// Active context states
+		// "Used" means the pixel stage consumes the value; "Active" additionally
+		// requires that the vertex stage provides it
+		bool diffuseUsed();     // Any diffuse component consumed by the pixel stage
+		bool diffuseUsed(int component);     // Consumed by the pixel shader, or by the texture stages when there is no pixel shader
+		bool diffuseActive();
+		bool diffuseActive(int component);
+		bool specularUsed();
+		bool specularUsed(int component);
+		bool specularActive();
+		bool specularActive(int component);
+		bool colorActive(int color, int component);   // color 0 selects diffuse, anything else specular
+		bool textureActive();
+		bool textureActive(int coordinate);
+		bool textureActive(int coordinate, int component);
+
+		// 0x0000 when the corresponding shader is not set
+		unsigned short pixelShaderModel() const;
+		unsigned short vertexShaderModel() const;
+
+		// Taken from renderTarget[0]; 1 when that target is not set
+		int getMultiSampleCount() const;
+		int getSuperSampleCount() const;
+
+		DrawType drawType;
+
+		// Stencil state
+		bool stencilEnable;
+		StencilCompareMode stencilCompareMode;
+		int stencilReference;
+		int stencilMask;
+		StencilOperation stencilFailOperation;
+		StencilOperation stencilPassOperation;
+		StencilOperation stencilZFailOperation;
+		int stencilWriteMask;
+
+		// Second set of stencil state, suffixed CCW
+		bool twoSidedStencil;
+		StencilCompareMode stencilCompareModeCCW;
+		int stencilReferenceCCW;
+		int stencilMaskCCW;
+		StencilOperation stencilFailOperationCCW;
+		StencilOperation stencilPassOperationCCW;
+		StencilOperation stencilZFailOperationCCW;
+		int stencilWriteMaskCCW;
+
+		// Pixel processor states
+		AlphaCompareMode alphaCompareMode;
+		bool alphaTestEnable;
+		FillMode fillMode;
+		ShadingMode shadingMode;
+
+		CullMode cullMode;
+		bool frontFacingCCW;
+		float alphaReference;
+
+		float depthBias;
+		float slopeDepthBias;
+
+		TextureStage textureStage[8];
+		Sampler sampler[TOTAL_IMAGE_UNITS];
+
+		Format renderTargetInternalFormat(int index);   // FORMAT_NULL when the target is not set
+		int colorWriteActive();            // OR of the masks of render targets 0..3
+		int colorWriteActive(int index);   // Effective write mask of one render target
+		bool colorUsed();                  // Color written, alpha test active, or pixel shader contains a kill
+
+		Resource *texture[TOTAL_IMAGE_UNITS];
+		Stream input[MAX_VERTEX_INPUTS];   // Indexed with the In enumerators in Context.cpp
+		Resource *indexBuffer;
+
+		bool preTransformed;   // FIXME: Private
+
+		float fogStart;
+		float fogEnd;
+
+		void computeIllumination();
+
+		bool textureWrapActive;
+		unsigned char textureWrap[TEXTURE_IMAGE_UNITS];
+		TexGen texGen[8];
+		bool localViewer;
+		bool normalizeNormals;
+		int textureTransformCount[8];
+		bool textureTransformProject[8];
+
+		// Output surfaces
+		Surface *renderTarget[RENDERTARGETS];
+		unsigned int renderTargetLayer[RENDERTARGETS];
+		Surface *depthBuffer;
+		unsigned int depthBufferLayer;
+		Surface *stencilBuffer;
+		unsigned int stencilBufferLayer;
+
+		// Fog
+		bool fogEnable;
+		FogMode pixelFogMode;
+		FogMode vertexFogMode;
+		bool wBasedFog;
+		bool rangeFogEnable;
+
+		// Vertex blending
+		bool indexedVertexBlendEnable;
+		int vertexBlendMatrixCount;
+
+		// Shaders
+		const PixelShader *pixelShader;
+		const VertexShader *vertexShader;
+
+		// Global mipmap bias
+		float bias;
+
+		// Instancing
+		int instanceID;
+
+		// Fixed-function vertex pipeline state
+		bool lightingEnable;
+		bool specularEnable;
+		bool lightEnable[8];
+		Point worldLightPosition[8];
+
+		MaterialSource ambientMaterialSource;
+		MaterialSource diffuseMaterialSource;
+		MaterialSource specularMaterialSource;
+		MaterialSource emissiveMaterialSource;
+		bool colorVertexEnable;
+
+		bool occlusionEnabled;
+		bool transformFeedbackQueryEnabled;
+		uint64_t transformFeedbackEnabled;
+
+		// Pixel processor states
+		bool rasterizerDiscard;
+		bool depthBufferEnable;
+		DepthCompareMode depthCompareMode;
+		bool depthWriteEnable;
+
+		// Stored blend state; read it through sourceBlendFactor(), destBlendFactor()
+		// and blendOperation() to get the effective values
+		bool alphaBlendEnable;
+		BlendFactor sourceBlendFactorState;
+		BlendFactor destBlendFactorState;
+		BlendOperation blendOperationState;
+
+		// Stored alpha-channel blend state, consulted only when separateAlphaBlendEnable is set
+		bool separateAlphaBlendEnable;
+		BlendFactor sourceBlendFactorStateAlpha;
+		BlendFactor destBlendFactorStateAlpha;
+		BlendOperation blendOperationStateAlpha;
+
+		bool pointSpriteEnable;
+		bool pointScaleEnable;
+		float lineWidth;
+
+		int colorWriteMask[RENDERTARGETS];   // RGBA
+		bool writeSRGB;
+		unsigned int sampleMask;
+		unsigned int multiSampleMask;
+
+		bool colorLogicOpEnabled;
+		LogicalOperation logicalOperation;
+	};
+}
+
+#endif   // sw_Context_hpp
diff --git a/src/Device/ETC_Decoder.cpp b/src/Device/ETC_Decoder.cpp
new file mode 100644
index 0000000..dbc6276
--- /dev/null
+++ b/src/Device/ETC_Decoder.cpp
@@ -0,0 +1,741 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ETC_Decoder.hpp"
+
+namespace
+{
+	inline int clampByte(int value)   // Saturates value to the range [0, 255]
+	{
+		return (value > 255) ? 255 : ((value < 0) ? 0 : value);
+	}
+
+	inline int clampSByte(int value)   // Saturates value to the range [-128, 127]
+	{
+		return (value > 127) ? 127 : ((value < -128) ? -128 : value);
+	}
+
+	inline int clampEAC(int value, bool isSigned)   // Saturates to [-1023, 1023] when isSigned, else to [0, 2047]
+	{
+		const int lowest = isSigned ? -1023 : 0;
+		const int highest = isSigned ? 1023 : 2047;
+		return (value > highest) ? highest : ((value < lowest) ? lowest : value);
+	}
+
+	struct bgra8   // One pixel stored as four bytes in blue, green, red, alpha order
+	{
+		unsigned char b;
+		unsigned char g;
+		unsigned char r;
+		unsigned char a;
+
+		inline bgra8()   // Leaves every channel uninitialized
+		{
+		}
+
+		inline void set(int red, int green, int blue)   // Stores the clamped color; alpha is not touched
+		{
+			b = static_cast<unsigned char>(clampByte(blue));
+			g = static_cast<unsigned char>(clampByte(green));
+			r = static_cast<unsigned char>(clampByte(red));
+		}
+
+		inline void set(int red, int green, int blue, int alpha)   // Stores all four channels, each clamped to a byte
+		{
+			// The color channels are handled by the three-argument overload.
+			set(red, green, blue);
+
+			a = static_cast<unsigned char>(clampByte(alpha));
+		}
+
+		const bgra8& addA(int alpha)   // Overwrites alpha without clamping and returns this pixel
+		{
+			a = static_cast<unsigned char>(alpha);
+			return *this;
+		}
+	};
+
+	inline int extend_4to8bits(int x)   // Widens a 4-bit value to 8 bits by repeating it in both nibbles
+	{
+		return x | (x << 4);
+	}
+
+	inline int extend_5to8bits(int x)   // Widens a 5-bit value to 8 bits, refilling the low bits from its top 3 bits
+	{
+		return (x >> 2) | (x << 3);
+	}
+
+	inline int extend_6to8bits(int x)   // Widens a 6-bit value to 8 bits, refilling the low bits from its top 2 bits
+	{
+		return (x >> 4) | (x << 2);
+	}
+
+	inline int extend_7to8bits(int x)   // Widens a 7-bit value to 8 bits, refilling the low bit from its top bit
+	{
+		return (x >> 6) | (x << 1);
+	}
+
+	struct ETC2
+	{
+		// Decodes one 4x4 block of nbChannels interleaved channels (sources[c] supplies channel c), skipping texels at or beyond w x h
+		static void DecodeBlock(const ETC2** sources, unsigned char *dest, int nbChannels, int x, int y, int w, int h, int pitch, bool isSigned, bool isEAC)
+		{
+			if(isEAC)   // EAC: every sample is clamped by clampEAC() and stored as one int
+			{
+				for(int j = 0; j < 4 && (y + j) < h; j++)
+				{
+					int* sDst = reinterpret_cast<int*>(dest);
+					for(int i = 0; i < 4 && (x + i) < w; i++)
+					{
+						for(int c = nbChannels - 1; c >= 0; c--)
+						{
+							sDst[i * nbChannels + c] = clampEAC(sources[c]->getSingleChannel(i, j, isSigned, true), isSigned);
+						}
+					}
+					dest += pitch;   // pitch is a byte stride in all three branches
+				}
+			}
+			else
+			{
+				if(isSigned)   // Non-EAC signed: one signed char per sample
+				{
+					signed char* sDst = reinterpret_cast<signed char*>(dest);
+					for(int j = 0; j < 4 && (y + j) < h; j++)
+					{
+						for(int i = 0; i < 4 && (x + i) < w; i++)
+						{
+							for(int c = nbChannels - 1; c >= 0; c--)
+							{
+								sDst[i * nbChannels + c] = clampSByte(sources[c]->getSingleChannel(i, j, isSigned, false));
+							}
+						}
+						sDst += pitch;
+					}
+				}
+				else   // Non-EAC unsigned: one unsigned char per sample
+				{
+					for(int j = 0; j < 4 && (y + j) < h; j++)
+					{
+						for(int i = 0; i < 4 && (x + i) < w; i++)
+						{
+							for(int c = nbChannels - 1; c >= 0; c--)
+							{
+								dest[i * nbChannels + c] = clampByte(sources[c]->getSingleChannel(i, j, isSigned, false));
+							}
+						}
+						dest += pitch;
+					}
+				}
+			}
+		}
+
+		// Decodes one 4x4 RGB block to bgra8 pixels at dest, choosing the decoding mode from the block's own bits
+		void decodeBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool punchThroughAlpha) const
+		{
+			bool opaqueBit = diffbit;   // Same bit as diffbit; only consulted for punch-through blocks
+			bool nonOpaquePunchThroughAlpha = punchThroughAlpha && !opaqueBit;
+
+			// Select mode
+			if(diffbit || punchThroughAlpha)   // Punch-through blocks never take the individual-mode branch
+			{
+				int r = (R + dR);
+				int g = (G + dG);
+				int b = (B + dB);
+				if(r < 0 || r > 31)   // Base plus delta outside the 5-bit range selects T, H or planar mode
+				{
+					decodeTBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+				}
+				else if(g < 0 || g > 31)
+				{
+					decodeHBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+				}
+				else if(b < 0 || b > 31)
+				{
+					decodePlanarBlock(dest, x, y, w, h, pitch, alphaValues);   // Planar mode takes no punch-through flag
+				}
+				else
+				{
+					decodeDifferentialBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+				}
+			}
+			else
+			{
+				decodeIndividualBlock(dest, x, y, w, h, pitch, alphaValues, nonOpaquePunchThroughAlpha);
+			}
+		}
+
+	private:
+		struct   // Raw block bits; each nested struct below is an alternative view of the same bytes
+		{
+			union
+			{
+				// Individual, differential, H and T modes
+				struct
+				{
+					union
+					{
+						// Individual and differential modes
+						struct
+						{
+							union
+							{
+								struct   // Individual colors
+								{
+									unsigned char R2 : 4;
+									unsigned char R1 : 4;
+									unsigned char G2 : 4;
+									unsigned char G1 : 4;
+									unsigned char B2 : 4;
+									unsigned char B1 : 4;
+								};
+
+								struct   // Differential colors
+								{
+									signed char dR : 3;
+									unsigned char R : 5;
+									signed char dG : 3;
+									unsigned char G : 5;
+									signed char dB : 3;
+									unsigned char B : 5;
+								};
+							};
+
+							bool flipbit : 1;   // Selects how the block is split into its two sub-blocks
+							bool diffbit : 1;   // Differential mode when set; also read as the opaque bit of punch-through blocks
+							unsigned char cw2 : 3;   // Intensity table row for the second sub-block
+							unsigned char cw1 : 3;   // Intensity table row for the first sub-block
+						};
+
+						// T mode
+						struct
+						{
+							// Byte 1
+							unsigned char TR1b : 2;
+							unsigned char TdummyB : 1;
+							unsigned char TR1a : 2;
+							unsigned char TdummyA : 3;
+
+							// Byte 2
+							unsigned char TB1 : 4;
+							unsigned char TG1 : 4;
+
+							// Byte 3
+							unsigned char TG2 : 4;
+							unsigned char TR2 : 4;
+
+							// Byte 4
+							unsigned char Tdb : 1;
+							bool Tflipbit : 1;
+							unsigned char Tda : 2;
+							unsigned char TB2 : 4;
+						};
+
+						// H mode
+						struct
+						{
+							// Byte 1
+							unsigned char HG1a : 3;
+							unsigned char HR1 : 4;
+							unsigned char HdummyA : 1;
+
+							// Byte 2
+							unsigned char HB1b : 2;
+							unsigned char HdummyC : 1;
+							unsigned char HB1a : 1;
+							unsigned char HG1b : 1;
+							unsigned char HdummyB : 3;
+
+							// Byte 3
+							unsigned char HG2a : 3;
+							unsigned char HR2 : 4;
+							unsigned char HB1c : 1;
+
+							// Byte 4
+							unsigned char Hdb : 1;
+							bool Hflipbit : 1;
+							unsigned char Hda : 1;
+							unsigned char HB2 : 4;
+							unsigned char HG2b : 1;
+						};
+					};
+
+					unsigned char pixelIndexMSB[2];   // High bit of every pixel's 2-bit index, read by getIndex()
+					unsigned char pixelIndexLSB[2];   // Low bit of every pixel's 2-bit index, read by getIndex()
+				};
+
+				// planar mode
+				struct
+				{
+					// Byte 1
+					unsigned char GO1 : 1;
+					unsigned char RO : 6;
+					unsigned char PdummyA : 1;
+
+					// Byte 2
+					unsigned char BO1 : 1;
+					unsigned char GO2 : 6;
+					unsigned char PdummyB : 1;
+
+					// Byte 3
+					unsigned char BO3a : 2;
+					unsigned char PdummyD : 1;
+					unsigned char BO2 : 2;
+					unsigned char PdummyC : 3;
+
+					// Byte 4
+					unsigned char RH2 : 1;
+					bool Pflipbit : 1;
+					unsigned char RH1 : 5;
+					unsigned char BO3b : 1;
+
+					// Byte 5
+					unsigned char BHa : 1;
+					unsigned char GH : 7;
+
+					// Byte 6
+					unsigned char RVa : 3;
+					unsigned char BHb : 5;
+
+					// Byte 7
+					unsigned char GVa : 5;
+					unsigned char RVb : 3;
+
+					// Byte 8
+					unsigned char BV : 6;
+					unsigned char GVb : 2;
+				};
+
+				// Single channel block
+				struct
+				{
+					union
+					{
+						unsigned char base_codeword;   // The same byte, read as unsigned...
+						signed char signed_base_codeword;   // ...or as signed, chosen by getSingleChannel()
+					};
+
+					unsigned char table_index : 4;   // Row of the modifier table in getSingleChannelModifier()
+					unsigned char multiplier : 4;   // Scale applied to the modifier in getSingleChannel()
+
+					unsigned char mc1 : 2;   // ma..mp are 3-bit per-pixel indices; mc, mf, mk and mn are split across two bytes
+					unsigned char mb : 3;
+					unsigned char ma : 3;
+
+					unsigned char mf1 : 1;
+					unsigned char me : 3;
+					unsigned char md : 3;
+					unsigned char mc2 : 1;
+
+					unsigned char mh : 3;
+					unsigned char mg : 3;
+					unsigned char mf2 : 2;
+
+					unsigned char mk1 : 2;
+					unsigned char mj : 3;
+					unsigned char mi : 3;
+
+					unsigned char mn1 : 1;
+					unsigned char mm : 3;
+					unsigned char ml : 3;
+					unsigned char mk2 : 1;
+
+					unsigned char mp : 3;
+					unsigned char mo : 3;
+					unsigned char mn2 : 2;
+				};
+			};
+		};
+
+		void decodeIndividualBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+		{
+			// Individual mode: both base colors are stored as independent 4-bit channels.
+			const int red1 = extend_4to8bits(R1);
+			const int green1 = extend_4to8bits(G1);
+			const int blue1 = extend_4to8bits(B1);
+			const int red2 = extend_4to8bits(R2);
+			const int green2 = extend_4to8bits(G2);
+			const int blue2 = extend_4to8bits(B2);
+
+			decodeIndividualOrDifferentialBlock(dest, x, y, w, h, pitch, red1, green1, blue1, red2, green2, blue2, alphaValues, nonOpaquePunchThroughAlpha);
+		}
+
+		void decodeDifferentialBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+		{
+			// Differential mode: the second base color is the 5-bit first color offset by the signed 3-bit deltas.
+			const int red1 = extend_5to8bits(R);
+			const int green1 = extend_5to8bits(G);
+			const int blue1 = extend_5to8bits(B);
+			const int red2 = extend_5to8bits(R + dR);
+			const int green2 = extend_5to8bits(G + dG);
+			const int blue2 = extend_5to8bits(B + dB);
+
+			decodeIndividualOrDifferentialBlock(dest, x, y, w, h, pitch, red1, green1, blue1, red2, green2, blue2, alphaValues, nonOpaquePunchThroughAlpha);
+		}
+
+		void decodeIndividualOrDifferentialBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, int r1, int g1, int b1, int r2, int g2, int b2, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+		{   // Shared tail of individual and differential modes: (r1, g1, b1) and (r2, g2, b2) are the 8-bit base colors of the two sub-blocks
+			// Table 3.17.2 sorted according to table 3.17.3
+			static const int intensityModifierDefault[8][4] =
+			{
+				{ 2, 8, -2, -8 },
+				{ 5, 17, -5, -17 },
+				{ 9, 29, -9, -29 },
+				{ 13, 42, -13, -42 },
+				{ 18, 60, -18, -60 },
+				{ 24, 80, -24, -80 },
+				{ 33, 106, -33, -106 },
+				{ 47, 183, -47, -183 }
+			};
+
+			// Table C.12, intensity modifier for non opaque punchthrough alpha
+			static const int intensityModifierNonOpaque[8][4] =
+			{
+				{ 0, 8, 0, -8 },
+				{ 0, 17, 0, -17 },
+				{ 0, 29, 0, -29 },
+				{ 0, 42, 0, -42 },
+				{ 0, 60, 0, -60 },
+				{ 0, 80, 0, -80 },
+				{ 0, 106, 0, -106 },
+				{ 0, 183, 0, -183 }
+			};
+
+			const int(&intensityModifier)[8][4] = nonOpaquePunchThroughAlpha ? intensityModifierNonOpaque : intensityModifierDefault;
+
+			bgra8 subblockColors0[4];   // The four candidate colors of the first sub-block, indexed by getIndex()
+			bgra8 subblockColors1[4];   // The four candidate colors of the second sub-block
+
+			const int i10 = intensityModifier[cw1][0];
+			const int i11 = intensityModifier[cw1][1];
+			const int i12 = intensityModifier[cw1][2];
+			const int i13 = intensityModifier[cw1][3];
+
+			subblockColors0[0].set(r1 + i10, g1 + i10, b1 + i10);   // set() clamps each channel to a byte
+			subblockColors0[1].set(r1 + i11, g1 + i11, b1 + i11);
+			subblockColors0[2].set(r1 + i12, g1 + i12, b1 + i12);
+			subblockColors0[3].set(r1 + i13, g1 + i13, b1 + i13);
+
+			const int i20 = intensityModifier[cw2][0];
+			const int i21 = intensityModifier[cw2][1];
+			const int i22 = intensityModifier[cw2][2];
+			const int i23 = intensityModifier[cw2][3];
+
+			subblockColors1[0].set(r2 + i20, g2 + i20, b2 + i20);
+			subblockColors1[1].set(r2 + i21, g2 + i21, b2 + i21);
+			subblockColors1[2].set(r2 + i22, g2 + i22, b2 + i22);
+			subblockColors1[3].set(r2 + i23, g2 + i23, b2 + i23);
+
+			unsigned char* destStart = dest;   // Kept for the punch-through pass, which restarts at the first row
+
+			if(flipbit)   // Sub-blocks are the top two rows and the bottom two rows
+			{
+				for(int j = 0; j < 2 && (y + j) < h; j++)
+				{
+					bgra8* color = (bgra8*)dest;
+					if((x + 0) < w) color[0] = subblockColors0[getIndex(0, j)].addA(alphaValues[j][0]);
+					if((x + 1) < w) color[1] = subblockColors0[getIndex(1, j)].addA(alphaValues[j][1]);
+					if((x + 2) < w) color[2] = subblockColors0[getIndex(2, j)].addA(alphaValues[j][2]);
+					if((x + 3) < w) color[3] = subblockColors0[getIndex(3, j)].addA(alphaValues[j][3]);
+					dest += pitch;
+				}
+
+				for(int j = 2; j < 4 && (y + j) < h; j++)
+				{
+					bgra8* color = (bgra8*)dest;
+					if((x + 0) < w) color[0] = subblockColors1[getIndex(0, j)].addA(alphaValues[j][0]);
+					if((x + 1) < w) color[1] = subblockColors1[getIndex(1, j)].addA(alphaValues[j][1]);
+					if((x + 2) < w) color[2] = subblockColors1[getIndex(2, j)].addA(alphaValues[j][2]);
+					if((x + 3) < w) color[3] = subblockColors1[getIndex(3, j)].addA(alphaValues[j][3]);
+					dest += pitch;
+				}
+			}
+			else   // Sub-blocks are the left two columns and the right two columns
+			{
+				for(int j = 0; j < 4 && (y + j) < h; j++)
+				{
+					bgra8* color = (bgra8*)dest;
+					if((x + 0) < w) color[0] = subblockColors0[getIndex(0, j)].addA(alphaValues[j][0]);
+					if((x + 1) < w) color[1] = subblockColors0[getIndex(1, j)].addA(alphaValues[j][1]);
+					if((x + 2) < w) color[2] = subblockColors1[getIndex(2, j)].addA(alphaValues[j][2]);
+					if((x + 3) < w) color[3] = subblockColors1[getIndex(3, j)].addA(alphaValues[j][3]);
+					dest += pitch;
+				}
+			}
+
+			if(nonOpaquePunchThroughAlpha)   // Second pass: pixels with index 2 are overwritten with (0, 0, 0, 0)
+			{
+				decodePunchThroughAlphaBlock(destStart, x, y, w, h, pitch);
+			}
+		}
+
+		void decodeTBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+		{   // T mode: color 1 is used as is, color 2 contributes itself and itself +/- a distance
+			// Table C.8, distance index for T and H modes
+			static const int distance[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+
+			bgra8 paintColors[4];   // Palette indexed by the per-pixel value from getIndex()
+
+			int r1 = extend_4to8bits(TR1a << 2 | TR1b);   // Red 1 is stored in two 2-bit pieces
+			int g1 = extend_4to8bits(TG1);
+			int b1 = extend_4to8bits(TB1);
+
+			int r2 = extend_4to8bits(TR2);
+			int g2 = extend_4to8bits(TG2);
+			int b2 = extend_4to8bits(TB2);
+
+			const int d = distance[Tda << 1 | Tdb];   // 3-bit table index assembled from Tda (2 bits) and Tdb (1 bit)
+
+			paintColors[0].set(r1, g1, b1);
+			paintColors[1].set(r2 + d, g2 + d, b2 + d);
+			paintColors[2].set(r2, g2, b2);
+			paintColors[3].set(r2 - d, g2 - d, b2 - d);
+
+			unsigned char* destStart = dest;   // Kept for the punch-through pass, which restarts at the first row
+
+			for(int j = 0; j < 4 && (y + j) < h; j++)
+			{
+				bgra8* color = (bgra8*)dest;
+				if((x + 0) < w) color[0] = paintColors[getIndex(0, j)].addA(alphaValues[j][0]);
+				if((x + 1) < w) color[1] = paintColors[getIndex(1, j)].addA(alphaValues[j][1]);
+				if((x + 2) < w) color[2] = paintColors[getIndex(2, j)].addA(alphaValues[j][2]);
+				if((x + 3) < w) color[3] = paintColors[getIndex(3, j)].addA(alphaValues[j][3]);
+				dest += pitch;
+			}
+
+			if(nonOpaquePunchThroughAlpha)   // Second pass: pixels with index 2 are overwritten with (0, 0, 0, 0)
+			{
+				decodePunchThroughAlphaBlock(destStart, x, y, w, h, pitch);
+			}
+		}
+
+		void decodeHBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4], bool nonOpaquePunchThroughAlpha) const
+		{   // H mode: the palette is color 1 +/- a distance and color 2 +/- the same distance
+			// Table C.8, distance index for T and H modes
+			static const int distance[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+
+			bgra8 paintColors[4];   // Palette indexed by the per-pixel value from getIndex()
+
+			int r1 = extend_4to8bits(HR1);
+			int g1 = extend_4to8bits(HG1a << 1 | HG1b);   // Green 1 and blue 1 are stored in several pieces
+			int b1 = extend_4to8bits(HB1a << 3 | HB1b << 1 | HB1c);
+
+			int r2 = extend_4to8bits(HR2);
+			int g2 = extend_4to8bits(HG2a << 1 | HG2b);
+			int b2 = extend_4to8bits(HB2);
+
+			const int d = distance[(Hda << 2) | (Hdb << 1) | ((r1 << 16 | g1 << 8 | b1) >= (r2 << 16 | g2 << 8 | b2) ? 1 : 0)];   // Lowest index bit is not stored: it is 1 when color 1, packed as 0xRRGGBB, is >= color 2
+
+			paintColors[0].set(r1 + d, g1 + d, b1 + d);
+			paintColors[1].set(r1 - d, g1 - d, b1 - d);
+			paintColors[2].set(r2 + d, g2 + d, b2 + d);
+			paintColors[3].set(r2 - d, g2 - d, b2 - d);
+
+			unsigned char* destStart = dest;   // Kept for the punch-through pass, which restarts at the first row
+
+			for(int j = 0; j < 4 && (y + j) < h; j++)
+			{
+				bgra8* color = (bgra8*)dest;
+				if((x + 0) < w) color[0] = paintColors[getIndex(0, j)].addA(alphaValues[j][0]);
+				if((x + 1) < w) color[1] = paintColors[getIndex(1, j)].addA(alphaValues[j][1]);
+				if((x + 2) < w) color[2] = paintColors[getIndex(2, j)].addA(alphaValues[j][2]);
+				if((x + 3) < w) color[3] = paintColors[getIndex(3, j)].addA(alphaValues[j][3]);
+				dest += pitch;
+			}
+
+			if(nonOpaquePunchThroughAlpha)   // Second pass: pixels with index 2 are overwritten with (0, 0, 0, 0)
+			{
+				decodePunchThroughAlphaBlock(destStart, x, y, w, h, pitch);
+			}
+		}
+
+		void decodePlanarBlock(unsigned char *dest, int x, int y, int w, int h, int pitch, unsigned char alphaValues[4][4]) const
+		{   // Planar mode: each channel is a linear function of the pixel position (i, j) inside the block
+			int ro = extend_6to8bits(RO);   // (ro, go, bo) is the color produced at i == 0, j == 0
+			int go = extend_7to8bits(GO1 << 6 | GO2);
+			int bo = extend_6to8bits(BO1 << 5 | BO2 << 3 | BO3a << 1 | BO3b);
+
+			int rh = extend_6to8bits(RH1 << 1 | RH2);   // (rh, gh, bh) sets the slope along i
+			int gh = extend_7to8bits(GH);
+			int bh = extend_6to8bits(BHa << 5 | BHb);
+
+			int rv = extend_6to8bits(RVa << 3 | RVb);   // (rv, gv, bv) sets the slope along j
+			int gv = extend_7to8bits(GVa << 2 | GVb);
+			int bv = extend_6to8bits(BV);
+
+			for(int j = 0; j < 4 && (y + j) < h; j++)
+			{
+				int ry = j * (rv - ro) + 2;   // Row term, hoisted out of the column loop; the + 2 rounds the >> 2 below
+				int gy = j * (gv - go) + 2;
+				int by = j * (bv - bo) + 2;
+				for(int i = 0; i < 4 && (x + i) < w; i++)
+				{
+					((bgra8*)(dest))[i].set(((i * (rh - ro) + ry) >> 2) + ro,
+						((i * (gh - go) + gy) >> 2) + go,
+						((i * (bh - bo) + by) >> 2) + bo,
+						alphaValues[j][i]);   // set() clamps every channel to a byte
+				}
+				dest += pitch;
+			}
+		}
+
+		// 2-bit pixel index for individual, differential, H and T modes; pixels are numbered x * 4 + y
+		inline int getIndex(int x, int y) const
+		{
+			int bitIndex = x * 4 + y;
+			int bitOffset = bitIndex & 7;   // Bit position inside the selected byte
+			int lsb = (pixelIndexLSB[1 - (bitIndex >> 3)] >> bitOffset) & 1;   // Pixels 0-7 are read from byte [1], pixels 8-15 from byte [0]
+			int msb = (pixelIndexMSB[1 - (bitIndex >> 3)] >> bitOffset) & 1;
+
+			return (msb << 1) | lsb;
+		}
+
+		void decodePunchThroughAlphaBlock(unsigned char *dest, int x, int y, int w, int h, int pitch) const
+		{
+			// Overwrites with (0, 0, 0, 0) every in-bounds pixel of the block whose index is 2.
+			for(int row = 0; row < 4 && (y + row) < h; row++, dest += pitch)
+			{
+				bgra8 *pixels = (bgra8*)dest;
+
+				for(int col = 0; col < 4 && (x + col) < w; col++)
+				{
+					if(getIndex(col, row) != 2) continue;   // Only msb == 1 && lsb == 0 is cleared
+					pixels[col].set(0, 0, 0, 0);
+				}
+			}
+		}
+
+		// Single channel utility functions
+		inline int getSingleChannel(int x, int y, bool isSigned, bool isEAC) const
+		{
+			const int base = isSigned ? signed_base_codeword : base_codeword;
+			const int modifier = getSingleChannelModifier(x, y);
+			if(!isEAC) return base + modifier * multiplier;
+
+			// EAC: a zero multiplier applies the modifier unscaled, otherwise it is scaled by multiplier * 8.
+			return base * 8 + 4 + ((multiplier == 0) ? modifier : modifier * multiplier * 8);
+		}
+
+		inline int getSingleChannelIndex(int x, int y) const   // 3-bit modifier index of pixel (x, y)
+		{
+			switch(x * 4 + y)   // Pixels are numbered x * 4 + y
+			{
+			case 0: return ma;
+			case 1: return mb;
+			case 2: return mc1 << 1 | mc2;   // Split fields are reassembled from their two pieces
+			case 3: return md;
+			case 4: return me;
+			case 5: return mf1 << 2 | mf2;
+			case 6: return mg;
+			case 7: return mh;
+			case 8: return mi;
+			case 9: return mj;
+			case 10: return mk1 << 1 | mk2;
+			case 11: return ml;
+			case 12: return mm;
+			case 13: return mn1 << 2 | mn2;
+			case 14: return mo;
+			default: return mp; // 15
+			}
+		}
+
+		inline int getSingleChannelModifier(int x, int y) const   // Signed modifier of pixel (x, y), before any multiplier is applied
+		{
+			static const int modifierTable[16][8] = { { -3, -6, -9, -15, 2, 5, 8, 14 },   // One row per table_index, one column per pixel index
+			{ -3, -7, -10, -13, 2, 6, 9, 12 },
+			{ -2, -5, -8, -13, 1, 4, 7, 12 },
+			{ -2, -4, -6, -13, 1, 3, 5, 12 },
+			{ -3, -6, -8, -12, 2, 5, 7, 11 },
+			{ -3, -7, -9, -11, 2, 6, 8, 10 },
+			{ -4, -7, -8, -11, 3, 6, 7, 10 },
+			{ -3, -5, -8, -11, 2, 4, 7, 10 },
+			{ -2, -6, -8, -10, 1, 5, 7, 9 },
+			{ -2, -5, -8, -10, 1, 4, 7, 9 },
+			{ -2, -4, -8, -10, 1, 3, 7, 9 },
+			{ -2, -5, -7, -10, 1, 4, 6, 9 },
+			{ -3, -4, -7, -10, 2, 3, 6, 9 },
+			{ -1, -2, -3, -10, 0, 1, 2, 9 },
+			{ -4, -6, -8, -9, 3, 5, 7, 8 },
+			{ -3, -5, -7, -9, 2, 4, 6, 8 } };
+
+			return modifierTable[table_index][getSingleChannelIndex(x, y)];
+		}
+	};
+}
+
+// Decodes a w x h image of 1 to 4 channels one 4x4 block at a time; returns false only for an unrecognized inputType
+bool ETC_Decoder::Decode(const unsigned char* src, unsigned char *dst, int w, int h, int dstW, int dstH, int dstPitch, int dstBpp, InputType inputType)
+{
+	const ETC2* sources[2];   // One block pointer per channel; [1] is only set and used for RG input
+	sources[0] = (const ETC2*)src;
+
+	unsigned char alphaValues[4][4] = { { 255, 255, 255, 255 }, { 255, 255, 255, 255 }, { 255, 255, 255, 255 }, { 255, 255, 255, 255 } };   // Stays fully opaque unless the ETC_RGBA case overwrites it
+
+	switch(inputType)
+	{
+	case ETC_R_SIGNED:
+	case ETC_R_UNSIGNED:   // Decoded with isEAC = true, so DecodeBlock() writes one int per texel
+		for(int y = 0; y < h; y += 4)
+		{
+			unsigned char *dstRow = dst + (y * dstPitch);
+			for(int x = 0; x < w; x += 4, sources[0]++)
+			{
+				ETC2::DecodeBlock(sources, dstRow + (x * dstBpp), 1, x, y, dstW, dstH, dstPitch, inputType == ETC_R_SIGNED, true);
+			}
+		}
+		break;
+	case ETC_RG_SIGNED:
+	case ETC_RG_UNSIGNED:   // Also isEAC = true: two ints per texel
+		sources[1] = sources[0] + 1;   // The second channel's block directly follows the first channel's block
+		for(int y = 0; y < h; y += 4)
+		{
+			unsigned char *dstRow = dst + (y * dstPitch);
+			for(int x = 0; x < w; x += 4, sources[0] += 2, sources[1] += 2)
+			{
+				ETC2::DecodeBlock(sources, dstRow + (x * dstBpp), 2, x, y, dstW, dstH, dstPitch, inputType == ETC_RG_SIGNED, true);
+			}
+		}
+		break;
+	case ETC_RGB:
+	case ETC_RGB_PUNCHTHROUGH_ALPHA:   // One block per 4x4 texels; alpha comes from the opaque default above
+		for(int y = 0; y < h; y += 4)
+		{
+			unsigned char *dstRow = dst + (y * dstPitch);
+			for(int x = 0; x < w; x += 4, sources[0]++)
+			{
+				sources[0]->decodeBlock(dstRow + (x * dstBpp), x, y, dstW, dstH, dstPitch, alphaValues, inputType == ETC_RGB_PUNCHTHROUGH_ALPHA);
+			}
+		}
+		break;
+	case ETC_RGBA:   // Two blocks per 4x4 texels: alpha first, then color
+		for(int y = 0; y < h; y += 4)
+		{
+			unsigned char *dstRow = dst + (y * dstPitch);
+			for(int x = 0; x < w; x += 4)
+			{
+				// Decode Alpha
+				ETC2::DecodeBlock(&sources[0], &(alphaValues[0][0]), 1, x, y, dstW, dstH, 4, false, false);   // Unsigned bytes into the 4x4 scratch array, whose row pitch is 4
+				sources[0]++; // RGBA packets are 128 bits, so move on to the next 64 bit packet to decode the RGB color
+
+				// Decode RGB
+				sources[0]->decodeBlock(dstRow + (x * dstBpp), x, y, dstW, dstH, dstPitch, alphaValues, false);
+				sources[0]++;
+			}
+		}
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
diff --git a/src/Device/ETC_Decoder.hpp b/src/Device/ETC_Decoder.hpp
new file mode 100644
index 0000000..1039b37
--- /dev/null
+++ b/src/Device/ETC_Decoder.hpp
@@ -0,0 +1,41 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+class ETC_Decoder
+{
+public:
+	enum InputType
+	{
+		ETC_R_SIGNED,
+		ETC_R_UNSIGNED,
+		ETC_RG_SIGNED,
+		ETC_RG_UNSIGNED,
+		ETC_RGB,
+		ETC_RGB_PUNCHTHROUGH_ALPHA,
+		ETC_RGBA
+	};
+
+	/// ETC_Decoder::Decode - Decodes 1 to 4 channel images to 8 bit output
+	/// @param src            Pointer to ETC2 encoded image
+	/// @param dst            Pointer to BGRA, 8 bit output
+	/// @param w              src image width
+	/// @param h              src image height
+	/// @param dstW           dst image width
+	/// @param dstH           dst image height
+	/// @param dstPitch       dst image pitch (bytes per row)
+	/// @param dstBpp         dst image bytes per pixel
+	/// @param inputType      src's format
+	/// @return               true if the decoding was performed
+	static bool Decode(const unsigned char* src, unsigned char *dst, int w, int h, int dstW, int dstH, int dstPitch, int dstBpp, InputType inputType);
+};
diff --git a/src/Device/LRUCache.hpp b/src/Device/LRUCache.hpp
new file mode 100644
index 0000000..1a1a302
--- /dev/null
+++ b/src/Device/LRUCache.hpp
@@ -0,0 +1,145 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_LRUCache_hpp
+#define sw_LRUCache_hpp
+
+#include "Common/Math.hpp"
+
+namespace sw
+{
+	template<class Key, class Data>
+	class LRUCache   // Fixed-size cache of Key -> Data*; cached entries are held through Data::bind()/unbind()
+	{
+	public:
+		LRUCache(int n);   // Allocates ceilPow2(n) slots
+		LRUCache(const LRUCache&) = delete;   // Owns raw arrays: a member-wise copy would delete them twice
+		~LRUCache();
+		LRUCache &operator=(const LRUCache&) = delete;
+		Data *query(const Key &key) const;   // Cached data for key, or nullptr when it is not cached
+		Data *add(const Key &key, Data *data);   // Stores data in the next slot, releasing whatever it replaces
+
+		int getSize() const {return size;}   // Number of slots, not the number of slots in use
+		Key &getKey(int i) {return key[i];}   // Key storage element i
+
+	private:
+		int size;   // Slot count
+		int mask;   // size - 1, used to wrap slot indices
+		int top;    // Slot written by the most recent add()
+		int fill;   // Number of slots that have been written, at most size
+
+		Key *key;      // Key storage
+		Key **ref;     // Per-slot pointer into key[]; query() swaps these pointers instead of copying keys
+		Data **data;   // Per-slot payload, nullptr while the slot is empty
+	};
+}
+
+namespace sw
+{
+	template<class Key, class Data>
+	LRUCache<Key, Data>::LRUCache(int n)
+		// Members are initialized in declaration order, so the later ones may read size.
+		: size(ceilPow2(n)),
+		  mask(size - 1),
+		  top(0),
+		  fill(0),
+		  key(new Key[size]),
+		  ref(new Key*[size]),
+		  data(new Data*[size])
+	{
+		// Every slot starts out empty, with its reference aimed at the key
+		// storage element that shares its index.
+		for(int slot = 0; slot < size; slot++)
+		{
+			ref[slot] = &key[slot];
+			data[slot] = nullptr;
+		}
+	}
+
+	template<class Key, class Data>
+	LRUCache<Key, Data>::~LRUCache()
+	{
+		delete[] key;
+		delete[] ref;
+		key = nullptr;
+		ref = nullptr;
+
+		// Give back the reference held on each entry that is still cached.
+		for(int slot = 0; slot < size; slot++)
+		{
+			if(Data *entry = data[slot])
+			{
+				entry->unbind();
+				data[slot] = nullptr;
+			}
+		}
+
+		delete[] data;
+		data = nullptr;
+	}
+
+	template<class Key, class Data>
+	Data *LRUCache<Key, Data>::query(const Key &key) const
+	{
+		for(int age = 0; age < fill; age++)   // age 0 is the slot written by the latest add()
+		{
+			const int slot = (top - age) & mask;
+
+			if(!(key == *ref[slot]))
+			{
+				continue;
+			}
+
+			Data *found = data[slot];
+
+			if(age > 0)
+			{
+				// Promote the hit by trading places with the next newer slot.
+				const int newer = (slot + 1) & mask;
+				data[slot] = data[newer];
+				data[newer] = found;
+
+				Key *hitRef = ref[slot];
+				ref[slot] = ref[newer];
+				ref[newer] = hitRef;
+			}
+
+			return found;
+		}
+
+		return nullptr;   // Not found
+	}
+
+	template<class Key, class Data>
+	Data *LRUCache<Key, Data>::add(const Key &key, Data *data)
+	{
+		// Move on to the next slot of the ring; whatever it holds gets replaced.
+		top = (top + 1) & mask;
+		if(fill < size)
+		{
+			fill++;
+		}
+
+		*ref[top] = key;
+		// Take the new reference before releasing the one being replaced.
+		data->bind();
+		Data *&entry = this->data[top];
+		if(entry) entry->unbind();
+		entry = data;
+
+		return data;
+	}
+}
+
+#endif   // sw_LRUCache_hpp
diff --git a/src/Device/Matrix.cpp b/src/Device/Matrix.cpp
new file mode 100644
index 0000000..0da07e5
--- /dev/null
+++ b/src/Device/Matrix.cpp
@@ -0,0 +1,402 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Matrix.hpp"
+
+#include "Point.hpp"
+#include "Common/Math.hpp"
+
+namespace sw
+{
+	// Returns a matrix with the given values on the main diagonal and zero everywhere else.
+	Matrix Matrix::diag(float m11, float m22, float m33, float m44)
+	{
+		const float diagonal[4] = {m11, m22, m33, m44};
+
+		Matrix D;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				D[row][col] = (row == col) ? diagonal[row] : 0.0f;
+			}
+		}
+
+		return D;
+	}
+
+	// Converts to a pointer to the first element, i.e. element (1, 1).
+	Matrix::operator float*()
+	{
+		return &m[0][0];
+	}
+
+	// Unary plus: returns an unmodified copy of this matrix.
+	Matrix Matrix::operator+() const
+	{
+		Matrix copy(*this);
+
+		return copy;
+	}
+
+	// Unary minus: returns a matrix in which every element is negated.
+	Matrix Matrix::operator-() const
+	{
+		Matrix negated;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				negated[row][col] = -m[row][col];
+			}
+		}
+
+		return negated;
+	}
+
+	// Returns the inverse of this matrix, computed as the adjoint divided by the
+	// determinant. There is no check for a zero determinant.
+	Matrix Matrix::operator!() const
+	{
+		const Matrix &M = *this;
+		Matrix I;
+
+		// 2x2 sub-determinants reused by the cofactors below.
+		// Mabcd stands for M(a, b) * M(c, d) - M(c, b) * M(a, d).
+		float M3344 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
+		float M2344 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
+		float M2334 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
+		float M3244 = M(3, 2) * M(4, 4) - M(4, 2) * M(3, 4);
+		float M2244 = M(2, 2) * M(4, 4) - M(4, 2) * M(2, 4);
+		float M2234 = M(2, 2) * M(3, 4) - M(3, 2) * M(2, 4);
+		float M3243 = M(3, 2) * M(4, 3) - M(4, 2) * M(3, 3);
+		float M2243 = M(2, 2) * M(4, 3) - M(4, 2) * M(2, 3);
+		float M2233 = M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3);
+		float M1344 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
+		float M1334 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
+		float M1244 = M(1, 2) * M(4, 4) - M(4, 2) * M(1, 4);
+		float M1234 = M(1, 2) * M(3, 4) - M(3, 2) * M(1, 4);
+		float M1243 = M(1, 2) * M(4, 3) - M(4, 2) * M(1, 3);
+		float M1233 = M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3);
+		float M1324 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
+		float M1224 = M(1, 2) * M(2, 4) - M(2, 2) * M(1, 4);
+		float M1223 = M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3);
+
+		// Adjoint Matrix
+		I(1, 1) =  M(2, 2) * M3344 - M(3, 2) * M2344 + M(4, 2) * M2334;
+		I(2, 1) = -M(2, 1) * M3344 + M(3, 1) * M2344 - M(4, 1) * M2334;
+		I(3, 1) =  M(2, 1) * M3244 - M(3, 1) * M2244 + M(4, 1) * M2234;
+		I(4, 1) = -M(2, 1) * M3243 + M(3, 1) * M2243 - M(4, 1) * M2233;
+
+		I(1, 2) = -M(1, 2) * M3344 + M(3, 2) * M1344 - M(4, 2) * M1334;
+		I(2, 2) =  M(1, 1) * M3344 - M(3, 1) * M1344 + M(4, 1) * M1334;
+		I(3, 2) = -M(1, 1) * M3244 + M(3, 1) * M1244 - M(4, 1) * M1234;
+		I(4, 2) =  M(1, 1) * M3243 - M(3, 1) * M1243 + M(4, 1) * M1233;
+
+		I(1, 3) =  M(1, 2) * M2344 - M(2, 2) * M1344 + M(4, 2) * M1324;
+		I(2, 3) = -M(1, 1) * M2344 + M(2, 1) * M1344 - M(4, 1) * M1324;
+		I(3, 3) =  M(1, 1) * M2244 - M(2, 1) * M1244 + M(4, 1) * M1224;
+		I(4, 3) = -M(1, 1) * M2243 + M(2, 1) * M1243 - M(4, 1) * M1223;
+
+		I(1, 4) = -M(1, 2) * M2334 + M(2, 2) * M1334 - M(3, 2) * M1324;
+		I(2, 4) =  M(1, 1) * M2334 - M(2, 1) * M1334 + M(3, 1) * M1324;
+		I(3, 4) = -M(1, 1) * M2234 + M(2, 1) * M1234 - M(3, 1) * M1224;
+		I(4, 4) =  M(1, 1) * M2233 - M(2, 1) * M1233 + M(3, 1) * M1223;
+
+		// Division by determinant
+		// The determinant is expanded along the first column of M, reusing the
+		// first row of the adjoint computed above.
+		I /= M(1, 1) * I(1, 1) +
+		     M(2, 1) * I(1, 2) +
+		     M(3, 1) * I(1, 3) +
+		     M(4, 1) * I(1, 4);
+
+		return I;
+	}
+
+	// Returns the transpose: element (row, col) of the result is element (col, row) of this matrix.
+	Matrix Matrix::operator~() const
+	{
+		Matrix transposed;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				transposed[row][col] = m[col][row];
+			}
+		}
+
+		return transposed;
+	}
+
+	// Adds N to this matrix element by element and returns a reference to this matrix.
+	Matrix &Matrix::operator+=(const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] += N[row][col];
+			}
+		}
+
+		return *this;
+	}
+
+	// Subtracts N from this matrix element by element and returns a reference to this matrix.
+	Matrix &Matrix::operator-=(const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] -= N[row][col];
+			}
+		}
+
+		return *this;
+	}
+
+	// Multiplies every element by the scalar s and returns a reference to this matrix.
+	Matrix &Matrix::operator*=(float s)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] *= s;
+			}
+		}
+
+		return *this;
+	}
+
+	// Replaces this matrix with the product (*this) * M and returns a reference to it.
+	Matrix &Matrix::operator*=(const Matrix &M)
+	{
+		const Matrix product = *this * M;
+
+		*this = product;
+
+		return *this;
+	}
+
+	// Divides every element by s, implemented as a multiplication by the reciprocal 1 / s.
+	Matrix &Matrix::operator/=(float s)
+	{
+		return *this *= (1.0f / s);
+	}
+
+	// Returns true when all sixteen elements of M and N compare equal.
+	bool operator==(const Matrix &M, const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				if(M[row][col] != N[row][col])
+				{
+					return false;
+				}
+			}
+		}
+
+		return true;
+	}
+
+	// Returns true when at least one element of M differs from the matching element of N.
+	bool operator!=(const Matrix &M, const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				if(M[row][col] != N[row][col])
+				{
+					return true;
+				}
+			}
+		}
+
+		return false;
+	}
+
+	// Element-wise sum of M and N.
+	Matrix operator+(const Matrix &M, const Matrix &N)
+	{
+		Matrix sum;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				sum[row][col] = M[row][col] + N[row][col];
+			}
+		}
+
+		return sum;
+	}
+
+	// Element-wise difference M - N.
+	Matrix operator-(const Matrix &M, const Matrix &N)
+	{
+		Matrix difference;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				difference[row][col] = M[row][col] - N[row][col];
+			}
+		}
+
+		return difference;
+	}
+
+	// Scalar times matrix: every element of M is multiplied by s.
+	Matrix operator*(float s, const Matrix &M)
+	{
+		Matrix scaled;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				scaled[row][col] = s * M[row][col];
+			}
+		}
+
+		return scaled;
+	}
+
+	// Matrix times scalar: every element of M is multiplied by s.
+	Matrix operator*(const Matrix &M, float s)
+	{
+		Matrix scaled;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				scaled[row][col] = M[row][col] * s;
+			}
+		}
+
+		return scaled;
+	}
+
+	// Matrix product M * N: element (row, col) of the result is the dot product
+	// of row 'row' of M with column 'col' of N.
+	Matrix operator*(const Matrix &M, const Matrix &N)
+	{
+		Matrix product;
+
+		for(int row = 0; row < 4; row++)
+		{
+			const float *a = M[row];
+
+			for(int col = 0; col < 4; col++)
+			{
+				// Terms are summed left to right
+				product[row][col] = a[0] * N[0][col] + a[1] * N[1][col] + a[2] * N[2][col] + a[3] * N[3][col];
+			}
+		}
+
+		return product;
+	}
+
+	// Divides every element of M by s, implemented as a multiplication by the reciprocal 1 / s.
+	Matrix operator/(const Matrix &M, float s)
+	{
+		return M * (1.0f / s);
+	}
+
+	// Transforms the four-component vector v: each output component is the dot
+	// product of one matrix row with (v.x, v.y, v.z, v.w).
+	float4 Matrix::operator*(const float4 &v) const
+	{
+		float result[4];
+
+		for(int row = 0; row < 4; row++)
+		{
+			const float *a = m[row];
+
+			result[row] = a[0] * v.x + a[1] * v.y + a[2] * v.z + a[3] * v.w;
+		}
+
+		return {result[0], result[1], result[2], result[3]};
+	}
+
+	// Determinant of the full 4x4 matrix, expanded along the first column.
+	float Matrix::det(const Matrix &M)
+	{
+		// 2x2 determinants taken from columns 3 and 4; dAB uses rows A and B
+		const float d34 = M(3, 3) * M(4, 4) - M(4, 3) * M(3, 4);
+		const float d24 = M(2, 3) * M(4, 4) - M(4, 3) * M(2, 4);
+		const float d23 = M(2, 3) * M(3, 4) - M(3, 3) * M(2, 4);
+		const float d14 = M(1, 3) * M(4, 4) - M(4, 3) * M(1, 4);
+		const float d13 = M(1, 3) * M(3, 4) - M(3, 3) * M(1, 4);
+		const float d12 = M(1, 3) * M(2, 4) - M(2, 3) * M(1, 4);
+
+		// 3x3 minors belonging to the elements of the first column
+		const float minor1 = M(2, 2) * d34 - M(3, 2) * d24 + M(4, 2) * d23;
+		const float minor2 = M(1, 2) * d34 - M(3, 2) * d14 + M(4, 2) * d13;
+		const float minor3 = M(1, 2) * d24 - M(2, 2) * d14 + M(4, 2) * d12;
+		const float minor4 = M(1, 2) * d23 - M(2, 2) * d13 + M(3, 2) * d12;
+
+		return M(1, 1) * minor1 - M(2, 1) * minor2 + M(3, 1) * minor3 - M(4, 1) * minor4;
+	}
+
+	// Determinant of a 1x1 matrix, which is its single element.
+	float Matrix::det(float m11)
+	{
+		const float determinant = m11;
+
+		return determinant;
+	}
+
+	// Determinant of the 2x2 matrix [m11 m12; m21 m22].
+	float Matrix::det(float m11, float m12,
+	                  float m21, float m22)
+	{
+		const float mainDiagonal = m11 * m22;
+		const float antiDiagonal = m12 * m21;
+
+		return mainDiagonal - antiDiagonal;
+	}
+
+	// Determinant of a 3x3 matrix given element by element, expanded along the first column.
+	float Matrix::det(float m11, float m12, float m13,
+	                  float m21, float m22, float m23,
+	                  float m31, float m32, float m33)
+	{
+		const float minor1 = m22 * m33 - m32 * m23;
+		const float minor2 = m12 * m33 - m32 * m13;
+		const float minor3 = m12 * m23 - m22 * m13;
+
+		return m11 * minor1 - m21 * minor2 + m31 * minor3;
+	}
+
+	// Determinant of a 4x4 matrix given element by element, expanded along the first column.
+	float Matrix::det(float m11, float m12, float m13, float m14,
+	                  float m21, float m22, float m23, float m24,
+	                  float m31, float m32, float m33, float m34,
+	                  float m41, float m42, float m43, float m44)
+	{
+		// 2x2 determinants taken from columns 3 and 4; dAB uses rows A and B
+		const float d34 = m33 * m44 - m43 * m34;
+		const float d24 = m23 * m44 - m43 * m24;
+		const float d23 = m23 * m34 - m33 * m24;
+		const float d14 = m13 * m44 - m43 * m14;
+		const float d13 = m13 * m34 - m33 * m14;
+		const float d12 = m13 * m24 - m23 * m14;
+
+		// 3x3 minors belonging to the elements of the first column
+		const float minor1 = m22 * d34 - m32 * d24 + m42 * d23;
+		const float minor2 = m12 * d34 - m32 * d14 + m42 * d13;
+		const float minor3 = m12 * d24 - m22 * d14 + m42 * d12;
+		const float minor4 = m12 * d23 - m22 * d13 + m32 * d12;
+
+		return m11 * minor1 - m21 * minor2 + m31 * minor3 - m41 * minor4;
+	}
+
+	// Determinant of the 3x3 matrix formed by three vectors, evaluated as
+	// v1 * (v2 % v3) (presumably a dot product with a cross product - see Vector.hpp).
+	float Matrix::det(const Vector &v1, const Vector &v2, const Vector &v3)
+	{
+		const auto v2v3 = v2 % v3;
+
+		return v1 * v2v3;
+	}
+
+	// Determinant of the upper-left 3x3 submatrix of M, expanded along the first column.
+	float Matrix::det3(const Matrix &M)
+	{
+		const float minor1 = M(2, 2) * M(3, 3) - M(3, 2) * M(2, 3);
+		const float minor2 = M(1, 2) * M(3, 3) - M(3, 2) * M(1, 3);
+		const float minor3 = M(1, 2) * M(2, 3) - M(2, 2) * M(1, 3);
+
+		return M(1, 1) * minor1 - M(2, 1) * minor2 + M(3, 1) * minor3;
+	}
+
+	// Trace: the sum of the four elements on the main diagonal.
+	float Matrix::tr(const Matrix &M)
+	{
+		float trace = M(1, 1);
+
+		for(int k = 2; k <= 4; k++)
+		{
+			trace += M(k, k);
+		}
+
+		return trace;
+	}
+
+	// Orthonormalises the upper-left 3x3 submatrix in place, treating its columns
+	// as three vectors. The fourth row and column are left untouched.
+	// Returns a reference to this matrix.
+	Matrix &Matrix::orthogonalise()
+	{
+		// NOTE: Numerically unstable; won't return exactly the same result when the matrix is already orthogonal
+
+		Matrix &M = *this;
+
+		// Column vectors of the 3x3 submatrix
+		Vector v1(M(1, 1), M(2, 1), M(3, 1));
+		Vector v2(M(1, 2), M(2, 2), M(3, 2));
+		Vector v3(M(1, 3), M(2, 3), M(3, 3));
+
+		// Remove from each vector its projection onto the preceding ones
+		// (assumes Vector * Vector is the dot product - see Vector.hpp)
+		v2 -= v1 * (v1 * v2) / (v1 * v1);
+		v3 -= v1 * (v1 * v3) / (v1 * v1);
+		v3 -= v2 * (v2 * v3) / (v2 * v2);
+
+		// Normalise (assumes Vector::N returns the vector's length - TODO confirm)
+		v1 /= Vector::N(v1);
+		v2 /= Vector::N(v2);
+		v3 /= Vector::N(v3);
+
+		// Write the vectors back as columns
+		M(1, 1) = v1.x;  M(1, 2) = v2.x;  M(1, 3) = v3.x;
+		M(2, 1) = v1.y;  M(2, 2) = v2.y;  M(2, 3) = v3.y;
+		M(3, 1) = v1.z;  M(3, 2) = v2.z;  M(3, 3) = v3.z;
+
+		return *this;
+	}
+
+	// Builds a rotation matrix from the three angles stored in v.x, v.y and v.z.
+	// The fourth row and column come from the 3x3 constructor (identity).
+	Matrix Matrix::eulerRotate(const Vector &v)
+	{
+		const float cosZ = cos(v.z);
+		const float sinZ = sin(v.z);
+		const float cosX = cos(v.x);
+		const float sinX = sin(v.x);
+		const float cosY = cos(v.y);
+		const float sinY = sin(v.y);
+
+		// Products shared by several elements
+		const float sinXsinY = sinX * sinY;
+		const float sinXcosY = sinX * cosY;
+
+		return Matrix(cosY * cosZ - sinXsinY * sinZ, -cosY * sinZ - sinXsinY * cosZ, -sinY * cosX,
+		              cosX * sinZ,                    cosX * cosZ,                   -sinX,
+		              sinY * cosZ + sinXcosY * sinZ, -sinY * sinZ + sinXcosY * cosZ,  cosY * cosX);
+	}
+
+	// Convenience overload: packs the three angles into a Vector and forwards to eulerRotate(const Vector&).
+	Matrix Matrix::eulerRotate(float x, float y, float z)
+	{
+		const Vector angles(x, y, z);
+
+		return eulerRotate(angles);
+	}
+
+	// Builds a translation matrix: identity with (v.x, v.y, v.z) in the fourth column.
+	Matrix Matrix::translate(const Vector &v)
+	{
+		Matrix T(1);   // Identity
+
+		T(1, 4) = v.x;
+		T(2, 4) = v.y;
+		T(3, 4) = v.z;
+
+		return T;
+	}
+
+	// Convenience overload: packs the offsets into a Vector and forwards to translate(const Vector&).
+	Matrix Matrix::translate(float x, float y, float z)
+	{
+		const Vector offset(x, y, z);
+
+		return translate(offset);
+	}
+
+	// Builds a scaling matrix with (v.x, v.y, v.z, 1) on the diagonal and zero elsewhere.
+	Matrix Matrix::scale(const Vector &v)
+	{
+		return diag(v.x, v.y, v.z, 1);
+	}
+
+	// Convenience overload: packs the factors into a Vector and forwards to scale(const Vector&).
+	Matrix Matrix::scale(float x, float y, float z)
+	{
+		const Vector factors(x, y, z);
+
+		return scale(factors);
+	}
+
+	// Builds an orientation matrix from the direction v. Three vectors are
+	// derived from v and the constant (0, 0, 1), each divided by Vector::N of
+	// itself (presumably its length - TODO confirm), and the transpose of the
+	// matrix having them as columns is returned.
+	Matrix Matrix::lookAt(const Vector &v)
+	{
+		Vector forward = v;
+		forward /= Vector::N(forward);
+
+		Vector side = forward % Vector(0, 0, 1);
+		side /= Vector::N(side);
+
+		Vector up = side % forward;
+		up /= Vector::N(up);
+
+		const Matrix basis(side, forward, up);
+
+		return ~basis;
+	}
+
+	// Convenience overload: packs the direction into a Vector and forwards to
+	// lookAt(const Vector&), like the other (x, y, z) overloads in this file.
+	// This previously forwarded to translate(), returning a translation matrix
+	// instead of an orientation matrix.
+	Matrix Matrix::lookAt(float x, float y, float z)
+	{
+		return lookAt(Vector(x, y, z));
+	}
+}
diff --git a/src/Device/Matrix.hpp b/src/Device/Matrix.hpp
new file mode 100644
index 0000000..41281a6
--- /dev/null
+++ b/src/Device/Matrix.hpp
@@ -0,0 +1,217 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Matrix_hpp
+#define Matrix_hpp
+
+namespace sw
+{
+	struct Vector;
+	struct Point;
+	struct float4;
+
+	// 4x4 single-precision matrix with arithmetic operators, determinant helpers
+	// and factory functions for common transforms.
+	struct Matrix
+	{
+		Matrix();   // Leaves the elements uninitialized
+		Matrix(const int i);   // i on the diagonal, zero elsewhere
+		Matrix(const float m[16]);   // 16 values, row after row
+		Matrix(const float m[4][4]);   // Element-wise copy of m[row][col]
+		// 3x3 block; the fourth row and column are taken from the identity
+		Matrix(float m11, float m12, float m13,
+		       float m21, float m22, float m23,
+		       float m31, float m32, float m33);
+		Matrix(float m11, float m12, float m13, float m14,
+		       float m21, float m22, float m23, float m24,
+		       float m31, float m32, float m33, float m34,
+		       float m41, float m42, float m43, float m44);
+		Matrix(const Vector &v1, const Vector &v2, const Vector &v3);   // Column vectors
+
+		Matrix &operator=(const Matrix &N);
+
+		// Row major order
+		float m[4][4];
+
+		static Matrix diag(float m11, float m22, float m33, float m44);
+
+		operator float*();   // Pointer to element (1, 1)
+
+		Matrix operator+() const;
+		Matrix operator-() const;
+
+		Matrix operator!() const;   // Inverse
+		Matrix operator~() const;   // Transpose
+
+		Matrix &operator+=(const Matrix &N);
+		Matrix &operator-=(const Matrix &N);
+		Matrix &operator*=(float s);
+		Matrix &operator*=(const Matrix &N);
+		Matrix &operator/=(float s);
+
+		float *operator[](int i);   // Access element [row][col], starting with [0][0]
+		const float *operator[](int i) const;
+
+		float &operator()(int i, int j);   // Access element (row, col), starting with (1, 1)
+		const float &operator()(int i, int j) const;
+
+		friend bool operator==(const Matrix &M, const Matrix &N);
+		friend bool operator!=(const Matrix &M, const Matrix &N);
+
+		friend Matrix operator+(const Matrix &M, const Matrix &N);
+		friend Matrix operator-(const Matrix &M, const Matrix &N);
+		friend Matrix operator*(float s, const Matrix &M);
+		// Defined in Matrix.cpp. Without this declaration 'M * s' written in
+		// another translation unit can resolve to operator*(Matrix, Matrix)
+		// through Matrix(const int), truncating s to an integer.
+		friend Matrix operator*(const Matrix &M, float s);
+		friend Matrix operator*(const Matrix &M, const Matrix &N);
+		friend Matrix operator/(const Matrix &M, float s);
+
+		float4 operator*(const float4 &v) const;
+
+		// Determinants of the whole matrix, of 1x1 to 4x4 matrices given element
+		// by element, of three vectors, and of the upper-left 3x3 submatrix
+		static float det(const Matrix &M);
+		static float det(float m11);
+		static float det(float m11, float m12,
+		                 float m21, float m22);
+		static float det(float m11, float m12, float m13,
+		                 float m21, float m22, float m23,
+		                 float m31, float m32, float m33);
+		static float det(float m11, float m12, float m13, float m14,
+		                 float m21, float m22, float m23, float m24,
+		                 float m31, float m32, float m33, float m34,
+		                 float m41, float m42, float m43, float m44);
+		static float det(const Vector &v1, const Vector &v2, const Vector &v3);
+		static float det3(const Matrix &M);
+
+		static float tr(const Matrix &M);   // Sum of the diagonal elements
+
+		Matrix &orthogonalise();   // Gram-Schmidt orthogonalisation of 3x3 submatrix
+
+		static Matrix eulerRotate(const Vector &v);
+		static Matrix eulerRotate(float x, float y, float z);
+
+		static Matrix translate(const Vector &v);
+		static Matrix translate(float x, float y, float z);
+
+		static Matrix scale(const Vector &v);
+		static Matrix scale(float x, float y, float z);
+
+		static Matrix lookAt(const Vector &v);
+		static Matrix lookAt(float x, float y, float z);
+	};
+}
+
+#include "Vector.hpp"
+
+namespace sw
+{
+	// Default constructor: deliberately does nothing, so the elements are left uninitialized.
+	inline Matrix::Matrix()
+	{
+	}
+
+	// Builds a matrix with i (converted to float) on the main diagonal and zero
+	// elsewhere, so Matrix(1) is the identity and Matrix(0) the zero matrix.
+	inline Matrix::Matrix(const int i)
+	{
+		const float s = (float)i;
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = (row == col) ? s : 0.0f;
+			}
+		}
+	}
+
+	// Builds a matrix from 16 consecutive values laid out row after row:
+	// element (row, col), counted from zero, is read from m[4 * row + col].
+	// Element (3, 2) used to be read from m[8] instead of m[9].
+	inline Matrix::Matrix(const float m[16])
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				// The parameter hides the member of the same name, hence 'this->'
+				this->m[row][col] = m[4 * row + col];
+			}
+		}
+	}
+
+	// Builds a matrix by copying the 4x4 array m element by element.
+	inline Matrix::Matrix(const float m[4][4])
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				// The parameter hides the member of the same name, hence 'this->'
+				this->m[row][col] = m[row][col];
+			}
+		}
+	}
+
+	// Builds a matrix from a 3x3 block; the fourth row and column are those of the identity.
+	inline Matrix::Matrix(float m11, float m12, float m13,
+	                      float m21, float m22, float m23,
+	                      float m31, float m32, float m33)
+	{
+		const float rows[4][4] =
+		{
+			{m11,  m12,  m13,  0.0f},
+			{m21,  m22,  m23,  0.0f},
+			{m31,  m32,  m33,  0.0f},
+			{0.0f, 0.0f, 0.0f, 1.0f}
+		};
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = rows[row][col];
+			}
+		}
+	}
+
+	// Builds a matrix from all sixteen elements, given row after row.
+	inline Matrix::Matrix(float m11, float m12, float m13, float m14,
+	                      float m21, float m22, float m23, float m24,
+	                      float m31, float m32, float m33, float m34,
+	                      float m41, float m42, float m43, float m44)
+	{
+		const float rows[4][4] =
+		{
+			{m11, m12, m13, m14},
+			{m21, m22, m23, m24},
+			{m31, m32, m33, m34},
+			{m41, m42, m43, m44}
+		};
+
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = rows[row][col];
+			}
+		}
+	}
+
+	// Builds a matrix whose first three columns are v1, v2 and v3; the fourth
+	// row and column are those of the identity.
+	inline Matrix::Matrix(const Vector &v1, const Vector &v2, const Vector &v3)
+	{
+		// First column
+		m[0][0] = v1.x;
+		m[1][0] = v1.y;
+		m[2][0] = v1.z;
+
+		// Second column
+		m[0][1] = v2.x;
+		m[1][1] = v2.y;
+		m[2][1] = v2.z;
+
+		// Third column
+		m[0][2] = v3.x;
+		m[1][2] = v3.y;
+		m[2][2] = v3.z;
+
+		// Fourth row and column
+		m[0][3] = 0;
+		m[1][3] = 0;
+		m[2][3] = 0;
+		m[3][0] = 0;
+		m[3][1] = 0;
+		m[3][2] = 0;
+		m[3][3] = 1;
+	}
+
+	// Copies all sixteen elements of N into this matrix and returns a reference to it.
+	inline Matrix &Matrix::operator=(const Matrix &N)
+	{
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				m[row][col] = N[row][col];
+			}
+		}
+
+		return *this;
+	}
+
+	// Returns a pointer to the first element of row i (zero-based), enabling M[row][col].
+	inline float *Matrix::operator[](int i)
+	{
+		return &m[i][0];
+	}
+
+	// Read-only variant: pointer to the first element of row i (zero-based).
+	inline const float *Matrix::operator[](int i) const
+	{
+		return &m[i][0];
+	}
+
+	// Returns element (i, j) using one-based row and column numbers.
+	inline float &Matrix::operator()(int i, int j)
+	{
+		const int row = i - 1;
+		const int col = j - 1;
+
+		return m[row][col];
+	}
+
+	// Read-only variant: element (i, j) using one-based row and column numbers.
+	inline const float &Matrix::operator()(int i, int j) const
+	{
+		const int row = i - 1;
+		const int col = j - 1;
+
+		return m[row][col];
+	}
+}
+
+#endif   // Matrix_hpp
diff --git a/src/Device/PixelProcessor.cpp b/src/Device/PixelProcessor.cpp
new file mode 100644
index 0000000..8bc40c2
--- /dev/null
+++ b/src/Device/PixelProcessor.cpp
@@ -0,0 +1,1212 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelProcessor.hpp"
+
+#include "Surface.hpp"
+#include "Primitive.hpp"
+#include "Shader/PixelPipeline.hpp"
+#include "Shader/PixelProgram.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+	// Global settings declared here but defined in other translation units.
+	extern bool complementaryDepthBuffer;
+	extern TransparencyAntialiasing transparencyAntialiasing;
+	extern bool perspectiveCorrection;
+
+	// Global flag defined in this file; off by default.
+	bool precachePixel = false;
+
+	// Computes a hash of the state by XOR-ing together the sizeof(States) / 4
+	// unsigned int words that make up this object.
+	unsigned int PixelProcessor::States::computeHash()
+	{
+		const unsigned int *word = (unsigned int*)this;
+		const unsigned int *const end = word + sizeof(States) / 4;
+
+		unsigned int result = 0;
+
+		while(word != end)
+		{
+			result ^= *word;
+			++word;
+		}
+
+		return result;
+	}
+
+	// Zero-fills the whole object.
+	PixelProcessor::State::State()
+	{
+		memset(this, 0, sizeof(*this));
+	}
+
+	// Two states are equal when their hashes match and their States parts are
+	// byte-for-byte identical. The hash comparison comes first so that the
+	// memory comparison is skipped for most unequal states.
+	bool PixelProcessor::State::operator==(const State &state) const
+	{
+		const States *lhs = static_cast<const States*>(this);
+		const States *rhs = static_cast<const States*>(&state);
+
+		return hash == state.hash && memcmp(lhs, rhs, sizeof(States)) == 0;
+	}
+
+	// Starts out with no buffer attached and a zero offset.
+	PixelProcessor::UniformBufferInfo::UniformBufferInfo()
+	{
+		offset = 0;
+		buffer = nullptr;
+	}
+
+	// Stores the context, resets the global mipmap bias and creates the routine
+	// cache with 1024 entries.
+	PixelProcessor::PixelProcessor(Context *context) : context(context)
+	{
+		// Global mipmap bias values and their effect:
+		//   -0.5: Round to highest LOD [0.5, 1.0]
+		//    0.0: Round to nearest LOD [0.7, 1.4]
+		//    0.5: Round to lowest LOD  [1.0, 2.0]
+		setGlobalMipmapBias(0.0f);
+
+		// Must be null before setRoutineCacheSize() is called for the first time
+		routineCache = nullptr;
+		setRoutineCacheSize(1024);
+	}
+
+	// Destroys the routine cache.
+	PixelProcessor::~PixelProcessor()
+	{
+		delete routineCache;
+		routineCache = nullptr;
+	}
+
+	// Stores a four-component float constant in c[index]. For the first eight
+	// constants (ps_1_x) a fixed-point copy is also kept in cW[index]: each
+	// component is clamped to [-1, 1], scaled by 4095, rounded, and replicated
+	// into four lanes.
+	void PixelProcessor::setFloatConstant(unsigned int index, const float value[4])
+	{
+		if(index >= FRAGMENT_UNIFORM_VECTORS)
+		{
+			ASSERT(false);
+		}
+		else
+		{
+			for(int k = 0; k < 4; k++)
+			{
+				c[index][k] = value[k];
+			}
+		}
+
+		if(index < 8)   // ps_1_x constants
+		{
+			// FIXME: Compact into generic function
+			const short fixedPoint[4] =
+			{
+				(short)iround(4095 * clamp(value[0], -1.0f, 1.0f)),
+				(short)iround(4095 * clamp(value[1], -1.0f, 1.0f)),
+				(short)iround(4095 * clamp(value[2], -1.0f, 1.0f)),
+				(short)iround(4095 * clamp(value[3], -1.0f, 1.0f))
+			};
+
+			for(int component = 0; component < 4; component++)
+			{
+				for(int lane = 0; lane < 4; lane++)
+				{
+					cW[index][component][lane] = fixedPoint[component];
+				}
+			}
+		}
+	}
+
+	// Stores a four-component integer constant; only indices 0-15 are accepted.
+	void PixelProcessor::setIntegerConstant(unsigned int index, const int value[4])
+	{
+		if(index >= 16)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		for(int k = 0; k < 4; k++)
+		{
+			i[index][k] = value[k];
+		}
+	}
+
+	// Stores a boolean constant (any non-zero value counts as true); only indices 0-15 are accepted.
+	void PixelProcessor::setBooleanConstant(unsigned int index, int boolean)
+	{
+		if(index >= 16)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		b[index] = (boolean != 0);
+	}
+
+	// Records the buffer and byte offset for uniform buffer binding 'index'.
+	// NOTE(review): unlike the other setters in this file, index is not range-checked.
+	void PixelProcessor::setUniformBuffer(int index, sw::Resource* buffer, int offset)
+	{
+		UniformBufferInfo &binding = uniformBufferInfo[index];
+
+		binding.buffer = buffer;
+		binding.offset = offset;
+	}
+
+	// For every uniform buffer binding, writes to u[] the locked buffer address
+	// plus the binding's offset (nullptr when no buffer is bound), and writes
+	// the buffer itself to uniformBuffers[].
+	void PixelProcessor::lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[])
+	{
+		for(int binding = 0; binding < MAX_UNIFORM_BUFFER_BINDINGS; ++binding)
+		{
+			const UniformBufferInfo &info = uniformBufferInfo[binding];
+
+			if(info.buffer)
+			{
+				u[binding] = static_cast<byte*>(info.buffer->lock(PUBLIC, PRIVATE)) + info.offset;
+			}
+			else
+			{
+				u[binding] = nullptr;
+			}
+
+			uniformBuffers[binding] = info.buffer;
+		}
+	}
+
+	// Records the surface and layer to use for render target 'index' in the context.
+	void PixelProcessor::setRenderTarget(int index, Surface *renderTarget, unsigned int layer)
+	{
+		context->renderTargetLayer[index] = layer;
+		context->renderTarget[index] = renderTarget;
+	}
+
+	// Records the depth buffer surface and its layer in the context.
+	void PixelProcessor::setDepthBuffer(Surface *depthBuffer, unsigned int layer)
+	{
+		context->depthBufferLayer = layer;
+		context->depthBuffer = depthBuffer;
+	}
+
+	// Records the stencil buffer surface and its layer in the context.
+	void PixelProcessor::setStencilBuffer(Surface *stencilBuffer, unsigned int layer)
+	{
+		context->stencilBufferLayer = layer;
+		context->stencilBuffer = stencilBuffer;
+	}
+
+	// Forwards the texture coordinate index to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setTexCoordIndex(unsigned int stage, int texCoordIndex)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setTexCoordIndex(texCoordIndex);
+	}
+
+	// Forwards the stage operation to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setStageOperation(unsigned int stage, TextureStage::StageOperation stageOperation)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setStageOperation(stageOperation);
+	}
+
+	// Forwards the first source argument to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setFirstArgument(unsigned int stage, TextureStage::SourceArgument firstArgument)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setFirstArgument(firstArgument);
+	}
+
+	// Forwards the second source argument to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setSecondArgument(unsigned int stage, TextureStage::SourceArgument secondArgument)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setSecondArgument(secondArgument);
+	}
+
+	// Forwards the third source argument to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setThirdArgument(unsigned int stage, TextureStage::SourceArgument thirdArgument)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setThirdArgument(thirdArgument);
+	}
+
+	// Forwards the alpha stage operation to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setStageOperationAlpha(unsigned int stage, TextureStage::StageOperation stageOperationAlpha)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setStageOperationAlpha(stageOperationAlpha);
+	}
+
+	// Forwards the first alpha source argument to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setFirstArgumentAlpha(unsigned int stage, TextureStage::SourceArgument firstArgumentAlpha)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setFirstArgumentAlpha(firstArgumentAlpha);
+	}
+
+	// Forwards the second alpha source argument to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setSecondArgumentAlpha(unsigned int stage, TextureStage::SourceArgument secondArgumentAlpha)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setSecondArgumentAlpha(secondArgumentAlpha);
+	}
+
+	// Forwards the third alpha source argument to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setThirdArgumentAlpha(unsigned int stage, TextureStage::SourceArgument thirdArgumentAlpha)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setThirdArgumentAlpha(thirdArgumentAlpha);
+	}
+
+	// Forwards the first argument modifier to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setFirstModifier(unsigned int stage, TextureStage::ArgumentModifier firstModifier)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setFirstModifier(firstModifier);
+	}
+
+	// Forwards the second argument modifier to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setSecondModifier(unsigned int stage, TextureStage::ArgumentModifier secondModifier)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setSecondModifier(secondModifier);
+	}
+
+	// Forwards the third argument modifier to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setThirdModifier(unsigned int stage, TextureStage::ArgumentModifier thirdModifier)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setThirdModifier(thirdModifier);
+	}
+
+	// Forwards the first alpha argument modifier to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setFirstModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier firstModifierAlpha)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setFirstModifierAlpha(firstModifierAlpha);
+	}
+
+	// Forwards the second alpha argument modifier to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setSecondModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier secondModifierAlpha)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setSecondModifierAlpha(secondModifierAlpha);
+	}
+
+	// Forwards the third alpha argument modifier to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setThirdModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier thirdModifierAlpha)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setThirdModifierAlpha(thirdModifierAlpha);
+	}
+
+	// Forwards the destination argument to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setDestinationArgument(unsigned int stage, TextureStage::DestinationArgument destinationArgument)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setDestinationArgument(destinationArgument);
+	}
+
+	// Forwards the constant color to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setConstantColor(unsigned int stage, const Color<float> &constantColor)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setConstantColor(constantColor);
+	}
+
+	// Forwards one bump map matrix element to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setBumpmapMatrix(unsigned int stage, int element, float value)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setBumpmapMatrix(element, value);
+	}
+
+	// Forwards the luminance scale to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setLuminanceScale(unsigned int stage, float value)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setLuminanceScale(value);
+	}
+
+	// Forwards the luminance offset to texture stage 'stage' (0-7 only).
+	void PixelProcessor::setLuminanceOffset(unsigned int stage, float value)
+	{
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->textureStage[stage].setLuminanceOffset(value);
+	}
+
+	// Forwards the texture filter to sampler 'sampler', which must be below TEXTURE_IMAGE_UNITS.
+	void PixelProcessor::setTextureFilter(unsigned int sampler, FilterType textureFilter)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setTextureFilter(textureFilter);
+	}
+
+	// Forwards the mipmap filter to sampler 'sampler', which must be below TEXTURE_IMAGE_UNITS.
+	void PixelProcessor::setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setMipmapFilter(mipmapFilter);
+	}
+
+	// Forwards the gather enable flag to sampler 'sampler', which must be below TEXTURE_IMAGE_UNITS.
+	void PixelProcessor::setGatherEnable(unsigned int sampler, bool enable)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setGatherEnable(enable);
+	}
+
+	// Forwards the U addressing mode to sampler 'sampler', which must be below TEXTURE_IMAGE_UNITS.
+	void PixelProcessor::setAddressingModeU(unsigned int sampler, AddressingMode addressMode)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setAddressingModeU(addressMode);
+	}
+
+	// Forwards the V addressing mode to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setAddressingModeV(unsigned int sampler, AddressingMode addressMode)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setAddressingModeV(addressMode);
+	}
+
+	// Forwards the W addressing mode to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setAddressingModeW(unsigned int sampler, AddressingMode addressMode)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setAddressingModeW(addressMode);
+	}
+
+	// Forwards the sRGB read flag to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setReadSRGB(unsigned int sampler, bool sRGB)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setReadSRGB(sRGB);
+	}
+
+	// Forwards the mipmap LOD bias to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setMipmapLOD(unsigned int sampler, float bias)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setMipmapLOD(bias);
+	}
+
+	// Forwards the border color to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setBorderColor(unsigned int sampler, const Color<float> &borderColor)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setBorderColor(borderColor);
+	}
+
+	// Forwards the maximum anisotropy to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setMaxAnisotropy(unsigned int sampler, float maxAnisotropy)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setMaxAnisotropy(maxAnisotropy);
+	}
+
+	// Forwards the high-precision filtering flag to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setHighPrecisionFiltering(highPrecisionFiltering);
+	}
+
+	// Forwards the red-channel swizzle to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setSwizzleR(unsigned int sampler, SwizzleType swizzleR)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setSwizzleR(swizzleR);
+	}
+
+	// Forwards the green-channel swizzle to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setSwizzleG(unsigned int sampler, SwizzleType swizzleG)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setSwizzleG(swizzleG);
+	}
+
+	// Forwards the blue-channel swizzle to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setSwizzleB(unsigned int sampler, SwizzleType swizzleB)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setSwizzleB(swizzleB);
+	}
+
+	// Forwards the alpha-channel swizzle to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setSwizzleA(unsigned int sampler, SwizzleType swizzleA)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setSwizzleA(swizzleA);
+	}
+
+	// Forwards the compare function to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setCompareFunc(unsigned int sampler, CompareFunc compFunc)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setCompareFunc(compFunc);
+	}
+
+	// Forwards the base mipmap level to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setBaseLevel(unsigned int sampler, int baseLevel)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setBaseLevel(baseLevel);
+	}
+
+	// Forwards the maximum mipmap level to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setMaxLevel(unsigned int sampler, int maxLevel)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setMaxLevel(maxLevel);
+	}
+
+	// Forwards the minimum LOD to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setMinLod(unsigned int sampler, float minLod)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setMinLod(minLod);
+	}
+
+	// Forwards the maximum LOD to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setMaxLod(unsigned int sampler, float maxLod)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setMaxLod(maxLod);
+	}
+
+	// Forwards the sync-required flag to the given pixel sampler.
+	// Indices at or beyond TEXTURE_IMAGE_UNITS trip ASSERT(false) and are ignored.
+	void PixelProcessor::setSyncRequired(unsigned int sampler, bool isSincRequired)
+	{
+		if(sampler >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->sampler[sampler].setSyncRequired(isSincRequired);
+	}
+
+	// Passes the sRGB write flag through to the context.
+	void PixelProcessor::setWriteSRGB(bool sRGB)
+	{
+		context->setWriteSRGB(sRGB);
+	}
+
+	// Passes the color logic-op enable flag through to the context.
+	void PixelProcessor::setColorLogicOpEnabled(bool colorLogicOpEnabled)
+	{
+		context->setColorLogicOpEnabled(colorLogicOpEnabled);
+	}
+
+	// Passes the selected logical operation through to the context.
+	void PixelProcessor::setLogicalOperation(LogicalOperation logicalOperation)
+	{
+		context->setLogicalOperation(logicalOperation);
+	}
+
+	// Passes the depth buffer enable flag through to the context.
+	void PixelProcessor::setDepthBufferEnable(bool depthBufferEnable)
+	{
+		context->setDepthBufferEnable(depthBufferEnable);
+	}
+
+	// Stores the depth compare mode directly in the context.
+	void PixelProcessor::setDepthCompare(DepthCompareMode depthCompareMode)
+	{
+		context->depthCompareMode = depthCompareMode;
+	}
+
+	// Stores the alpha compare mode directly in the context.
+	void PixelProcessor::setAlphaCompare(AlphaCompareMode alphaCompareMode)
+	{
+		context->alphaCompareMode = alphaCompareMode;
+	}
+
+	// Stores the depth write enable flag directly in the context.
+	void PixelProcessor::setDepthWriteEnable(bool depthWriteEnable)
+	{
+		context->depthWriteEnable = depthWriteEnable;
+	}
+
+	// Stores the alpha test enable flag directly in the context.
+	void PixelProcessor::setAlphaTestEnable(bool alphaTestEnable)
+	{
+		context->alphaTestEnable = alphaTestEnable;
+	}
+
+	// Stores the cull mode together with the front-face winding flag in the context.
+	void PixelProcessor::setCullMode(CullMode cullMode, bool frontFacingCCW)
+	{
+		context->cullMode = cullMode;
+		context->frontFacingCCW = frontFacingCCW;
+	}
+
+	// Passes the RGBA write mask for render target 'index' through to the context.
+	void PixelProcessor::setColorWriteMask(int index, int rgbaMask)
+	{
+		context->setColorWriteMask(index, rgbaMask);
+	}
+
+	// Stores the stencil enable flag directly in the context.
+	void PixelProcessor::setStencilEnable(bool stencilEnable)
+	{
+		context->stencilEnable = stencilEnable;
+	}
+
+	// Stores the stencil compare mode (the non-CCW set of stencil state) in the context.
+	void PixelProcessor::setStencilCompare(StencilCompareMode stencilCompareMode)
+	{
+		context->stencilCompareMode = stencilCompareMode;
+	}
+
+	// Records the new stencil reference and refreshes the 'stencil' constants
+	// using the test and write masks currently held by the context.
+	void PixelProcessor::setStencilReference(int stencilReference)
+	{
+		const int testMask = context->stencilMask;
+		const int writeMask = context->stencilWriteMask;
+
+		context->stencilReference = stencilReference;
+		stencil.set(stencilReference, testMask, writeMask);
+	}
+
+	// Records the new CCW stencil reference and refreshes the 'stencilCCW' constants
+	// using the CCW test and write masks currently held by the context.
+	void PixelProcessor::setStencilReferenceCCW(int stencilReferenceCCW)
+	{
+		const int testMask = context->stencilMaskCCW;
+		const int writeMask = context->stencilWriteMaskCCW;
+
+		context->stencilReferenceCCW = stencilReferenceCCW;
+		stencilCCW.set(stencilReferenceCCW, testMask, writeMask);
+	}
+
+	// Records the new stencil test mask and refreshes the 'stencil' constants
+	// using the reference and write mask currently held by the context.
+	void PixelProcessor::setStencilMask(int stencilMask)
+	{
+		const int reference = context->stencilReference;
+		const int writeMask = context->stencilWriteMask;
+
+		context->stencilMask = stencilMask;
+		stencil.set(reference, stencilMask, writeMask);
+	}
+
+	// Records the new CCW stencil test mask and refreshes the 'stencilCCW' constants
+	// using the CCW reference and write mask currently held by the context.
+	void PixelProcessor::setStencilMaskCCW(int stencilMaskCCW)
+	{
+		const int reference = context->stencilReferenceCCW;
+		const int writeMask = context->stencilWriteMaskCCW;
+
+		context->stencilMaskCCW = stencilMaskCCW;
+		stencilCCW.set(reference, stencilMaskCCW, writeMask);
+	}
+
+	// Stores the stencil-fail operation (the non-CCW set of stencil state) in the context.
+	void PixelProcessor::setStencilFailOperation(StencilOperation stencilFailOperation)
+	{
+		context->stencilFailOperation = stencilFailOperation;
+	}
+
+	// Stores the stencil-pass operation (the non-CCW set of stencil state) in the context.
+	void PixelProcessor::setStencilPassOperation(StencilOperation stencilPassOperation)
+	{
+		context->stencilPassOperation = stencilPassOperation;
+	}
+
+	// Stores the stencil Z-fail operation (the non-CCW set of stencil state) in the context.
+	void PixelProcessor::setStencilZFailOperation(StencilOperation stencilZFailOperation)
+	{
+		context->stencilZFailOperation = stencilZFailOperation;
+	}
+
+	// Records the new stencil write mask and refreshes the 'stencil' constants
+	// using the reference and test mask currently held by the context.
+	void PixelProcessor::setStencilWriteMask(int stencilWriteMask)
+	{
+		const int reference = context->stencilReference;
+		const int testMask = context->stencilMask;
+
+		context->stencilWriteMask = stencilWriteMask;
+		stencil.set(reference, testMask, stencilWriteMask);
+	}
+
+	// Records the new CCW stencil write mask and refreshes the 'stencilCCW' constants
+	// using the CCW reference and test mask currently held by the context.
+	void PixelProcessor::setStencilWriteMaskCCW(int stencilWriteMaskCCW)
+	{
+		const int reference = context->stencilReferenceCCW;
+		const int testMask = context->stencilMaskCCW;
+
+		context->stencilWriteMaskCCW = stencilWriteMaskCCW;
+		stencilCCW.set(reference, testMask, stencilWriteMaskCCW);
+	}
+
+	// Stores the two-sided stencil flag in the context. update() only picks up the
+	// CCW stencil settings when this flag is set.
+	void PixelProcessor::setTwoSidedStencil(bool enable)
+	{
+		context->twoSidedStencil = enable;
+	}
+
+	// Stores the CCW stencil compare mode in the context.
+	void PixelProcessor::setStencilCompareCCW(StencilCompareMode stencilCompareMode)
+	{
+		context->stencilCompareModeCCW = stencilCompareMode;
+	}
+
+	// Stores the CCW stencil-fail operation in the context.
+	void PixelProcessor::setStencilFailOperationCCW(StencilOperation stencilFailOperation)
+	{
+		context->stencilFailOperationCCW = stencilFailOperation;
+	}
+
+	// Stores the CCW stencil-pass operation in the context.
+	void PixelProcessor::setStencilPassOperationCCW(StencilOperation stencilPassOperation)
+	{
+		context->stencilPassOperationCCW = stencilPassOperation;
+	}
+
+	// Stores the CCW stencil Z-fail operation in the context.
+	void PixelProcessor::setStencilZFailOperationCCW(StencilOperation stencilZFailOperation)
+	{
+		context->stencilZFailOperationCCW = stencilZFailOperation;
+	}
+
+	// Converts each channel of the texture factor to fixed point (scaled by 4095 and
+	// rounded) and broadcasts it across the four lanes of factor.textureFactor4[channel].
+	// Channel order is R, G, B, A.
+	void PixelProcessor::setTextureFactor(const Color<float> &textureFactor)
+	{
+		// FIXME: Clamp
+		const short fixedChannel[4] =
+		{
+			static_cast<short>(iround(4095 * textureFactor.r)),
+			static_cast<short>(iround(4095 * textureFactor.g)),
+			static_cast<short>(iround(4095 * textureFactor.b)),
+			static_cast<short>(iround(4095 * textureFactor.a))
+		};
+
+		for(int channel = 0; channel < 4; channel++)
+		{
+			for(int lane = 0; lane < 4; lane++)
+			{
+				factor.textureFactor4[channel][lane] = fixedChannel[channel];
+			}
+		}
+	}
+
+	// Updates every representation of the blend constant kept in 'factor':
+	//   blendConstant4W    - channel scaled by 65535 and rounded, as 16-bit words
+	//   invBlendConstant4W - (1 - channel) scaled by 65535 and rounded, as 16-bit words
+	//   blendConstant4F    - channel as float
+	//   invBlendConstant4F - (1 - channel) as float
+	// Each value is broadcast across the four lanes of its vector; channel order is R, G, B, A.
+	void PixelProcessor::setBlendConstant(const Color<float> &blendConstant)
+	{
+		// FIXME: Clamp
+		const float channelValue[4] = {blendConstant.r, blendConstant.g, blendConstant.b, blendConstant.a};
+
+		for(int channel = 0; channel < 4; channel++)
+		{
+			// The fixed-point values span the full 16-bit range (65535 for an input of 1.0),
+			// which does not fit in a signed short. Convert straight to 'word', matching
+			// setAlphaReference() and setFogColor(), instead of going through 'short'.
+			const word fixed = (word)iround(65535 * channelValue[channel]);
+			const word invFixed = (word)iround(65535 * (1 - channelValue[channel]));
+
+			for(int lane = 0; lane < 4; lane++)
+			{
+				factor.blendConstant4W[channel][lane] = fixed;
+				factor.invBlendConstant4W[channel][lane] = invFixed;
+
+				factor.blendConstant4F[channel][lane] = channelValue[channel];
+				factor.invBlendConstant4F[channel][lane] = 1 - channelValue[channel];
+			}
+		}
+	}
+
+	// Stores the fill mode directly in the context.
+	void PixelProcessor::setFillMode(FillMode fillMode)
+	{
+		context->fillMode = fillMode;
+	}
+
+	// Stores the shading mode in the context. update() treats SHADING_FLAT as a
+	// request for flat color interpolation.
+	void PixelProcessor::setShadingMode(ShadingMode shadingMode)
+	{
+		context->shadingMode = shadingMode;
+	}
+
+	// Passes the alpha blend enable flag through to the context.
+	void PixelProcessor::setAlphaBlendEnable(bool alphaBlendEnable)
+	{
+		context->setAlphaBlendEnable(alphaBlendEnable);
+	}
+
+	// Passes the source blend factor through to the context.
+	void PixelProcessor::setSourceBlendFactor(BlendFactor sourceBlendFactor)
+	{
+		context->setSourceBlendFactor(sourceBlendFactor);
+	}
+
+	// Passes the destination blend factor through to the context.
+	void PixelProcessor::setDestBlendFactor(BlendFactor destBlendFactor)
+	{
+		context->setDestBlendFactor(destBlendFactor);
+	}
+
+	// Passes the blend operation through to the context.
+	void PixelProcessor::setBlendOperation(BlendOperation blendOperation)
+	{
+		context->setBlendOperation(blendOperation);
+	}
+
+	// Passes the separate-alpha-blend enable flag through to the context.
+	void PixelProcessor::setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable)
+	{
+		context->setSeparateAlphaBlendEnable(separateAlphaBlendEnable);
+	}
+
+	// Passes the alpha-channel source blend factor through to the context.
+	void PixelProcessor::setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha)
+	{
+		context->setSourceBlendFactorAlpha(sourceBlendFactorAlpha);
+	}
+
+	// Passes the alpha-channel destination blend factor through to the context.
+	void PixelProcessor::setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha)
+	{
+		context->setDestBlendFactorAlpha(destBlendFactorAlpha);
+	}
+
+	// Passes the alpha-channel blend operation through to the context.
+	void PixelProcessor::setBlendOperationAlpha(BlendOperation blendOperationAlpha)
+	{
+		context->setBlendOperationAlpha(blendOperationAlpha);
+	}
+
+	// Stores the alpha reference in the context and broadcasts its fixed-point form
+	// (alphaReference * 0x1000 / 0xFF, rounded) across the four lanes of factor.alphaReference4.
+	void PixelProcessor::setAlphaReference(float alphaReference)
+	{
+		context->alphaReference = alphaReference;
+
+		const word fixedReference = (word)iround(alphaReference * 0x1000 / 0xFF);
+
+		for(int lane = 0; lane < 4; lane++)
+		{
+			factor.alphaReference4[lane] = fixedReference;
+		}
+	}
+
+	// Passes the global mipmap bias through to the context.
+	void PixelProcessor::setGlobalMipmapBias(float bias)
+	{
+		context->setGlobalMipmapBias(bias);
+	}
+
+	// Changes the fog start distance, keeping the fog end currently stored in the
+	// context; setFogRanges() recomputes the derived scale and offset.
+	void PixelProcessor::setFogStart(float start)
+	{
+		setFogRanges(start, context->fogEnd);
+	}
+
+	// Changes the fog end distance, keeping the fog start currently stored in the
+	// context; setFogRanges() recomputes the derived scale and offset.
+	void PixelProcessor::setFogEnd(float end)
+	{
+		setFogRanges(context->fogStart, end);
+	}
+
+	// Stores the fog color's R, G and B channels in two forms: as 16-bit words
+	// (channel * 65535, truncated by the cast) broadcast over the four lanes of
+	// fog.color4[channel], and as replicated floats in fog.colorF[channel].
+	// The alpha channel is not used.
+	void PixelProcessor::setFogColor(Color<float> fogColor)
+	{
+		const float rgb[3] = {fogColor.r, fogColor.g, fogColor.b};
+
+		for(int channel = 0; channel < 3; channel++)
+		{
+			const word fixed = (unsigned short)(65535 * rgb[channel]);
+
+			for(int lane = 0; lane < 4; lane++)
+			{
+				fog.color4[channel][lane] = fixed;
+			}
+		}
+
+		for(int channel = 0; channel < 3; channel++)
+		{
+			fog.colorF[channel] = replicate(rgb[channel]);
+		}
+	}
+
+	// Precomputes the density terms for exponential fog with the exponent rewritten
+	// in base 2: 1/e^x = 2^(-x*1.44).
+	void PixelProcessor::setFogDensity(float fogDensity)
+	{
+		const float log2e = 1.442695f;
+		const float negatedDensity = -fogDensity;
+
+		fog.densityE = replicate(negatedDensity * log2e);
+		fog.density2E = replicate(negatedDensity * fogDensity * log2e);
+	}
+
+	// Stores the pixel fog mode directly in the context.
+	void PixelProcessor::setPixelFogMode(FogMode fogMode)
+	{
+		context->pixelFogMode = fogMode;
+	}
+
+	// Stores the flag in this processor's perspectiveCorrection member (not in the context).
+	void PixelProcessor::setPerspectiveCorrection(bool perspectiveEnable)
+	{
+		perspectiveCorrection = perspectiveEnable;
+	}
+
+	// Stores the occlusion enable flag directly in the context.
+	void PixelProcessor::setOcclusionEnabled(bool enable)
+	{
+		context->occlusionEnabled = enable;
+	}
+
+	// Replaces the routine cache with a new one. The requested size is clamped to
+	// [1, 65536]. The existing cache object is deleted, so previously cached entries
+	// are not carried over. When precachePixel is set the cache is given the name
+	// "sw-pixel", otherwise a null name.
+	void PixelProcessor::setRoutineCacheSize(int cacheSize)
+	{
+		const int entries = clamp(cacheSize, 1, 65536);
+		const char *precacheName = precachePixel ? "sw-pixel" : 0;
+
+		delete routineCache;
+		routineCache = new RoutineCache<State>(entries, precacheName);
+	}
+
+	// Stores the fog range in the context and derives the replicated scale and
+	// offset constants: scale = -1 / (end - start), offset = end * -scale.
+	// The context keeps the values exactly as passed in; only the derived constants
+	// use the adjusted end when the range is empty.
+	void PixelProcessor::setFogRanges(float start, float end)
+	{
+		context->fogStart = start;
+		context->fogEnd = end;
+
+		// Hack: ensure there is a small range
+		const float rangeEnd = (start == end) ? (end + 0.001f) : end;
+
+		const float fogScale = -1.0f / (rangeEnd - start);
+		const float fogOffset = rangeEnd * -fogScale;
+
+		fog.scale = replicate(fogScale);
+		fog.offset = replicate(fogOffset);
+	}
+
+	// Builds the State key describing the pixel pipeline implied by the current
+	// context: shader identity, alpha/depth/stencil tests, fog, blending, render
+	// target formats, multisampling, samplers and interpolants. Fields belonging to
+	// an inactive feature are left as State's constructor initialized them. The
+	// returned state carries its own hash (computed last) and is what routine()
+	// uses to look up or generate the matching routine.
+	const PixelProcessor::State PixelProcessor::update() const
+	{
+		State state;
+
+		// Shader identity; 0 stands for "no pixel shader".
+		if(context->pixelShader)
+		{
+			state.shaderID = context->pixelShader->getSerialID();
+		}
+		else
+		{
+			state.shaderID = 0;
+		}
+
+		state.depthOverride = context->pixelShader && context->pixelShader->depthOverride();
+		state.shaderContainsKill = context->pixelShader ? context->pixelShader->containsKill() : false;
+
+		if(context->alphaTestActive())
+		{
+			state.alphaCompareMode = context->alphaCompareMode;
+
+			// Transparency antialiasing only applies when multisampling.
+			state.transparencyAntialiasing = context->getMultiSampleCount() > 1 ? transparencyAntialiasing : TRANSPARENCY_NONE;
+		}
+
+		state.depthWriteEnable = context->depthWriteActive();
+
+		if(context->stencilActive())
+		{
+			state.stencilActive = true;
+			state.stencilCompareMode = context->stencilCompareMode;
+			state.stencilFailOperation = context->stencilFailOperation;
+			state.stencilPassOperation = context->stencilPassOperation;
+			state.stencilZFailOperation = context->stencilZFailOperation;
+			state.noStencilMask = (context->stencilMask == 0xFF);
+			state.noStencilWriteMask = (context->stencilWriteMask == 0xFF);
+			state.stencilWriteMasked = (context->stencilWriteMask == 0x00);
+
+			// Without two-sided stencil the CCW settings mirror the regular ones.
+			state.twoSidedStencil = context->twoSidedStencil;
+			state.stencilCompareModeCCW = context->twoSidedStencil ? context->stencilCompareModeCCW : state.stencilCompareMode;
+			state.stencilFailOperationCCW = context->twoSidedStencil ? context->stencilFailOperationCCW : state.stencilFailOperation;
+			state.stencilPassOperationCCW = context->twoSidedStencil ? context->stencilPassOperationCCW : state.stencilPassOperation;
+			state.stencilZFailOperationCCW = context->twoSidedStencil ? context->stencilZFailOperationCCW : state.stencilZFailOperation;
+			state.noStencilMaskCCW = context->twoSidedStencil ? (context->stencilMaskCCW == 0xFF) : state.noStencilMask;
+			state.noStencilWriteMaskCCW = context->twoSidedStencil ? (context->stencilWriteMaskCCW == 0xFF) : state.noStencilWriteMask;
+			state.stencilWriteMaskedCCW = context->twoSidedStencil ? (context->stencilWriteMaskCCW == 0x00) : state.stencilWriteMasked;
+		}
+
+		if(context->depthBufferActive())
+		{
+			state.depthTestActive = true;
+			state.depthCompareMode = context->depthCompareMode;
+			state.quadLayoutDepthBuffer = Surface::hasQuadLayout(context->depthBuffer->getInternalFormat());
+		}
+
+		state.occlusionEnabled = context->occlusionEnabled;
+
+		state.fogActive = context->fogActive();
+		state.pixelFogMode = context->pixelFogActive();
+		state.wBasedFog = context->wBasedFog && context->pixelFogActive() != FOG_NONE;
+		state.perspective = context->perspectiveActive();
+		state.depthClamp = (context->depthBias != 0.0f) || (context->slopeDepthBias != 0.0f);
+
+		if(context->alphaBlendActive())
+		{
+			state.alphaBlendActive = true;
+			state.sourceBlendFactor = context->sourceBlendFactor();
+			state.destBlendFactor = context->destBlendFactor();
+			state.blendOperation = context->blendOperation();
+			state.sourceBlendFactorAlpha = context->sourceBlendFactorAlpha();
+			state.destBlendFactorAlpha = context->destBlendFactorAlpha();
+			state.blendOperationAlpha = context->blendOperationAlpha();
+		}
+
+		state.logicalOperation = context->colorLogicOp();
+
+		// Pack one four-bit write mask per render target.
+		for(int i = 0; i < RENDERTARGETS; i++)
+		{
+			state.colorWriteMask |= context->colorWriteActive(i) << (4 * i);
+			state.targetFormat[i] = context->renderTargetInternalFormat(i);
+		}
+
+		state.writeSRGB = context->writeSRGB && context->renderTarget[0] && Surface::isSRGBwritable(context->renderTarget[0]->getExternalFormat());
+		state.multiSample = context->getMultiSampleCount();
+		state.multiSampleMask = context->multiSampleMask;
+
+		if(state.multiSample > 1 && context->pixelShader)
+		{
+			state.centroid = context->pixelShader->containsCentroid();
+		}
+
+		state.frontFaceCCW = context->frontFacingCCW;
+
+		// Texture stage state is only captured when no pixel shader is bound.
+		if(!context->pixelShader)
+		{
+			for(unsigned int i = 0; i < 8; i++)
+			{
+				state.textureStage[i] = context->textureStage[i].textureStageState();
+			}
+
+			state.specularAdd = context->specularActive() && context->specularEnable;
+		}
+
+		// With a shader, capture the samplers it uses. Without one, capture samplers
+		// for the leading run of enabled texture stages and stop at the first disabled one.
+		for(unsigned int i = 0; i < 16; i++)
+		{
+			if(context->pixelShader)
+			{
+				if(context->pixelShader->usesSampler(i))
+				{
+					state.sampler[i] = context->sampler[i].samplerState();
+				}
+			}
+			else
+			{
+				if(i < 8 && state.textureStage[i].stageOperation != TextureStage::STAGE_DISABLE)
+				{
+					state.sampler[i] = context->sampler[i].samplerState();
+				}
+				else break;
+			}
+		}
+
+		const bool point = context->isDrawPoint(true);
+		const bool sprite = context->pointSpriteActive();
+		const bool flatShading = (context->shadingMode == SHADING_FLAT) || point;
+
+		if(context->pixelShaderModel() < 0x0300)
+		{
+			// Below shader model 3.0: separate texture coordinate, color and fog interpolants.
+			for(int coordinate = 0; coordinate < 8; coordinate++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					if(context->textureActive(coordinate, component))
+					{
+						state.texture[coordinate].component |= 1 << component;
+
+						if(point && !sprite)
+						{
+							state.texture[coordinate].flat |= 1 << component;
+						}
+					}
+				}
+
+				if(context->textureTransformProject[coordinate] && context->pixelShaderModel() <= 0x0103)
+				{
+					if(context->textureTransformCount[coordinate] == 2)
+					{
+						state.texture[coordinate].project = 1;
+					}
+					else if(context->textureTransformCount[coordinate] == 3)
+					{
+						state.texture[coordinate].project = 2;
+					}
+					else if(context->textureTransformCount[coordinate] == 4 || context->textureTransformCount[coordinate] == 0)
+					{
+						state.texture[coordinate].project = 3;
+					}
+				}
+			}
+
+			for(int color = 0; color < 2; color++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					if(context->colorActive(color, component))
+					{
+						state.color[color].component |= 1 << component;
+
+						// flatShading is already true whenever a point is drawn.
+						if(flatShading)
+						{
+							state.color[color].flat |= 1 << component;
+						}
+					}
+				}
+			}
+
+			if(context->fogActive())
+			{
+				state.fog.component = true;
+
+				if(point)
+				{
+					state.fog.flat = true;
+				}
+			}
+		}
+		else
+		{
+			// Shader model 3.0 and up: interpolants are described by the shader's input semantics.
+			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					const Shader::Semantic &semantic = context->pixelShader->getInput(interpolant, component);
+
+					if(semantic.active())
+					{
+						bool flat = point;
+
+						switch(semantic.usage)
+						{
+						case Shader::USAGE_TEXCOORD: flat = point && !sprite;             break;
+						case Shader::USAGE_COLOR:    flat = semantic.flat || flatShading; break;
+						}
+
+						state.interpolant[interpolant].component |= 1 << component;
+
+						if(flat)
+						{
+							state.interpolant[interpolant].flat |= 1 << component;
+						}
+					}
+				}
+			}
+		}
+
+		if(state.centroid)
+		{
+			// The centroid flag is read from component 0 of each input, so a single
+			// assignment per interpolant is sufficient.
+			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+			{
+				state.interpolant[interpolant].centroid = context->pixelShader->getInput(interpolant, 0).centroid;
+			}
+		}
+
+		// Must come last: the hash has to cover every field set above.
+		state.hash = state.computeHash();
+
+		return state;
+	}
+
+	// Returns the routine for 'state'. A cached routine is returned directly.
+	// Otherwise one is generated (PixelPipeline for pixel shader models up to
+	// 0x0104, PixelProgram for anything newer), added to the cache and returned.
+	Routine *PixelProcessor::routine(const State &state)
+	{
+		Routine *cached = routineCache->query(state);
+
+		if(cached)
+		{
+			return cached;
+		}
+
+		QuadRasterizer *generator = nullptr;
+
+		if(context->pixelShaderModel() <= 0x0104)
+		{
+			// Integer pipeline
+			generator = new PixelPipeline(state, context->pixelShader);
+		}
+		else
+		{
+			generator = new PixelProgram(state, context->pixelShader);
+		}
+
+		generator->generate();
+		Routine *generated = (*generator)(L"PixelRoutine_%0.8X", state.shaderID);
+		delete generator;
+
+		routineCache->add(state, generated);
+
+		return generated;
+	}
+}
diff --git a/src/Device/PixelProcessor.hpp b/src/Device/PixelProcessor.hpp
new file mode 100644
index 0000000..98300de
--- /dev/null
+++ b/src/Device/PixelProcessor.hpp
@@ -0,0 +1,342 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelProcessor_hpp
+#define sw_PixelProcessor_hpp
+
+#include "Context.hpp"
+#include "RoutineCache.hpp"
+
+namespace sw
+{
+	class PixelShader;
+	class Rasterizer;
+	struct Texture;
+	struct DrawData;
+
+	class PixelProcessor
+	{
+	public:
+		// Packed description of everything that selects a pixel routine. The fields
+		// are filled in by PixelProcessor::update().
+		// NOTE(review): computeHash() and State::operator== are defined elsewhere; if
+		// they work on the raw bytes of this struct, member order and bit widths
+		// matter - confirm before reordering or resizing fields.
+		struct States
+		{
+			unsigned int computeHash();
+
+			// Serial ID of the bound pixel shader, or 0 when none is bound.
+			int shaderID;
+
+			bool depthOverride                        : 1;   // TODO: Eliminate by querying shader.
+			bool shaderContainsKill                   : 1;   // TODO: Eliminate by querying shader.
+
+			DepthCompareMode depthCompareMode         : BITS(DEPTH_LAST);
+			AlphaCompareMode alphaCompareMode         : BITS(ALPHA_LAST);
+			bool depthWriteEnable                     : 1;
+			bool quadLayoutDepthBuffer                : 1;
+
+			// Stencil state; the CCW fields mirror the regular ones unless twoSidedStencil is set.
+			bool stencilActive                        : 1;
+			StencilCompareMode stencilCompareMode     : BITS(STENCIL_LAST);
+			StencilOperation stencilFailOperation     : BITS(OPERATION_LAST);
+			StencilOperation stencilPassOperation     : BITS(OPERATION_LAST);
+			StencilOperation stencilZFailOperation    : BITS(OPERATION_LAST);
+			bool noStencilMask                        : 1;
+			bool noStencilWriteMask                   : 1;
+			bool stencilWriteMasked                   : 1;
+			bool twoSidedStencil                      : 1;
+			StencilCompareMode stencilCompareModeCCW  : BITS(STENCIL_LAST);
+			StencilOperation stencilFailOperationCCW  : BITS(OPERATION_LAST);
+			StencilOperation stencilPassOperationCCW  : BITS(OPERATION_LAST);
+			StencilOperation stencilZFailOperationCCW : BITS(OPERATION_LAST);
+			bool noStencilMaskCCW                     : 1;
+			bool noStencilWriteMaskCCW                : 1;
+			bool stencilWriteMaskedCCW                : 1;
+
+			bool depthTestActive                      : 1;
+			bool fogActive                            : 1;
+			FogMode pixelFogMode                      : BITS(FOG_LAST);
+			bool specularAdd                          : 1;
+			bool occlusionEnabled                     : 1;
+			bool wBasedFog                            : 1;
+			bool perspective                          : 1;
+			bool depthClamp                           : 1;
+
+			// Blend state; only filled in when alpha blending is active.
+			bool alphaBlendActive                     : 1;
+			BlendFactor sourceBlendFactor             : BITS(BLEND_LAST);
+			BlendFactor destBlendFactor               : BITS(BLEND_LAST);
+			BlendOperation blendOperation             : BITS(BLENDOP_LAST);
+			BlendFactor sourceBlendFactorAlpha        : BITS(BLEND_LAST);
+			BlendFactor destBlendFactorAlpha          : BITS(BLEND_LAST);
+			BlendOperation blendOperationAlpha        : BITS(BLENDOP_LAST);
+
+			unsigned int colorWriteMask                       : RENDERTARGETS * 4;   // Four component bit masks
+			Format targetFormat[RENDERTARGETS];
+			bool writeSRGB                                    : 1;
+			unsigned int multiSample                          : 3;
+			unsigned int multiSampleMask                      : 4;
+			TransparencyAntialiasing transparencyAntialiasing : BITS(TRANSPARENCY_LAST);
+			bool centroid                                     : 1;
+			bool frontFaceCCW                                 : 1;
+
+			LogicalOperation logicalOperation : BITS(LOGICALOP_LAST);
+
+			Sampler::State sampler[TEXTURE_IMAGE_UNITS];
+			TextureStage::State textureStage[8];
+
+			// Per-input interpolation description.
+			struct Interpolant
+			{
+				unsigned char component : 4;   // One bit per active component
+				unsigned char flat : 4;        // One bit per flat-interpolated component
+				unsigned char project : 2;
+				bool centroid : 1;
+			};
+
+			// Two views of the same storage: named color/texture/fog interpolants
+			// (used for shader models below 3.0) and a flat array indexed by
+			// fragment input.
+			union
+			{
+				struct
+				{
+					Interpolant color[2];
+					Interpolant texture[8];
+					Interpolant fog;
+				};
+
+				Interpolant interpolant[MAX_FRAGMENT_INPUTS];
+			};
+		};
+
+		// States plus a cached hash of its contents, with a few convenience queries.
+		struct State : States
+		{
+			State();
+
+			bool operator==(const State &state) const;
+
+			// Returns the four-bit write mask of render target 'index'.
+			int colorWriteActive(int index) const
+			{
+				const int shift = 4 * index;
+
+				return (colorWriteMask >> shift) & 0xF;
+			}
+
+			// False only when the compare mode is ALPHA_ALWAYS and transparency
+			// antialiasing is TRANSPARENCY_NONE.
+			bool alphaTestActive() const
+			{
+				return !((alphaCompareMode == ALPHA_ALWAYS) && (transparencyAntialiasing == TRANSPARENCY_NONE));
+			}
+
+			// True for any pixel fog mode other than FOG_NONE.
+			bool pixelFogActive() const
+			{
+				return !(pixelFogMode == FOG_NONE);
+			}
+
+			unsigned int hash;
+		};
+
+		// Stencil constants with the low byte of each value copied into all eight
+		// bytes of a 64-bit word.
+		struct Stencil
+		{
+			int64_t testMaskQ;
+			int64_t referenceMaskedQ;
+			int64_t referenceMaskedSignedQ;
+			int64_t writeMaskQ;
+			int64_t invWriteMaskQ;
+			int64_t referenceQ;
+
+			// Recomputes every derived constant from the three stencil parameters.
+			// Only the low eight bits of each argument are significant.
+			void set(int reference, int testMask, int writeMask)
+			{
+				const int maskedReference = reference & testMask;
+
+				testMaskQ = replicate(testMask);
+				writeMaskQ = replicate(writeMask);
+				invWriteMaskQ = ~writeMaskQ;
+
+				referenceQ = replicate(reference);
+				referenceMaskedQ = replicate(maskedReference);
+
+				// Masked reference offset by 0x80 (wrapping within a byte).
+				referenceMaskedSignedQ = replicate((maskedReference + 0x80) & 0xFF);
+			}
+
+			// Copies the low byte of 'b' into each of the eight bytes of the result.
+			static int64_t replicate(int b)
+			{
+				const uint64_t lowByte = static_cast<uint64_t>(b & 0xFF);
+
+				// Multiplying by 0x0101...01 places the byte at every byte position.
+				return static_cast<int64_t>(lowByte * 0x0101010101010101ull);
+			}
+		};
+
+		// Fog constants derived by the fog setters of PixelProcessor.
+		// NOTE(review): presumably read by generated pixel routines - confirm before
+		// changing the field order.
+		struct Fog
+		{
+			float4 scale;       // -1 / (end - start), see setFogRanges()
+			float4 offset;      // end * -scale, see setFogRanges()
+			word4 color4[3];    // R, G, B scaled by 65535, see setFogColor()
+			float4 colorF[3];   // R, G, B as float, see setFogColor()
+			float4 densityE;    // -density * 1.442695, see setFogDensity()
+			float4 density2E;   // -density * density * 1.442695, see setFogDensity()
+		};
+
+		// Constant factors, each broadcast across the lanes of its vector. The outer
+		// index of the [4] arrays is the channel in R, G, B, A order.
+		// NOTE(review): presumably read by generated pixel routines - confirm before
+		// changing the field order.
+		struct Factor
+		{
+			word4 textureFactor4[4];        // Channel scaled by 4095, see setTextureFactor()
+
+			word4 alphaReference4;          // alphaReference * 0x1000 / 0xFF, see setAlphaReference()
+
+			word4 blendConstant4W[4];       // Channel scaled by 65535, see setBlendConstant()
+			float4 blendConstant4F[4];      // Channel as float
+			word4 invBlendConstant4W[4];    // (1 - channel) scaled by 65535
+			float4 invBlendConstant4F[4];   // (1 - channel) as float
+		};
+
+	public:
+		typedef void (*RoutinePointer)(const Primitive *primitive, int count, int thread, DrawData *draw);
+
+		PixelProcessor(Context *context);
+
+		virtual ~PixelProcessor();
+
+		void setFloatConstant(unsigned int index, const float value[4]);
+		void setIntegerConstant(unsigned int index, const int value[4]);
+		void setBooleanConstant(unsigned int index, int boolean);
+
+		void setUniformBuffer(int index, sw::Resource* buffer, int offset);
+		void lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[]);
+
+		void setRenderTarget(int index, Surface *renderTarget, unsigned int layer = 0);
+		void setDepthBuffer(Surface *depthBuffer, unsigned int layer = 0);
+		void setStencilBuffer(Surface *stencilBuffer, unsigned int layer = 0);
+
+		void setTexCoordIndex(unsigned int stage, int texCoordIndex);
+		void setStageOperation(unsigned int stage, TextureStage::StageOperation stageOperation);
+		void setFirstArgument(unsigned int stage, TextureStage::SourceArgument firstArgument);
+		void setSecondArgument(unsigned int stage, TextureStage::SourceArgument secondArgument);
+		void setThirdArgument(unsigned int stage, TextureStage::SourceArgument thirdArgument);
+		void setStageOperationAlpha(unsigned int stage, TextureStage::StageOperation stageOperationAlpha);
+		void setFirstArgumentAlpha(unsigned int stage, TextureStage::SourceArgument firstArgumentAlpha);
+		void setSecondArgumentAlpha(unsigned int stage, TextureStage::SourceArgument secondArgumentAlpha);
+		void setThirdArgumentAlpha(unsigned int stage, TextureStage::SourceArgument thirdArgumentAlpha);
+		void setFirstModifier(unsigned int stage, TextureStage::ArgumentModifier firstModifier);
+		void setSecondModifier(unsigned int stage, TextureStage::ArgumentModifier secondModifier);
+		void setThirdModifier(unsigned int stage, TextureStage::ArgumentModifier thirdModifier);
+		void setFirstModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier firstModifierAlpha);
+		void setSecondModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier secondModifierAlpha);
+		void setThirdModifierAlpha(unsigned int stage, TextureStage::ArgumentModifier thirdModifierAlpha);
+		void setDestinationArgument(unsigned int stage, TextureStage::DestinationArgument destinationArgument);
+		void setConstantColor(unsigned int stage, const Color<float> &constantColor);
+		void setBumpmapMatrix(unsigned int stage, int element, float value);
+		void setLuminanceScale(unsigned int stage, float value);
+		void setLuminanceOffset(unsigned int stage, float value);
+
+		void setTextureFilter(unsigned int sampler, FilterType textureFilter);
+		void setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter);
+		void setGatherEnable(unsigned int sampler, bool enable);
+		void setAddressingModeU(unsigned int sampler, AddressingMode addressingMode);
+		void setAddressingModeV(unsigned int sampler, AddressingMode addressingMode);
+		void setAddressingModeW(unsigned int sampler, AddressingMode addressingMode);
+		void setReadSRGB(unsigned int sampler, bool sRGB);
+		void setMipmapLOD(unsigned int sampler, float bias);
+		void setBorderColor(unsigned int sampler, const Color<float> &borderColor);
+		void setMaxAnisotropy(unsigned int sampler, float maxAnisotropy);
+		void setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering);
+		void setSwizzleR(unsigned int sampler, SwizzleType swizzleR);
+		void setSwizzleG(unsigned int sampler, SwizzleType swizzleG);
+		void setSwizzleB(unsigned int sampler, SwizzleType swizzleB);
+		void setSwizzleA(unsigned int sampler, SwizzleType swizzleA);
+		void setCompareFunc(unsigned int sampler, CompareFunc compare);
+		void setBaseLevel(unsigned int sampler, int baseLevel);
+		void setMaxLevel(unsigned int sampler, int maxLevel);
+		void setMinLod(unsigned int sampler, float minLod);
+		void setMaxLod(unsigned int sampler, float maxLod);
+		void setSyncRequired(unsigned int sampler, bool isSincRequired);
+
+		void setWriteSRGB(bool sRGB);
+		void setDepthBufferEnable(bool depthBufferEnable);
+		void setDepthCompare(DepthCompareMode depthCompareMode);
+		void setAlphaCompare(AlphaCompareMode alphaCompareMode);
+		void setDepthWriteEnable(bool depthWriteEnable);
+		void setAlphaTestEnable(bool alphaTestEnable);
+		void setCullMode(CullMode cullMode, bool frontFacingCCW);
+		void setColorWriteMask(int index, int rgbaMask);
+
+		void setColorLogicOpEnabled(bool colorLogicOpEnabled);
+		void setLogicalOperation(LogicalOperation logicalOperation);
+
+		void setStencilEnable(bool stencilEnable);
+		void setStencilCompare(StencilCompareMode stencilCompareMode);
+		void setStencilReference(int stencilReference);
+		void setStencilMask(int stencilMask);
+		void setStencilFailOperation(StencilOperation stencilFailOperation);
+		void setStencilPassOperation(StencilOperation stencilPassOperation);
+		void setStencilZFailOperation(StencilOperation stencilZFailOperation);
+		void setStencilWriteMask(int stencilWriteMask);
+		void setTwoSidedStencil(bool enable);
+		void setStencilCompareCCW(StencilCompareMode stencilCompareMode);
+		void setStencilReferenceCCW(int stencilReference);
+		void setStencilMaskCCW(int stencilMask);
+		void setStencilFailOperationCCW(StencilOperation stencilFailOperation);
+		void setStencilPassOperationCCW(StencilOperation stencilPassOperation);
+		void setStencilZFailOperationCCW(StencilOperation stencilZFailOperation);
+		void setStencilWriteMaskCCW(int stencilWriteMask);
+
+		void setTextureFactor(const Color<float> &textureFactor);
+		void setBlendConstant(const Color<float> &blendConstant);
+
+		void setFillMode(FillMode fillMode);
+		void setShadingMode(ShadingMode shadingMode);
+
+		void setAlphaBlendEnable(bool alphaBlendEnable);
+		void setSourceBlendFactor(BlendFactor sourceBlendFactor);
+		void setDestBlendFactor(BlendFactor destBlendFactor);
+		void setBlendOperation(BlendOperation blendOperation);
+
+		void setSeparateAlphaBlendEnable(bool separateAlphaBlendEnable);
+		void setSourceBlendFactorAlpha(BlendFactor sourceBlendFactorAlpha);
+		void setDestBlendFactorAlpha(BlendFactor destBlendFactorAlpha);
+		void setBlendOperationAlpha(BlendOperation blendOperationAlpha);
+
+		void setAlphaReference(float alphaReference);
+
+		void setGlobalMipmapBias(float bias);
+
+		void setFogStart(float start);
+		void setFogEnd(float end);
+		void setFogColor(Color<float> fogColor);
+		void setFogDensity(float fogDensity);
+		void setPixelFogMode(FogMode fogMode);
+
+		void setPerspectiveCorrection(bool perspectiveCorrection);
+
+		void setOcclusionEnabled(bool enable);
+
+	protected:
+		const State update() const;
+		Routine *routine(const State &state);
+		void setRoutineCacheSize(int routineCacheSize);
+
+		// Shader constants
+		word4 cW[8][4];
+		float4 c[FRAGMENT_UNIFORM_VECTORS];
+		int4 i[16];
+		bool b[16];
+
+		// Other semi-constants
+		Stencil stencil;
+		Stencil stencilCCW;
+		Fog fog;
+		Factor factor;
+
+	private:
+		struct UniformBufferInfo   // One entry of uniformBufferInfo[]: a Resource pointer paired with an int offset
+		{
+			UniformBufferInfo();   // Declared only; the definition is not in this header section
+
+			Resource* buffer;
+			int offset;   // NOTE(review): presumably a byte offset into buffer - confirm in setUniformBuffer()
+		};
+		UniformBufferInfo uniformBufferInfo[MAX_UNIFORM_BUFFER_BINDINGS];
+
+		void setFogRanges(float start, float end);
+
+		Context *const context;
+
+		RoutineCache<State> *routineCache;
+	};
+}
+
+#endif   // sw_PixelProcessor_hpp
diff --git a/src/Device/Plane.cpp b/src/Device/Plane.cpp
new file mode 100644
index 0000000..095b7f2
--- /dev/null
+++ b/src/Device/Plane.cpp
@@ -0,0 +1,60 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Plane.hpp"
+
+#include "Matrix.hpp"
+
+namespace sw
+{
+	Plane::Plane()   // Default constructor: A, B, C and D are left uninitialized
+	{
+	}
+
+	// Builds the plane from its four coefficients.
+	Plane::Plane(float p_A, float p_B, float p_C, float p_D)
+		: A(p_A),
+		  B(p_B),
+		  C(p_C),
+		  D(p_D)
+	{}
+
+	// Builds the plane from an array holding A, B, C, D in that order.
+	Plane::Plane(const float ABCD[4])
+		: A(ABCD[0]),
+		  B(ABCD[1]),
+		  C(ABCD[2]),
+		  D(ABCD[3])
+	{}
+
+	Plane operator*(const Plane &p, const Matrix &T)   // Post-multiply: coefficient k is (A, B, C, D) dotted with row k of !T
+	{
+		Matrix N = !T;   // NOTE(review): Matrix::operator! is presumably the inverse - confirm in Matrix.hpp
+		const float a = p.A * N(1, 1) + p.B * N(1, 2) + p.C * N(1, 3) + p.D * N(1, 4);
+		const float b = p.A * N(2, 1) + p.B * N(2, 2) + p.C * N(2, 3) + p.D * N(2, 4);
+		const float c = p.A * N(3, 1) + p.B * N(3, 2) + p.C * N(3, 3) + p.D * N(3, 4);
+		const float d = p.A * N(4, 1) + p.B * N(4, 2) + p.C * N(4, 3) + p.D * N(4, 4);
+		return Plane(a, b, c, d);
+	}
+
+	Plane operator*(const Matrix &T, const Plane &p)   // Pre-multiply: coefficient k is (A, B, C, D) dotted with column k of !T
+	{
+		Matrix N = !T;   // NOTE(review): Matrix::operator! is presumably the inverse - confirm in Matrix.hpp
+		const float a = N(1, 1) * p.A + N(2, 1) * p.B + N(3, 1) * p.C + N(4, 1) * p.D;
+		const float b = N(1, 2) * p.A + N(2, 2) * p.B + N(3, 2) * p.C + N(4, 2) * p.D;
+		const float c = N(1, 3) * p.A + N(2, 3) * p.B + N(3, 3) * p.C + N(4, 3) * p.D;
+		const float d = N(1, 4) * p.A + N(2, 4) * p.B + N(3, 4) * p.C + N(4, 4) * p.D;
+		return Plane(a, b, c, d);
+	}
+}
diff --git a/src/Device/Plane.hpp b/src/Device/Plane.hpp
new file mode 100644
index 0000000..962b9ae
--- /dev/null
+++ b/src/Device/Plane.hpp
@@ -0,0 +1,40 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Plane_hpp
+#define Plane_hpp
+
+#include "Vector.hpp"
+
+namespace sw
+{
+	struct Matrix;
+
+	struct Plane   // Plane described by four float coefficients A, B, C, D
+	{
+		float A;
+		float B;
+		float C;
+		float D;
+
+		Plane();   // Leaves the coefficients uninitialized
+		Plane(float A, float B, float C, float D);   // Plane equation coefficients
+		Plane(const float ABCD[4]);   // Coefficients read from ABCD[0] to ABCD[3]
+
+		friend Plane operator*(const Plane &p, const Matrix &A);   // Transform plane by matrix (post-multiply)
+		friend Plane operator*(const Matrix &A, const Plane &p);   // Transform plane by matrix (pre-multiply)
+	};
+}
+
+#endif   // Plane_hpp
diff --git a/src/Device/Point.cpp b/src/Device/Point.cpp
new file mode 100644
index 0000000..e7e33dd
--- /dev/null
+++ b/src/Device/Point.cpp
@@ -0,0 +1,92 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Point.hpp"
+
+#include "Matrix.hpp"
+
+namespace sw
+{
+	// Adds v to this point component-wise and returns *this.
+	Point &Point::operator+=(const Vector &v)
+	{
+		x = x + v.x;
+		y = y + v.y;
+		z = z + v.z;
+		return *this;
+	}
+
+	// Subtracts v from this point component-wise and returns *this.
+	Point &Point::operator-=(const Vector &v)
+	{
+		x = x - v.x;
+		y = y - v.y;
+		z = z - v.z;
+		return *this;
+	}
+
+	Point operator+(const Point &P, const Vector &v)   // Returns a new point: P with v added component-wise
+	{
+		return Point(P.x + v.x, P.y + v.y, P.z + v.z);
+	}
+
+	Point operator-(const Point &P, const Vector &v)   // Returns a new point: P with v subtracted component-wise
+	{
+		return Point(P.x - v.x, P.y - v.y, P.z - v.z);
+	}
+
+	Vector operator-(const Point &P, const Point &Q)   // Returns the vector P - Q (component-wise difference)
+	{
+		return Vector(P.x - Q.x, P.y - Q.y, P.z - Q.z);
+	}
+
+	Point operator*(const Matrix &M, const Point &P)   // Rows 1-3 of M applied to (x, y, z), plus the fourth-column terms M(k, 4)
+	{
+		return Point(M(1, 1) * P.x + M(1, 2) * P.y + M(1, 3) * P.z + M(1, 4),
+		             M(2, 1) * P.x + M(2, 2) * P.y + M(2, 3) * P.z + M(2, 4),
+		             M(3, 1) * P.x + M(3, 2) * P.y + M(3, 3) * P.z + M(3, 4));
+	}
+
+	Point operator*(const Point &P, const Matrix &M)   // (x, y, z) times columns 1-3 of M, using rows 1-3 only
+	{
+		return Point(P.x * M(1, 1) + P.y * M(2, 1) + P.z * M(3, 1),   // NOTE(review): unlike Matrix * Point, no fourth-row term M(4, k) is added - confirm this is intended
+		             P.x * M(1, 2) + P.y * M(2, 2) + P.z * M(3, 2),
+		             P.x * M(1, 3) + P.y * M(2, 3) + P.z * M(3, 3));
+	}
+
+	Point &operator*=(Point &P, const Matrix &M)   // Replaces P with P * M (the Point * Matrix overload) and returns P
+	{
+		return P = P * M;
+	}
+
+	float Point::d(const Point &P) const   // Distance from this point to P: Vector::N of the difference vector
+	{
+		return Vector::N(*this - P);
+	}
+
+	float Point::d2(const Point &P) const   // Squared distance from this point to P: Vector::N2 of the difference vector
+	{
+		return Vector::N2(*this - P);
+	}
+
+	float Point::d(const Point &P, const Point &Q)   // Static form: distance between P and Q
+	{
+		return Vector::N(P - Q);
+	}
+
+	float Point::d2(const Point &P, const Point &Q)   // Static form: squared distance between P and Q
+	{
+		return Vector::N2(P - Q);
+	}
+}
diff --git a/src/Device/Point.hpp b/src/Device/Point.hpp
new file mode 100644
index 0000000..85198c5
--- /dev/null
+++ b/src/Device/Point.hpp
@@ -0,0 +1,139 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Point_hpp
+#define Point_hpp
+
+namespace sw
+{
+	struct Vector;
+	struct Matrix;
+
+	struct Point   // 3D point with float coordinates, addressable as x/y/z or as p[0..2]
+	{
+		Point();   // Leaves the coordinates uninitialized
+		Point(const int i);   // Sets all three coordinates to (float)i
+		Point(const Point &P);
+		Point(const Vector &v);   // Copies v.x, v.y, v.z
+		Point(float Px, float Py, float Pz);
+
+		Point &operator=(const Point &P);
+
+		union
+		{
+			float p[3];
+
+			struct
+			{	// x, y, z overlay p[] through the enclosing anonymous union
+				float x;
+				float y;
+				float z;
+			};
+		};
+
+		float &operator[](int i);   // Returns p[i]; no range check
+		float &operator()(int i);   // Same as operator[]
+
+		const float &operator[](int i) const;
+		const float &operator()(int i) const;
+
+		Point &operator+=(const Vector &v);
+		Point &operator-=(const Vector &v);
+
+		friend Point operator+(const Point &P, const Vector &v);
+		friend Point operator-(const Point &P, const Vector &v);
+
+		friend Vector operator-(const Point &P, const Point &Q);
+
+		friend Point operator*(const Matrix &M, const Point& P);
+		friend Point operator*(const Point &P, const Matrix &M);
+		friend Point &operator*=(Point &P, const Matrix &M);
+
+		float d(const Point &P) const;   // Distance between two points
+		float d2(const Point &P) const;   // Squared distance between two points
+
+		static float d(const Point &P, const Point &Q);   // Distance between two points
+		static float d2(const Point &P, const Point &Q);   // Squared distance between two points
+	};
+}
+
+#include "Vector.hpp"
+
+namespace sw
+{
+	inline Point::Point()   // Default constructor: x, y and z are left uninitialized
+	{
+	}
+
+	// Sets all three coordinates to the integer i converted to float.
+	inline Point::Point(const int i)
+	{
+		const float value = static_cast<float>(i);
+		x = value;
+		y = value;
+		z = value;
+	}
+
+	inline Point::Point(const Point &P)   // Copy constructor: copies x, y and z from P
+	{
+		x = P.x;
+		y = P.y;
+		z = P.z;
+	}
+
+	inline Point::Point(const Vector &v)   // Converting constructor (not explicit): copies v.x, v.y, v.z
+	{
+		x = v.x;
+		y = v.y;
+		z = v.z;
+	}
+
+	inline Point::Point(float P_x, float P_y, float P_z)   // Builds the point from its three coordinates
+	{
+		x = P_x;
+		y = P_y;
+		z = P_z;
+	}
+
+	inline Point &Point::operator=(const Point &P)   // Copies x, y and z from P
+	{
+		x = P.x;
+		y = P.y;
+		z = P.z;
+
+		return *this;   // Allows chained assignment
+	}
+
+	inline float &Point::operator()(int i)   // Mutable access to p[i]; i is used as-is (0 selects x) with no range check
+	{
+		return p[i];
+	}
+
+	inline float &Point::operator[](int i)   // Mutable access to p[i]; no range check
+	{
+		return p[i];
+	}
+
+	inline const float &Point::operator()(int i) const   // Read-only access to p[i]; i is used as-is with no range check
+	{
+		return p[i];
+	}
+
+	inline const float &Point::operator[](int i) const   // Read-only access to p[i]; no range check
+	{
+		return p[i];
+	}
+}
+
+#endif   // Point_hpp
diff --git a/src/Device/Polygon.hpp b/src/Device/Polygon.hpp
new file mode 100644
index 0000000..8ee8562
--- /dev/null
+++ b/src/Device/Polygon.hpp
@@ -0,0 +1,56 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Polygon_hpp
+#define sw_Polygon_hpp
+
+#include "Vertex.hpp"
+
+namespace sw
+{
+	// Vertex list of a polygon that can be rewritten level by level.
+	// P holds up to 16 levels of up to 16 vertex pointers each; B is storage
+	// for up to 16 newly created vertices.
+	struct Polygon
+	{
+		// Triangle made of the three given vertices, stored at level 0.
+		Polygon(const float4 *P0, const float4 *P1, const float4 *P2)
+			: n(3), i(0), b(0)
+		{
+			P[0][0] = P0;
+			P[0][1] = P1;
+			P[0][2] = P2;
+		}
+
+		// Polygon of n vertices pointing into the array P, stored at level 0.
+		// NOTE(review): n is not range-checked against the 16 slots per level.
+		Polygon(const float4 *P, int n)
+			: n(n), i(0), b(0)
+		{
+			for(int vertex = 0; vertex < n; vertex++)
+			{
+				this->P[0][vertex] = &P[vertex];
+			}
+		}
+
+		float4 B[16];              // Buffer for clipped vertices
+		const float4 *P[16][16];   // Pointers to clipped polygon's vertices
+
+		int n;   // Number of vertices
+		int i;   // Level of P to use
+		int b;   // Next available new vertex
+	};
+}
+
+#endif   // sw_Polygon_hpp
diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
new file mode 100644
index 0000000..52daa18
--- /dev/null
+++ b/src/Device/Primitive.hpp
@@ -0,0 +1,80 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Primitive_hpp
+#define sw_Primitive_hpp
+
+#include "Vertex.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+	struct Triangle   // Three Vertex records stored by value
+	{
+		Vertex v0;
+		Vertex v1;
+		Vertex v2;
+	};
+
+	struct PlaneEquation   // z = A * x + B * y + C
+	{
+		float4 A;   // Coefficient of x
+		float4 B;   // Coefficient of y
+		float4 C;   // Constant term
+	};
+
+	struct Primitive   // Per-primitive data read by the rasterizer routine through OFFSET(Primitive, ...)
+	{
+		int yMin;   // The rasterizer covers rows from yMin while y < yMax
+		int yMax;
+
+		float4 xQuad;   // Added to the broadcast x coordinate before interpolation
+		float4 yQuad;   // Added to the broadcast y coordinate before interpolation
+
+		PlaneEquation z;
+		PlaneEquation w;
+
+		union   // The named struct and V[][] are two views of the same storage
+		{
+			struct
+			{
+				PlaneEquation C[2][4];   // NOTE(review): presumably colors - confirm against the setup code
+				PlaneEquation T[8][4];   // NOTE(review): presumably texture coordinates - confirm
+				PlaneEquation f;         // Read by the rasterizer when state.fog.component is set
+			};
+
+			PlaneEquation V[MAX_FRAGMENT_INPUTS][4];   // Indexed as V[interpolant][component]
+		};
+
+		float area;
+
+		// Masks for two-sided stencil
+		int64_t clockwiseMask;
+		int64_t invClockwiseMask;
+
+		struct Span   // Horizontal extent of one row; the rasterizer keeps x with left <= x < right
+		{
+			unsigned short left;
+			unsigned short right;
+		};
+
+		// The rasterizer adds a zero length span to the top and bottom of the polygon to allow
+		// for 2x2 pixel processing. We need an even number of spans to keep accesses aligned.
+		Span outlineUnderflow[2];
+		Span outline[OUTLINE_RESOLUTION];   // One Span per row, indexed by y
+		Span outlineOverflow[2];
+	};
+}
+
+#endif   // sw_Primitive_hpp
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
new file mode 100644
index 0000000..6b319b4
--- /dev/null
+++ b/src/Device/QuadRasterizer.cpp
@@ -0,0 +1,350 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "QuadRasterizer.hpp"
+
+#include "Primitive.hpp"
+#include "Renderer.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Math.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	extern bool veryEarlyDepthTest;          // Read in rasterize(): enables the leading-quad depth rejection loop
+	extern bool complementaryDepthBuffer;    // Read in rasterize(): selects CmpLE instead of CmpNLT in that loop
+	extern bool fullPixelPositionRegister;   // Read in interpolateZ() and interpolateW()
+
+	extern int clusterCount;   // NOTE(review): generate() and rasterize() declare locals of the same name that hide this
+
+	QuadRasterizer::QuadRasterizer(const PixelProcessor::State &state, const PixelShader *pixelShader) : state(state), shader(pixelShader)   // Binds a reference to state (no copy) and stores the shader pointer
+	{
+	}
+
+	QuadRasterizer::~QuadRasterizer()   // Empty: nothing is released here
+	{
+	}
+
+	void QuadRasterizer::generate()   // Emits the routine body: loops over 'count' primitives and rasterizes this cluster's rows of each
+	{
+		#if PERF_PROFILE
+			for(int i = 0; i < PERF_TIMERS; i++)
+			{
+				cycles[i] = 0;
+			}
+
+			Long pixelTime = Ticks();
+		#endif
+
+		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+		occlusion = 0;
+		int clusterCount = Renderer::getClusterCount();   // Local; hides the namespace-scope 'extern int clusterCount'
+
+		Do
+		{
+			Int yMin = *Pointer<Int>(primitive + OFFSET(Primitive,yMin));
+			Int yMax = *Pointer<Int>(primitive + OFFSET(Primitive,yMax));
+
+			Int cluster2 = cluster + cluster;
+			yMin += clusterCount * 2 - 2 - cluster2;   // These three lines move yMin to the first two-row band at
+			yMin &= -clusterCount * 2;   // y = 2 * cluster (modulo 2 * clusterCount) that contains or follows it;
+			yMin += cluster2;   // the mask only clears whole low bits when clusterCount is a power of two
+
+			If(yMin < yMax)
+			{
+				rasterize(yMin, yMax);
+			}
+
+			primitive += sizeof(Primitive) * state.multiSample;   // Each input primitive occupies multiSample consecutive Primitive records
+			count--;
+		}
+		Until(count == 0)
+
+		if(state.occlusionEnabled)
+		{
+			UInt clusterOcclusion = *Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster);   // 4-byte counter slot per cluster
+			clusterOcclusion += occlusion;
+			*Pointer<UInt>(data + OFFSET(DrawData,occlusion) + 4 * cluster) = clusterOcclusion;
+		}
+
+		#if PERF_PROFILE
+			cycles[PERF_PIXEL] = Ticks() - pixelTime;
+
+			for(int i = 0; i < PERF_TIMERS; i++)
+			{
+				*Pointer<Long>(data + OFFSET(DrawData,cycles[i]) + 8 * cluster) += cycles[i];
+			}
+		#endif
+
+		Return();
+	}
+
+	void QuadRasterizer::rasterize(Int &yMin, Int &yMax)   // Emits the loop over two-row bands from yMin until y >= yMax, stepping 2 * clusterCount rows
+	{
+		Pointer<Byte> cBuffer[RENDERTARGETS];
+		Pointer<Byte> zBuffer;
+		Pointer<Byte> sBuffer;
+
+		for(int index = 0; index < RENDERTARGETS; index++)
+		{
+			if(state.colorWriteActive(index))
+			{
+				cBuffer[index] = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,colorBuffer[index])) + yMin * *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));   // Start of row yMin
+			}
+		}
+
+		if(state.depthTestActive)
+		{
+			zBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,depthBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+		}
+
+		if(state.stencilActive)
+		{
+			sBuffer = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,stencilBuffer)) + yMin * *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB));
+		}
+
+		Int y = yMin;
+
+		Do
+		{
+			Int x0a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));   // Left edge of row y; NOTE(review): Span fields are unsigned short but are loaded as Short
+			Int x0b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));   // Left edge of row y + 1
+			Int x0 = Min(x0a, x0b);
+
+			for(unsigned int q = 1; q < state.multiSample; q++)   // Widen to the leftmost edge over all sample records
+			{
+				x0a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 0) * sizeof(Primitive::Span)));
+				x0b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left) + (y + 1) * sizeof(Primitive::Span)));
+				x0 = Min(x0, Min(x0a, x0b));
+			}
+
+			x0 &= 0xFFFFFFFE;   // Clear bit 0 so the band starts on an even x
+
+			Int x1a = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+			Int x1b = Int(*Pointer<Short>(primitive + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+			Int x1 = Max(x1a, x1b);
+
+			for(unsigned int q = 1; q < state.multiSample; q++)   // Widen to the rightmost edge over all sample records
+			{
+				x1a = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 0) * sizeof(Primitive::Span)));
+				x1b = Int(*Pointer<Short>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right) + (y + 1) * sizeof(Primitive::Span)));
+				x1 = Max(x1, Max(x1a, x1b));
+			}
+
+			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
+
+			if(interpolateZ())
+			{
+				for(unsigned int q = 0; q < state.multiSample; q++)   // NOTE(review): Dz has 4 entries, so this assumes multiSample <= 4
+				{
+					Float4 y = yyyy;   // Hides the outer Int y inside this loop body
+
+					if(state.multiSample > 1)
+					{
+						y -= *Pointer<Float4>(constants + OFFSET(Constants,Y) + q * sizeof(float4));
+					}
+
+					Dz[q] = *Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) + y * *Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16);   // Row-constant part of z: C + y * B
+				}
+			}
+
+			if(veryEarlyDepthTest && state.multiSample == 1 && !state.depthOverride)   // Advances x0 past leading quads whose four depth compares all fail
+			{
+				if(!state.stencilActive && state.depthTestActive && (state.depthCompareMode == DEPTH_LESSEQUAL || state.depthCompareMode == DEPTH_LESS))   // FIXME: Both modes ok?
+				{
+					Float4 xxxx = Float4(Float(x0)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
+
+					Pointer<Byte> buffer;
+					Int pitch;   // Assigned and read only on the non-quad-layout path
+
+					if(!state.quadLayoutDepthBuffer)
+					{
+						buffer = zBuffer + 4 * x0;
+						pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+					}
+					else
+					{
+						buffer = zBuffer + 8 * x0;
+					}
+
+					For(Int x = x0, x < x1, x += 2)
+					{
+						Float4 z = interpolate(xxxx, Dz[0], z, primitive + OFFSET(Primitive,z), false, false, state.depthClamp);   // z is passed as rhw before it is initialized; interpolate() only reads rhw when perspective is true, and it is false here
+
+						Float4 zValue;
+
+						if(!state.quadLayoutDepthBuffer)
+						{
+							// FIXME: Properly optimizes?
+							zValue.xy = *Pointer<Float4>(buffer);
+							zValue.zw = *Pointer<Float4>(buffer + pitch - 8);   // The - 8 places the next row's first two 4-byte values in lanes z and w
+						}
+						else
+						{
+							zValue = *Pointer<Float4>(buffer, 16);
+						}
+
+						Int4 zTest;
+
+						if(complementaryDepthBuffer)
+						{
+							zTest = CmpLE(zValue, z);
+						}
+						else
+						{
+							zTest = CmpNLT(zValue, z);
+						}
+
+						Int zMask = SignMask(zTest);
+
+						If(zMask == 0)   // No lane passed: skip this quad
+						{
+							x0 += 2;
+						}
+						Else
+						{
+							x = x1;   // Some lane passed: force the loop to end
+						}
+
+						xxxx += Float4(2);
+
+						if(!state.quadLayoutDepthBuffer)
+						{
+							buffer += 8;
+						}
+						else
+						{
+							buffer += 16;
+						}
+					}
+				}
+			}
+
+			If(x0 < x1)
+			{
+				if(interpolateW())
+				{
+					Dw = *Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) + yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16);
+				}
+
+				for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+				{
+					for(int component = 0; component < 4; component++)
+					{
+						if(state.interpolant[interpolant].component & (1 << component))
+						{
+							Dv[interpolant][component] = *Pointer<Float4>(primitive + OFFSET(Primitive,V[interpolant][component].C), 16);
+
+							if(!(state.interpolant[interpolant].flat & (1 << component)))   // Flat components keep only the constant term
+							{
+								Dv[interpolant][component] += yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,V[interpolant][component].B), 16);
+							}
+						}
+					}
+				}
+
+				if(state.fog.component)
+				{
+					Df = *Pointer<Float4>(primitive + OFFSET(Primitive,f.C), 16);
+
+					if(!state.fog.flat)
+					{
+						Df += yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive,f.B), 16);
+					}
+				}
+
+				Short4 xLeft[4];   // NOTE(review): sized 4 but indexed up to multiSample - 1; assumes multiSample <= 4
+				Short4 xRight[4];
+
+				for(unsigned int q = 0; q < state.multiSample; q++)
+				{
+					xLeft[q] = *Pointer<Short4>(primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline) + y * sizeof(Primitive::Span));   // One load covers the spans of rows y and y + 1
+					xRight[q] = xLeft[q];
+
+					xLeft[q] = Swizzle(xLeft[q], 0xA0) - Short4(1, 2, 1, 2);   // NOTE(review): 0xA0 presumably selects the two 'left' values, each duplicated - confirm the Swizzle encoding
+					xRight[q] = Swizzle(xRight[q], 0xF5) - Short4(0, 1, 0, 1);   // NOTE(review): 0xF5 presumably selects the two 'right' values, each duplicated - confirm
+				}
+
+				For(Int x = x0, x < x1, x += 2)
+				{
+					Short4 xxxx = Short4(x);
+					Int cMask[4];
+
+					for(unsigned int q = 0; q < state.multiSample; q++)
+					{
+						Short4 mask = CmpGT(xxxx, xLeft[q]) & CmpGT(xRight[q], xxxx);
+						cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;   // Keep the low four bits of the mask for this sample
+					}
+
+					quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
+				}
+			}
+
+			int clusterCount = Renderer::getClusterCount();   // Plain int: read while the routine is being generated
+
+			for(int index = 0; index < RENDERTARGETS; index++)
+			{
+				if(state.colorWriteActive(index))
+				{
+					cBuffer[index] += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index])) << (1 + sw::log2(clusterCount));   // FIXME: Precompute
+				}
+			}
+
+			if(state.depthTestActive)
+			{
+				zBuffer += *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)) << (1 + sw::log2(clusterCount));   // FIXME: Precompute
+			}
+
+			if(state.stencilActive)
+			{
+				sBuffer += *Pointer<Int>(data + OFFSET(DrawData,stencilPitchB)) << (1 + sw::log2(clusterCount));   // FIXME: Precompute
+			}
+
+			y += 2 * clusterCount;   // Matches the row stride applied to the buffer pointers above when clusterCount is a power of two
+		}
+		Until(y >= yMax)
+	}
+
+	// Evaluates one interpolant: D, plus x * A from the plane equation unless flat,
+	// times rhw when perspective is also requested, clamped to [0, 1] when clamp is set.
+	Float4 QuadRasterizer::interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp)
+	{
+		Float4 value = D;
+
+		if(!flat)
+		{
+			value += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16);
+		}
+		if(!flat && perspective)
+		{
+			value *= rhw;
+		}
+		if(clamp)
+		{
+			value = Min(Max(value, Float4(0.0f)), Float4(1.0f));
+		}
+
+		return value;
+	}
+
+	bool QuadRasterizer::interpolateZ() const   // True when depth testing or pixel fog is active, or the shader declares vPos while fullPixelPositionRegister is set
+	{
+		return state.depthTestActive || state.pixelFogActive() || (shader && shader->isVPosDeclared() && fullPixelPositionRegister);
+	}
+
+	bool QuadRasterizer::interpolateW() const   // True when state.perspective is set, or the shader declares vPos while fullPixelPositionRegister is set
+	{
+		return state.perspective || (shader && shader->isVPosDeclared() && fullPixelPositionRegister);
+	}
+}
diff --git a/src/Device/QuadRasterizer.hpp b/src/Device/QuadRasterizer.hpp
new file mode 100644
index 0000000..1d7681d
--- /dev/null
+++ b/src/Device/QuadRasterizer.hpp
@@ -0,0 +1,61 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_QuadRasterizer_hpp
+#define sw_QuadRasterizer_hpp
+
+#include "Rasterizer.hpp"
+#include "Shader/ShaderCore.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	class QuadRasterizer : public Rasterizer   // Rasterizer that walks primitives in two-row bands and hands 2-pixel-wide steps to quad()
+	{
+	public:
+		QuadRasterizer(const PixelProcessor::State &state, const PixelShader *shader);   // Keeps a reference to state; the caller must keep it alive
+		virtual ~QuadRasterizer();
+
+		void generate();   // Emits the whole routine body, ending with Return()
+
+	protected:
+		Pointer<Byte> constants;   // Loaded from DrawData::constants in generate()
+
+		Float4 Dz[4];   // Row-constant part of z, one entry per sample
+		Float4 Dw;   // Row-constant part of w
+		Float4 Dv[MAX_FRAGMENT_INPUTS][4];   // Row-constant part of each interpolant component
+		Float4 Df;   // Row-constant part of the fog interpolant
+
+		UInt occlusion;   // Reset in generate() and added to the cluster's DrawData::occlusion slot when occlusion is enabled
+
+#if PERF_PROFILE
+		Long cycles[PERF_TIMERS];
+#endif
+
+		virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y) = 0;   // Called once per two-pixel step; NOTE(review): the caller's cBuffer array has RENDERTARGETS entries, the [4] here is not enforced
+
+		bool interpolateZ() const;
+		bool interpolateW() const;
+		Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective, bool clamp);
+
+		const PixelProcessor::State &state;
+		const PixelShader *const shader;   // May be null; interpolateZ()/interpolateW() check before dereferencing
+
+	private:
+		void rasterize(Int &yMin, Int &yMax);   // Emits the band loop for one primitive
+	};
+}
+
+#endif   // sw_QuadRasterizer_hpp
diff --git a/src/Device/Rasterizer.hpp b/src/Device/Rasterizer.hpp
new file mode 100644
index 0000000..3811a25
--- /dev/null
+++ b/src/Device/Rasterizer.hpp
@@ -0,0 +1,38 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Rasterizer_hpp
+#define sw_Rasterizer_hpp
+
+#include "Context.hpp"
+#include "PixelProcessor.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+	class Rasterizer : public Function<Void(Pointer<Byte>, Int, Int, Pointer<Byte>)>   // Base for generated routines taking (primitive, count, cluster, data)
+	{
+	public:
+		Rasterizer() : primitive(Arg<0>()), count(Arg<1>()), cluster(Arg<2>()), data(Arg<3>()) {}   // Binds each member to the routine argument at the same position
+		virtual ~Rasterizer() {};
+
+	protected:
+		Pointer<Byte> primitive;   // Argument 0
+		Int count;                 // Argument 1
+		Int cluster;               // Argument 2
+		Pointer<Byte> data;        // Argument 3
+	};
+}
+
+#endif   // sw_Rasterizer_hpp
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
new file mode 100644
index 0000000..e7ec20a
--- /dev/null
+++ b/src/Device/Renderer.cpp
@@ -0,0 +1,2854 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Renderer.hpp"
+
+#include "Clipper.hpp"
+#include "Surface.hpp"
+#include "Primitive.hpp"
+#include "Polygon.hpp"
+#include "Main/FrameBuffer.hpp"
+#include "Main/SwiftConfig.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/MutexLock.hpp"
+#include "Common/CPUID.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Resource.hpp"
+#include "Common/Half.hpp"
+#include "Common/Math.hpp"
+#include "Common/Timer.hpp"
+#include "Common/Debug.hpp"
+
+#undef max
+
+bool disableServer = true;   // Passed to the SwiftConfig constructor by Renderer::Renderer()
+// Debug-only bounds: Renderer::draw() returns early when count lies outside [minPrimitives, maxPrimitives].
+#ifndef NDEBUG
+unsigned int minPrimitives = 1;
+unsigned int maxPrimitives = 1 << 21;
+#endif
+
+namespace sw
+{
+	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
+	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
+	extern bool booleanFaceRegister;
+	extern bool fullPixelPositionRegister;
+	extern bool leadingVertexFirst;         // Flat shading uses first vertex, else last
+	extern bool secondaryColor;             // Specular lighting is applied after texturing
+	extern bool colorsDefaultToZero;
+	// The seven conventions above are copied from the Conventions argument by setGlobalRenderingSettings().
+	extern bool forceWindowed;
+	extern bool complementaryDepthBuffer;   // When set, Renderer::draw() negates the depth range and uses 1 - near
+	extern bool postBlendSRGB;
+	extern bool exactColorRounding;         // Also assigned by setGlobalRenderingSettings()
+	extern TransparencyAntialiasing transparencyAntialiasing;
+	extern bool forceClearRegisters;
+
+	extern bool precacheVertex;
+	extern bool precacheSetup;
+	extern bool precachePixel;
+	// Definitions owned by this file follow.
+	static const int batchSize = 128;      // Renderer::draw() divides this by the multisample count to get primitives per batch
+	AtomicInt threadCount(1);              // Compared against threadsAwake by the scheduler; starts at 1
+	AtomicInt Renderer::unitCount(1);      // Number of primitive units scanned by findAvailableTasks()
+	AtomicInt Renderer::clusterCount(1);   // Number of pixel clusters scanned by findAvailableTasks()
+
+	TranscendentalPrecision logPrecision = ACCURATE;   // threadFunction() enables flush-to-zero/denormals-are-zero when this is below IEEE
+	TranscendentalPrecision expPrecision = ACCURATE;
+	TranscendentalPrecision rcpPrecision = ACCURATE;
+	TranscendentalPrecision rsqPrecision = ACCURATE;
+	bool perspectiveCorrection = true;
+
+	static void setGlobalRenderingSettings(Conventions conventions, bool exactColorRounding)
+	{
+		// One-shot latch: only the first call copies its arguments into the sw:: globals.
+		static bool initialized = false;
+
+		if(initialized) return;   // Later calls leave the globals untouched
+
+		sw::exactColorRounding = exactColorRounding;
+		sw::colorsDefaultToZero = conventions.colorsDefaultToZero;
+		sw::secondaryColor = conventions.secondaryColor;
+		sw::leadingVertexFirst = conventions.leadingVertexFirst;
+		sw::fullPixelPositionRegister = conventions.fullPixelPositionRegister;
+		sw::booleanFaceRegister = conventions.booleanFaceRegister;
+		sw::symmetricNormalizedDepth = conventions.symmetricNormalizedDepth;
+		sw::halfIntegerCoordinates = conventions.halfIntegerCoordinates;
+		initialized = true;
+	}
+
+	struct Parameters   // Argument bundle that threadFunction() receives through its void* parameter
+	{
+		Renderer *renderer;   // Renderer whose threadLoop() is run
+		int threadIndex;      // Index handed to threadLoop()
+	};
+
+	DrawCall::DrawCall()
+	{
+		// Per-draw data lives in storage obtained from allocate() and points back at this call's constants.
+		data = (DrawData*)allocate(sizeof(DrawData));
+		data->constants = &constants;
+
+		references = -1;     // -1 is the value Renderer::draw() looks for when picking a free DrawCall
+		queries = nullptr;   // Created on demand by Renderer::draw()
+
+		// Non-zero dirty counts make Renderer::draw() copy that many shader constants into the draw data.
+		psDirtyConstB = 16;
+		psDirtyConstI = 16;
+		psDirtyConstF = FRAGMENT_UNIFORM_VECTORS;
+		vsDirtyConstB = 16;
+		vsDirtyConstI = 16;
+		vsDirtyConstF = VERTEX_UNIFORM_VECTORS + 1;
+	}
+
+	DrawCall::~DrawCall()
+	{
+		deallocate(data);   // Storage obtained from allocate() in the constructor
+
+		delete queries;     // Null unless a draw with attached queries has not been finished
+	}
+
+	Renderer::Renderer(Context *context, Conventions conventions, bool exactColorRounding) : VertexProcessor(context), PixelProcessor(context), SetupProcessor(context), context(context), viewport()
+	{
+		// Latch the global conventions first; the clipper below is built from symmetricNormalizedDepth.
+		setGlobalRenderingSettings(conventions, exactColorRounding);
+
+		setRenderTarget(0, 0);
+		clipper = new Clipper(symmetricNormalizedDepth);
+		blitter = new Blitter;
+
+		// Mark every matrix and the clip planes as needing an update.
+		updateClipPlanes = true;
+		updateProjectionMatrix = true;
+		updateBaseMatrix = true;
+		updateViewMatrix = true;
+
+		#if PERF_HUD
+			resetTimers();
+		#endif
+
+		// All 16 per-slot pointers start out null.
+		for(int slot = 0; slot < 16; slot++)
+		{
+			worker[slot] = nullptr;
+			resume[slot] = nullptr;
+			suspend[slot] = nullptr;
+			vertexTask[slot] = nullptr;
+			triangleBatch[slot] = nullptr;
+			primitiveBatch[slot] = nullptr;
+		}
+
+		// Scheduler state: no thread awake, empty task queue, no draw submitted.
+		threadsAwake = 0;
+		resumeApp = new Event();
+
+		qSize = 0;
+		qHead = 0;
+
+		nextDraw = 0;
+		currentDraw = 0;
+
+		// Every draw list slot initially refers to its own DrawCall object.
+		for(int draw = 0; draw < DRAW_COUNT; draw++)
+		{
+			drawCall[draw] = new DrawCall();
+			drawList[draw] = drawCall[draw];
+		}
+
+		for(int unit = 0; unit < 16; unit++)
+		{
+			primitiveProgress[unit].init();
+		}
+
+		for(int cluster = 0; cluster < 16; cluster++)
+		{
+			pixelProgress[cluster].init();
+		}
+
+		clipFlags = 0;
+
+		swiftConfig = new SwiftConfig(disableServer);
+		updateConfiguration(true);   // Runs after all of the state above has been initialized
+
+		sync = new Resource(0);   // Locked per draw by draw() and waited on by synchronize()
+	}
+
+	Renderer::~Renderer()   // Releases the objects created by the constructor
+	{
+		sync->destruct();   // NOTE(review): released through Resource::destruct() rather than delete - presumably so outstanding locks are honored; confirm
+
+		delete clipper;
+		clipper = nullptr;
+
+		delete blitter;
+		blitter = nullptr;
+
+		terminateThreads();
+		delete resumeApp;   // Deleted only after terminateThreads(); finishRendering() signals this event from task execution
+
+		for(int draw = 0; draw < DRAW_COUNT; draw++)
+		{
+			delete drawCall[draw];
+		}
+
+		delete swiftConfig;
+	}
+
+	void* Renderer::operator new(size_t size)   // Renderer storage has to be memory aligned, so it comes from sw::allocate()
+	{
+		ASSERT(size == sizeof(Renderer)); // A derived class would request a different size and must not use this operator
+		void *storage = sw::allocate(sizeof(Renderer), 16);   // NOTE(review): 16 is presumably the alignment in bytes - confirm
+		return storage;
+	}
+
+	void Renderer::operator delete(void * mem)   // Counterpart of operator new, which obtains the storage from sw::allocate()
+	{
+		sw::deallocate(mem);
+	}
+
+	void Renderer::draw(DrawType drawType, unsigned int indexOffset, unsigned int count, bool update)   // Captures the current state into a free DrawCall and queues it, once per supersample pass
+	{
+		#ifndef NDEBUG
+			if(count < minPrimitives || count > maxPrimitives)
+			{
+				return;   // Debug builds only: skip draws whose primitive count is outside the configured range
+			}
+		#endif
+		// count is the number of primitives; indexOffset is a byte offset applied to the locked index buffer.
+		context->drawType = drawType;
+		// update forces the processor states and routines to be refreshed even when the sample mask is unchanged.
+		updateConfiguration();
+		updateClipper();
+
+		int ss = context->getSuperSampleCount();
+		int ms = context->getMultiSampleCount();
+		bool requiresSync = false;   // Set when any sampler or target used by this draw reports requiresSync()
+
+		for(int q = 0; q < ss; q++)
+		{
+			unsigned int oldMultiSampleMask = context->multiSampleMask;
+			context->multiSampleMask = (context->sampleMask >> (ms * q)) & ((unsigned)0xFFFFFFFF >> (32 - ms));   // The ms bits of sampleMask belonging to pass q
+
+			if(!context->multiSampleMask)
+			{
+				continue;   // No sample enabled in this pass
+			}
+
+			sync->lock(sw::PRIVATE);   // Paired with the sync->unlock() in finishRendering() once the queued draw has completed
+
+			if(update || oldMultiSampleMask != context->multiSampleMask)
+			{
+				vertexState = VertexProcessor::update(drawType);
+				setupState = SetupProcessor::update();
+				pixelState = PixelProcessor::update();
+
+				vertexRoutine = VertexProcessor::routine(vertexState);
+				setupRoutine = SetupProcessor::routine(setupState);
+				pixelRoutine = PixelProcessor::routine(pixelState);
+			}
+
+			int batch = batchSize / ms;   // Primitives handed to a primitive unit at a time
+
+			int (Renderer::*setupPrimitives)(int batch, int count);
+			// Select the setup function matching the primitive type and fill mode.
+			if(context->isDrawTriangle())
+			{
+				switch(context->fillMode)
+				{
+				case FILL_SOLID:
+					setupPrimitives = &Renderer::setupSolidTriangles;
+					break;
+				case FILL_WIREFRAME:
+					setupPrimitives = &Renderer::setupWireframeTriangle;
+					batch = 1;
+					break;
+				case FILL_VERTEX:
+					setupPrimitives = &Renderer::setupVertexTriangle;
+					batch = 1;
+					break;
+				default:
+					ASSERT(false);
+					sync->unlock();   // This pass is never queued, so finishRendering() would not release the lock taken above
+					return;
+				}
+			}
+			else if(context->isDrawLine())
+			{
+				setupPrimitives = &Renderer::setupLines;
+			}
+			else   // Point draw
+			{
+				setupPrimitives = &Renderer::setupPoints;
+			}
+
+			DrawCall *draw = nullptr;
+			// Find a DrawCall that no pending draw is using (references == -1), waiting on resumeApp if there is none.
+			do
+			{
+				for(int i = 0; i < DRAW_COUNT; i++)
+				{
+					if(drawCall[i]->references == -1)
+					{
+						draw = drawCall[i];
+						drawList[nextDraw & DRAW_COUNT_BITS] = draw;
+						break;
+					}
+				}
+
+				if(!draw)
+				{
+					resumeApp->wait();   // Signaled by finishRendering() when a draw call is released
+				}
+			}
+			while(!draw);
+
+			DrawData *data = draw->data;
+
+			if(queries.size() != 0)
+			{
+				draw->queries = new std::list<Query*>();   // Deleted by finishRendering()
+				bool includePrimitivesWrittenQueries = vertexState.transformFeedbackQueryEnabled && vertexState.transformFeedbackEnabled;
+				for(auto &query : queries)
+				{
+					if(includePrimitivesWrittenQueries || (query->type != Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN))
+					{
+						++query->reference; // Atomic
+						draw->queries->push_back(query);
+					}
+				}
+			}
+
+			draw->drawType = drawType;
+			draw->batchSize = batch;
+			// Routines stay bound until finishRendering() unbinds them.
+			vertexRoutine->bind();
+			setupRoutine->bind();
+			pixelRoutine->bind();
+
+			draw->vertexRoutine = vertexRoutine;
+			draw->setupRoutine = setupRoutine;
+			draw->pixelRoutine = pixelRoutine;
+			draw->vertexPointer = (VertexProcessor::RoutinePointer)vertexRoutine->getEntry();
+			draw->setupPointer = (SetupProcessor::RoutinePointer)setupRoutine->getEntry();
+			draw->pixelPointer = (PixelProcessor::RoutinePointer)pixelRoutine->getEntry();
+			draw->setupPrimitives = setupPrimitives;
+			draw->setupState = setupState;
+			// Vertex streams: record and lock every bound input resource.
+			for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+			{
+				draw->vertexStream[i] = context->input[i].resource;
+				data->input[i] = context->input[i].buffer;
+				data->stride[i] = context->input[i].stride;
+
+				if(draw->vertexStream[i])
+				{
+					draw->vertexStream[i]->lock(PUBLIC, PRIVATE);
+				}
+			}
+
+			if(context->indexBuffer)
+			{
+				data->indices = (unsigned char*)context->indexBuffer->lock(PUBLIC, PRIVATE) + indexOffset;
+			}
+
+			draw->indexBuffer = context->indexBuffer;
+
+			for(int sampler = 0; sampler < TOTAL_IMAGE_UNITS; sampler++)
+			{
+				draw->texture[sampler] = 0;
+			}
+			// Pixel stage textures.
+			for(int sampler = 0; sampler < TEXTURE_IMAGE_UNITS; sampler++)
+			{
+				if(pixelState.sampler[sampler].textureType != TEXTURE_NULL)
+				{
+					draw->texture[sampler] = context->texture[sampler];
+					draw->texture[sampler]->lock(PUBLIC, isReadWriteTexture(sampler) ? MANAGED : PRIVATE);   // If the texture is both read and written, use the same read/write lock as render targets
+
+					data->mipmap[sampler] = context->sampler[sampler].getTextureData();
+
+					requiresSync |= context->sampler[sampler].requiresSync();
+				}
+			}
+
+			if(context->pixelShader)
+			{
+				if(draw->psDirtyConstF)   // Dirty counts give the number of leading constants to copy
+				{
+					memcpy(&data->ps.cW, PixelProcessor::cW, sizeof(word4) * 4 * (draw->psDirtyConstF < 8 ? draw->psDirtyConstF : 8));
+					memcpy(&data->ps.c, PixelProcessor::c, sizeof(float4) * draw->psDirtyConstF);
+					draw->psDirtyConstF = 0;
+				}
+
+				if(draw->psDirtyConstI)
+				{
+					memcpy(&data->ps.i, PixelProcessor::i, sizeof(int4) * draw->psDirtyConstI);
+					draw->psDirtyConstI = 0;
+				}
+
+				if(draw->psDirtyConstB)
+				{
+					memcpy(&data->ps.b, PixelProcessor::b, sizeof(bool) * draw->psDirtyConstB);
+					draw->psDirtyConstB = 0;
+				}
+
+				PixelProcessor::lockUniformBuffers(data->ps.u, draw->pUniformBuffers);
+			}
+			else
+			{
+				for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
+				{
+					draw->pUniformBuffers[i] = nullptr;   // Nothing for finishRendering() to unlock
+				}
+			}
+
+			if(context->pixelShaderModel() <= 0x0104)
+			{
+				for(int stage = 0; stage < 8; stage++)
+				{
+					if(pixelState.textureStage[stage].stageOperation != TextureStage::STAGE_DISABLE || context->pixelShader)
+					{
+						data->textureStage[stage] = context->textureStage[stage].uniforms;
+					}
+					else break;
+				}
+			}
+
+			if(context->vertexShader)
+			{
+				if(context->vertexShader->getShaderModel() >= 0x0300)
+				{
+					for(int sampler = 0; sampler < VERTEX_TEXTURE_IMAGE_UNITS; sampler++)
+					{
+						if(vertexState.sampler[sampler].textureType != TEXTURE_NULL)
+						{
+							draw->texture[TEXTURE_IMAGE_UNITS + sampler] = context->texture[TEXTURE_IMAGE_UNITS + sampler];
+							draw->texture[TEXTURE_IMAGE_UNITS + sampler]->lock(PUBLIC, PRIVATE);
+
+							data->mipmap[TEXTURE_IMAGE_UNITS + sampler] = context->sampler[TEXTURE_IMAGE_UNITS + sampler].getTextureData();
+
+							requiresSync |= context->sampler[TEXTURE_IMAGE_UNITS + sampler].requiresSync();
+						}
+					}
+				}
+
+				if(draw->vsDirtyConstF)
+				{
+					memcpy(&data->vs.c, VertexProcessor::c, sizeof(float4) * draw->vsDirtyConstF);
+					draw->vsDirtyConstF = 0;
+				}
+
+				if(draw->vsDirtyConstI)
+				{
+					memcpy(&data->vs.i, VertexProcessor::i, sizeof(int4) * draw->vsDirtyConstI);
+					draw->vsDirtyConstI = 0;
+				}
+
+				if(draw->vsDirtyConstB)
+				{
+					memcpy(&data->vs.b, VertexProcessor::b, sizeof(bool) * draw->vsDirtyConstB);
+					draw->vsDirtyConstB = 0;
+				}
+
+				if(context->vertexShader->isInstanceIdDeclared())
+				{
+					data->instanceID = context->instanceID;
+				}
+
+				VertexProcessor::lockUniformBuffers(data->vs.u, draw->vUniformBuffers);
+				VertexProcessor::lockTransformFeedbackBuffers(data->vs.t, data->vs.reg, data->vs.row, data->vs.col, data->vs.str, draw->transformFeedbackBuffers);
+			}
+			else
+			{
+				data->ff = ff;
+				// Without a vertex shader, all vertex constants are marked dirty again for this DrawCall.
+				draw->vsDirtyConstF = VERTEX_UNIFORM_VECTORS + 1;
+				draw->vsDirtyConstI = 16;
+				draw->vsDirtyConstB = 16;
+
+				for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
+				{
+					draw->vUniformBuffers[i] = nullptr;
+				}
+
+				for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
+				{
+					draw->transformFeedbackBuffers[i] = nullptr;
+				}
+			}
+
+			if(pixelState.stencilActive)
+			{
+				data->stencil[0] = stencil;
+				data->stencil[1] = stencilCCW;
+			}
+
+			if(pixelState.fogActive)
+			{
+				data->fog = fog;
+			}
+
+			if(setupState.isDrawPoint)
+			{
+				data->point = point;
+			}
+
+			data->lineWidth = context->lineWidth;
+
+			data->factor = factor;
+
+			if(pixelState.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
+			{
+				float ref = context->alphaReference * (1.0f / 255.0f);
+				float margin = sw::min(ref, 1.0f - ref);
+				// Per-sample thresholds spread symmetrically around the alpha reference.
+				if(ms == 4)
+				{
+					data->a2c0 = replicate(ref - margin * 0.6f);
+					data->a2c1 = replicate(ref - margin * 0.2f);
+					data->a2c2 = replicate(ref + margin * 0.2f);
+					data->a2c3 = replicate(ref + margin * 0.6f);
+				}
+				else if(ms == 2)
+				{
+					data->a2c0 = replicate(ref - margin * 0.3f);
+					data->a2c1 = replicate(ref + margin * 0.3f);
+				}
+				else ASSERT(false);
+			}
+
+			if(pixelState.occlusionEnabled)
+			{
+				for(int cluster = 0; cluster < clusterCount; cluster++)
+				{
+					data->occlusion[cluster] = 0;   // Summed into FRAGMENTS_PASSED queries by finishRendering()
+				}
+			}
+
+			#if PERF_PROFILE
+				for(int cluster = 0; cluster < clusterCount; cluster++)
+				{
+					for(int i = 0; i < PERF_TIMERS; i++)
+					{
+						data->cycles[i][cluster] = 0;
+					}
+				}
+			#endif
+
+			// Viewport
+			{
+				float W = 0.5f * viewport.width;
+				float H = 0.5f * viewport.height;
+				float X0 = viewport.x0 + W;
+				float Y0 = viewport.y0 + H;
+				float N = viewport.minZ;
+				float F = viewport.maxZ;
+				float Z = F - N;
+
+				if(context->isDrawTriangle(false))
+				{
+					N += context->depthBias;
+				}
+
+				if(complementaryDepthBuffer)
+				{
+					Z = -Z;
+					N = 1 - N;
+				}
+
+				static const float X[5][16] =   // Fragment offsets
+				{
+					{+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 1 sample
+					{-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 2 samples
+					{-0.3000f, +0.1000f, +0.3000f, -0.1000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 4 samples
+					{+0.1875f, -0.3125f, +0.3125f, -0.4375f, -0.0625f, +0.4375f, +0.0625f, -0.1875f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 8 samples
+					{+0.2553f, -0.1155f, +0.1661f, -0.1828f, +0.2293f, -0.4132f, -0.1773f, -0.0577f, +0.3891f, -0.4656f, +0.4103f, +0.4248f, -0.2109f, +0.3966f, -0.2664f, -0.3872f}    // 16 samples
+				};
+
+				static const float Y[5][16] =   // Fragment offsets
+				{
+					{+0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 1 sample
+					{-0.2500f, +0.2500f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 2 samples
+					{-0.1000f, -0.3000f, +0.1000f, +0.3000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 4 samples
+					{-0.4375f, -0.3125f, -0.1875f, -0.0625f, +0.0625f, +0.1875f, +0.3125f, +0.4375f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f, +0.0000f},   // 8 samples
+					{-0.4503f, +0.1883f, +0.3684f, -0.4668f, -0.0690f, -0.1315f, +0.4999f, +0.0728f, +0.1070f, -0.3086f, +0.3725f, -0.1547f, -0.1102f, -0.3588f, +0.1789f, +0.0269f}    // 16 samples
+				};
+
+				int s = sw::log2(ss);   // Row of the offset tables for this supersample count
+
+				data->Wx16 = replicate(W * 16);
+				data->Hx16 = replicate(H * 16);
+				data->X0x16 = replicate(X0 * 16 - 8);
+				data->Y0x16 = replicate(Y0 * 16 - 8);
+				data->XXXX = replicate(X[s][q] / W);
+				data->YYYY = replicate(Y[s][q] / H);
+				data->halfPixelX = replicate(0.5f / W);
+				data->halfPixelY = replicate(0.5f / H);
+				data->viewportHeight = abs(viewport.height);
+				data->slopeDepthBias = context->slopeDepthBias;
+				data->depthRange = Z;
+				data->depthNear = N;
+				draw->clipFlags = clipFlags;
+
+				if(clipFlags)
+				{
+					if(clipFlags & Clipper::CLIP_PLANE0) data->clipPlane[0] = clipPlane[0];
+					if(clipFlags & Clipper::CLIP_PLANE1) data->clipPlane[1] = clipPlane[1];
+					if(clipFlags & Clipper::CLIP_PLANE2) data->clipPlane[2] = clipPlane[2];
+					if(clipFlags & Clipper::CLIP_PLANE3) data->clipPlane[3] = clipPlane[3];
+					if(clipFlags & Clipper::CLIP_PLANE4) data->clipPlane[4] = clipPlane[4];
+					if(clipFlags & Clipper::CLIP_PLANE5) data->clipPlane[5] = clipPlane[5];
+				}
+			}
+
+			// Target
+			{
+				for(int index = 0; index < RENDERTARGETS; index++)
+				{
+					draw->renderTarget[index] = context->renderTarget[index];
+
+					if(draw->renderTarget[index])
+					{
+						unsigned int layer = context->renderTargetLayer[index];
+						requiresSync |= context->renderTarget[index]->requiresSync();
+						data->colorBuffer[index] = (unsigned int*)context->renderTarget[index]->lockInternal(0, 0, layer, LOCK_READWRITE, MANAGED);
+						data->colorBuffer[index] += q * ms * context->renderTarget[index]->getSliceB(true);   // NOTE(review): if colorBuffer is an unsigned int*, a byte count is being added in 4-byte units when q > 0 - confirm
+						data->colorPitchB[index] = context->renderTarget[index]->getInternalPitchB();
+						data->colorSliceB[index] = context->renderTarget[index]->getInternalSliceB();
+					}
+				}
+
+				draw->depthBuffer = context->depthBuffer;
+				draw->stencilBuffer = context->stencilBuffer;
+
+				if(draw->depthBuffer)
+				{
+					unsigned int layer = context->depthBufferLayer;
+					requiresSync |= context->depthBuffer->requiresSync();
+					data->depthBuffer = (float*)context->depthBuffer->lockInternal(0, 0, layer, LOCK_READWRITE, MANAGED);
+					data->depthBuffer += q * ms * context->depthBuffer->getSliceB(true);   // NOTE(review): same pointer-scaling question as for colorBuffer - confirm
+					data->depthPitchB = context->depthBuffer->getInternalPitchB();
+					data->depthSliceB = context->depthBuffer->getInternalSliceB();
+				}
+
+				if(draw->stencilBuffer)
+				{
+					unsigned int layer = context->stencilBufferLayer;
+					requiresSync |= context->stencilBuffer->requiresSync();
+					data->stencilBuffer = (unsigned char*)context->stencilBuffer->lockStencil(0, 0, layer, MANAGED);
+					data->stencilBuffer += q * ms * context->stencilBuffer->getSliceB(true);
+					data->stencilPitchB = context->stencilBuffer->getStencilPitchB();
+					data->stencilSliceB = context->stencilBuffer->getStencilSliceB();
+				}
+			}
+
+			// Scissor
+			{
+				data->scissorX0 = scissor.x0;
+				data->scissorX1 = scissor.x1;
+				data->scissorY0 = scissor.y0;
+				data->scissorY1 = scissor.y1;
+			}
+
+			draw->primitive = 0;
+			draw->count = count;
+
+			draw->references = (count + batch - 1) / batch;   // Number of batches; finishRendering() counts this down
+
+			schedulerMutex.lock();
+			++nextDraw; // Atomic
+			schedulerMutex.unlock();
+
+			#ifndef NDEBUG
+			if(threadCount == 1)   // Use main thread for draw execution
+			{
+				threadsAwake = 1;
+				task[0].type = Task::RESUME;
+
+				taskLoop(0);
+			}
+			else
+			#endif
+			{
+				if(!threadsAwake)
+				{
+					suspend[0]->wait();   // Wait for thread 0 to have parked in threadLoop()
+
+					threadsAwake = 1;
+					task[0].type = Task::RESUME;
+
+					resume[0]->signal();
+				}
+			}
+		}
+
+		// TODO(sugoi): This is a temporary brute-force workaround to ensure IOSurface synchronization.
+		if(requiresSync)
+		{
+			synchronize();
+		}
+	}
+
+	void Renderer::clear(void *value, Format format, Surface *dest, const Rect &clearRect, unsigned int rgbaMask)   // Forwards the clear request unchanged to the blitter
+	{
+		blitter->clear(value, format, dest, clearRect, rgbaMask);
+	}
+
+	void Renderer::blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil, bool sRGBconversion)   // Forwards to the blitter, packing the three flags into one braced argument
+	{
+		blitter->blit(source, sRect, dest, dRect, {filter, isStencil, sRGBconversion});
+	}
+
+	void Renderer::blit3D(Surface *source, Surface *dest)   // Forwards the 3D blit unchanged to the blitter
+	{
+		blitter->blit3D(source, dest);
+	}
+
+	void Renderer::threadFunction(void *parameters)   // Thread entry point; parameters points at a Parameters struct
+	{
+		Parameters *startup = static_cast<Parameters*>(parameters);
+		Renderer *renderer = startup->renderer;   // Both fields are read before anything else happens
+		int threadIndex = startup->threadIndex;
+		if(logPrecision < IEEE)   // Below IEEE log precision, enable flush-to-zero and denormals-are-zero
+		{
+			CPUID::setFlushToZero(true);
+			CPUID::setDenormalsAreZero(true);
+		}
+
+		renderer->threadLoop(threadIndex);
+	}
+
+	void Renderer::threadLoop(int threadIndex)   // Worker loop: run tasks, then park until resumed, until exitThreads is set
+	{
+		while(!exitThreads)
+		{
+			taskLoop(threadIndex);   // Returns once this thread's task type is SUSPEND
+
+			suspend[threadIndex]->signal();   // Tell draw()/scheduleTask(), which wait on this event, that the thread is parked
+			resume[threadIndex]->wait();      // Block until another thread signals resume for this index
+		}
+	}
+
+	void Renderer::taskLoop(int threadIndex)   // Alternates scheduling and execution until the thread's task becomes SUSPEND
+	{
+		while(task[threadIndex].type != Task::SUSPEND)
+		{
+			scheduleTask(threadIndex);   // Chooses task[threadIndex], possibly SUSPEND
+			executeTask(threadIndex);    // RESUME and SUSPEND tasks do nothing there
+		}
+	}
+
+	void Renderer::findAvailableTasks()   // Appends runnable pixel and primitive tasks to taskQueue; scheduleTask() calls this while holding schedulerMutex
+	{
+		// Find pixel tasks
+		for(int cluster = 0; cluster < clusterCount; cluster++)
+		{
+			if(!pixelProgress[cluster].executing)
+			{
+				for(int unit = 0; unit < unitCount; unit++)
+				{
+					if(primitiveProgress[unit].references > 0)   // Contains processed primitives
+					{
+						if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall)
+						{
+							if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive)   // Previous primitives have been rendered
+							{
+								Task &task = taskQueue[qHead];
+								task.type = Task::PIXELS;
+								task.primitiveUnit = unit;
+								task.pixelCluster = cluster;
+
+								pixelProgress[cluster].executing = true;   // Cleared again at the end of finishRendering()
+
+								// Commit to the task queue
+								qHead = (qHead + 1) & TASK_COUNT_BITS;
+								qSize++;
+
+								break;   // At most one pixel task is queued per cluster
+							}
+						}
+					}
+				}
+			}
+		}
+
+		// Find primitive tasks
+		if(currentDraw == nextDraw)
+		{
+			return;   // No more primitives to process
+		}
+
+		for(int unit = 0; unit < unitCount; unit++)
+		{
+			DrawCall *draw = drawList[currentDraw & DRAW_COUNT_BITS];
+
+			int primitive = draw->primitive;
+			int count = draw->count;
+
+			if(primitive >= count)   // Every batch of the current draw has been handed out; move to the next draw
+			{
+				++currentDraw; // Atomic
+
+				if(currentDraw == nextDraw)
+				{
+					return;   // No more primitives to process
+				}
+
+				draw = drawList[currentDraw & DRAW_COUNT_BITS];
+			}
+
+			if(!primitiveProgress[unit].references)   // Task not already being executed and not still in use by a pixel unit
+			{
+				primitive = draw->primitive;
+				count = draw->count;
+				int batch = draw->batchSize;
+
+				primitiveProgress[unit].drawCall = currentDraw;
+				primitiveProgress[unit].firstPrimitive = primitive;
+				primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive;   // The last batch may be shorter
+
+				draw->primitive += batch;
+
+				Task &task = taskQueue[qHead];
+				task.type = Task::PRIMITIVES;
+				task.primitiveUnit = unit;
+
+				primitiveProgress[unit].references = -1;   // Marks the unit busy until executeTask() sets it to clusterCount
+
+				// Commit to the task queue
+				qHead = (qHead + 1) & TASK_COUNT_BITS;
+				qSize++;
+			}
+		}
+	}
+
+	void Renderer::scheduleTask(int threadIndex)   // Assigns the next queued task to threadIndex (or SUSPEND if none) and wakes parked threads when work remains
+	{
+		schedulerMutex.lock();
+
+		int curThreadsAwake = threadsAwake;
+
+		if((int)qSize < threadCount - curThreadsAwake + 1)   // Refill the queue when it holds fewer tasks than this thread plus the sleeping ones could take
+		{
+			findAvailableTasks();
+		}
+
+		if(qSize != 0)
+		{
+			task[threadIndex] = taskQueue[(qHead - qSize) & TASK_COUNT_BITS];   // Oldest queued entry
+			qSize--;
+
+			if(curThreadsAwake != threadCount)
+			{
+				int wakeup = qSize - curThreadsAwake + 1;
+
+				for(int i = 0; i < threadCount && wakeup > 0; i++)
+				{
+					if(task[i].type == Task::SUSPEND)
+					{
+						suspend[i]->wait();   // Wait until thread i has actually parked in threadLoop()
+						task[i].type = Task::RESUME;
+						resume[i]->signal();
+
+						++threadsAwake; // Atomic
+						wakeup--;
+					}
+				}
+			}
+		}
+		else
+		{
+			task[threadIndex].type = Task::SUSPEND;   // Makes taskLoop() return for this thread
+
+			--threadsAwake; // Atomic
+		}
+
+		schedulerMutex.unlock();
+	}
+
+	void Renderer::executeTask(int threadIndex)   // Runs the task stored in task[threadIndex]
+	{
+		#if PERF_HUD
+			int64_t startTick = Timer::ticks();
+		#endif
+
+		switch(task[threadIndex].type)
+		{
+		case Task::PRIMITIVES:   // Vertex processing followed by primitive setup for one batch
+			{
+				int unit = task[threadIndex].primitiveUnit;
+
+				int input = primitiveProgress[unit].firstPrimitive;
+				int count = primitiveProgress[unit].primitiveCount;
+				DrawCall *draw = drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+				int (Renderer::*setupPrimitives)(int batch, int count) = draw->setupPrimitives;
+
+				processPrimitiveVertices(unit, input, count, draw->count, threadIndex);
+
+				#if PERF_HUD
+					int64_t time = Timer::ticks();
+					vertexTime[threadIndex] += time - startTick;
+					startTick = time;
+				#endif
+
+				int visible = 0;
+
+				if(!draw->setupState.rasterizerDiscard)
+				{
+					visible = (this->*setupPrimitives)(unit, count);
+				}
+
+				primitiveProgress[unit].visible = visible;
+				primitiveProgress[unit].references = clusterCount;   // Positive value lets findAvailableTasks() queue pixel tasks for this unit
+
+				#if PERF_HUD
+					setupTime[threadIndex] += Timer::ticks() - startTick;
+				#endif
+			}
+			break;
+		case Task::PIXELS:   // Pixel routine for one cluster over one unit's batch
+			{
+				int unit = task[threadIndex].primitiveUnit;
+				int visible = primitiveProgress[unit].visible;
+
+				if(visible > 0)
+				{
+					int cluster = task[threadIndex].pixelCluster;
+					Primitive *primitive = primitiveBatch[unit];
+					DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
+					DrawData *data = draw->data;
+					PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
+
+					pixelRoutine(primitive, visible, cluster, data);
+				}
+
+				finishRendering(task[threadIndex]);   // Called even when nothing was visible, so progress counters still advance
+
+				#if PERF_HUD
+					pixelTime[threadIndex] += Timer::ticks() - startTick;
+				#endif
+			}
+			break;
+		case Task::RESUME:
+			break;
+		case Task::SUSPEND:
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	void Renderer::synchronize()   // Acquires the sync resource publicly and releases it straight away
+	{
+		sync->lock(sw::PUBLIC);   // NOTE(review): presumably blocks while draws still hold the PRIVATE locks taken in draw() - confirm in Resource
+		sync->unlock();
+	}
+
+	void Renderer::finishRendering(Task &pixelTask)   // Bookkeeping after a pixel task; the last task of a draw releases everything draw() locked
+	{
+		int unit = pixelTask.primitiveUnit;
+		int cluster = pixelTask.pixelCluster;
+
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		DrawData &data = *draw.data;
+		int primitive = primitiveProgress[unit].firstPrimitive;
+		int count = primitiveProgress[unit].primitiveCount;
+		int processedPrimitives = primitive + count;
+
+		pixelProgress[cluster].processedPrimitives = processedPrimitives;
+
+		if(pixelProgress[cluster].processedPrimitives >= draw.count)   // This cluster has rendered the whole draw; advance it to the next one
+		{
+			++pixelProgress[cluster].drawCall; // Atomic
+			pixelProgress[cluster].processedPrimitives = 0;
+		}
+
+		int ref = primitiveProgress[unit].references--; // Atomic
+		// NOTE(review): the ref == 0 tests below need the postfix decrement to yield the decremented value - confirm against AtomicInt
+		if(ref == 0)
+		{
+			ref = draw.references--; // Atomic
+
+			if(ref == 0)
+			{
+				#if PERF_PROFILE
+					for(int cluster = 0; cluster < clusterCount; cluster++)
+					{
+						for(int i = 0; i < PERF_TIMERS; i++)
+						{
+							profiler.cycles[i] += data.cycles[i][cluster];
+						}
+					}
+				#endif
+
+				if(draw.queries)
+				{
+					for(auto &query : *(draw.queries))
+					{
+						switch(query->type)
+						{
+						case Query::FRAGMENTS_PASSED:
+							for(int cluster = 0; cluster < clusterCount; cluster++)
+							{
+								query->data += data.occlusion[cluster];
+							}
+							break;
+						case Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
+							query->data += processedPrimitives;
+							break;
+						default:
+							break;
+						}
+
+						--query->reference; // Atomic
+					}
+
+					delete draw.queries;   // Allocated by draw()
+					draw.queries = 0;
+				}
+				// Release every resource that draw() locked for this draw call.
+				for(int i = 0; i < RENDERTARGETS; i++)
+				{
+					if(draw.renderTarget[i])
+					{
+						draw.renderTarget[i]->unlockInternal();
+					}
+				}
+
+				if(draw.depthBuffer)
+				{
+					draw.depthBuffer->unlockInternal();
+				}
+
+				if(draw.stencilBuffer)
+				{
+					draw.stencilBuffer->unlockStencil();
+				}
+
+				for(int i = 0; i < TOTAL_IMAGE_UNITS; i++)
+				{
+					if(draw.texture[i])
+					{
+						draw.texture[i]->unlock();
+					}
+				}
+
+				for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+				{
+					if(draw.vertexStream[i])
+					{
+						draw.vertexStream[i]->unlock();
+					}
+				}
+
+				if(draw.indexBuffer)
+				{
+					draw.indexBuffer->unlock();
+				}
+
+				for(int i = 0; i < MAX_UNIFORM_BUFFER_BINDINGS; i++)
+				{
+					if(draw.pUniformBuffers[i])
+					{
+						draw.pUniformBuffers[i]->unlock();
+					}
+					if(draw.vUniformBuffers[i])
+					{
+						draw.vUniformBuffers[i]->unlock();
+					}
+				}
+
+				for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; i++)
+				{
+					if(draw.transformFeedbackBuffers[i])
+					{
+						draw.transformFeedbackBuffers[i]->unlock();
+					}
+				}
+
+				draw.vertexRoutine->unbind();
+				draw.setupRoutine->unbind();
+				draw.pixelRoutine->unbind();
+
+				sync->unlock();   // Matches the sync->lock(sw::PRIVATE) taken in draw()
+
+				draw.references = -1;   // Marks the DrawCall as free for draw() to reuse
+				resumeApp->signal();    // Wakes draw() if it is waiting for a free DrawCall
+			}
+		}
+
+		pixelProgress[cluster].executing = false;   // Lets findAvailableTasks() queue the next pixel task for this cluster
+	}
+
+	void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread)
+	{
+		Triangle *triangle = triangleBatch[unit];
+		int primitiveDrawCall = primitiveProgress[unit].drawCall;
+		DrawCall *draw = drawList[primitiveDrawCall & DRAW_COUNT_BITS];
+		DrawData *data = draw->data;
+		VertexTask *task = vertexTask[thread];
+
+		const void *indices = data->indices;
+		VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;
+
+		if(task->vertexCache.drawCall != primitiveDrawCall)
+		{
+			task->vertexCache.clear();
+			task->vertexCache.drawCall = primitiveDrawCall;
+		}
+
+		unsigned int batch[128][3];   // FIXME: Adjust to dynamic batch size
+
+		switch(draw->drawType)
+		{
+		case DRAW_POINTLIST:
+			{
+				unsigned int index = start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index;
+					batch[i][1] = index;
+					batch[i][2] = index;
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_LINELIST:
+			{
+				unsigned int index = 2 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index + 0;
+					batch[i][1] = index + 1;
+					batch[i][2] = index + 1;
+
+					index += 2;
+				}
+			}
+			break;
+		case DRAW_LINESTRIP:
+			{
+				unsigned int index = start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index + 0;
+					batch[i][1] = index + 1;
+					batch[i][2] = index + 1;
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_LINELOOP:
+			{
+				unsigned int index = start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = (index + 0) % loop;
+					batch[i][1] = (index + 1) % loop;
+					batch[i][2] = (index + 1) % loop;
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_TRIANGLELIST:
+			{
+				unsigned int index = 3 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index + 0;
+					batch[i][1] = index + 1;
+					batch[i][2] = index + 2;
+
+					index += 3;
+				}
+			}
+			break;
+		case DRAW_TRIANGLESTRIP:
+			{
+				unsigned int index = start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					if(leadingVertexFirst)
+					{
+						batch[i][0] = index + 0;
+						batch[i][1] = index + (index & 1) + 1;
+						batch[i][2] = index + (~index & 1) + 1;
+					}
+					else
+					{
+						batch[i][0] = index + (index & 1);
+						batch[i][1] = index + (~index & 1);
+						batch[i][2] = index + 2;
+					}
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_TRIANGLEFAN:
+			{
+				unsigned int index = start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					if(leadingVertexFirst)
+					{
+						batch[i][0] = index + 1;
+						batch[i][1] = index + 2;
+						batch[i][2] = 0;
+					}
+					else
+					{
+						batch[i][0] = 0;
+						batch[i][1] = index + 1;
+						batch[i][2] = index + 2;
+					}
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDPOINTLIST8:
+			{
+				const unsigned char *index = (const unsigned char*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = *index;
+					batch[i][1] = *index;
+					batch[i][2] = *index;
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDPOINTLIST16:
+			{
+				const unsigned short *index = (const unsigned short*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = *index;
+					batch[i][1] = *index;
+					batch[i][2] = *index;
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDPOINTLIST32:
+			{
+				const unsigned int *index = (const unsigned int*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = *index;
+					batch[i][1] = *index;
+					batch[i][2] = *index;
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINELIST8:
+			{
+				const unsigned char *index = (const unsigned char*)indices + 2 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[1];
+
+					index += 2;
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINELIST16:
+			{
+				const unsigned short *index = (const unsigned short*)indices + 2 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[1];
+
+					index += 2;
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINELIST32:
+			{
+				const unsigned int *index = (const unsigned int*)indices + 2 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[1];
+
+					index += 2;
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINESTRIP8:
+			{
+				const unsigned char *index = (const unsigned char*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[1];
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINESTRIP16:
+			{
+				const unsigned short *index = (const unsigned short*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[1];
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINESTRIP32:
+			{
+				const unsigned int *index = (const unsigned int*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[1];
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINELOOP8:
+			{
+				const unsigned char *index = (const unsigned char*)indices;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[(start + i + 0) % loop];
+					batch[i][1] = index[(start + i + 1) % loop];
+					batch[i][2] = index[(start + i + 1) % loop];
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINELOOP16:
+			{
+				const unsigned short *index = (const unsigned short*)indices;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[(start + i + 0) % loop];
+					batch[i][1] = index[(start + i + 1) % loop];
+					batch[i][2] = index[(start + i + 1) % loop];
+				}
+			}
+			break;
+		case DRAW_INDEXEDLINELOOP32:
+			{
+				const unsigned int *index = (const unsigned int*)indices;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[(start + i + 0) % loop];
+					batch[i][1] = index[(start + i + 1) % loop];
+					batch[i][2] = index[(start + i + 1) % loop];
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLELIST8:
+			{
+				const unsigned char *index = (const unsigned char*)indices + 3 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[2];
+
+					index += 3;
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLELIST16:
+			{
+				const unsigned short *index = (const unsigned short*)indices + 3 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[2];
+
+					index += 3;
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLELIST32:
+			{
+				const unsigned int *index = (const unsigned int*)indices + 3 * start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[1];
+					batch[i][2] = index[2];
+
+					index += 3;
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLESTRIP8:
+			{
+				const unsigned char *index = (const unsigned char*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[((start + i) & 1) + 1];
+					batch[i][2] = index[(~(start + i) & 1) + 1];
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLESTRIP16:
+			{
+				const unsigned short *index = (const unsigned short*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[((start + i) & 1) + 1];
+					batch[i][2] = index[(~(start + i) & 1) + 1];
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLESTRIP32:
+			{
+				const unsigned int *index = (const unsigned int*)indices + start;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[0];
+					batch[i][1] = index[((start + i) & 1) + 1];
+					batch[i][2] = index[(~(start + i) & 1) + 1];
+
+					index += 1;
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLEFAN8:
+			{
+				const unsigned char *index = (const unsigned char*)indices;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[start + i + 1];
+					batch[i][1] = index[start + i + 2];
+					batch[i][2] = index[0];
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLEFAN16:
+			{
+				const unsigned short *index = (const unsigned short*)indices;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[start + i + 1];
+					batch[i][1] = index[start + i + 2];
+					batch[i][2] = index[0];
+				}
+			}
+			break;
+		case DRAW_INDEXEDTRIANGLEFAN32:
+			{
+				const unsigned int *index = (const unsigned int*)indices;
+
+				for(unsigned int i = 0; i < triangleCount; i++)
+				{
+					batch[i][0] = index[start + i + 1];
+					batch[i][1] = index[start + i + 2];
+					batch[i][2] = index[0];
+				}
+			}
+			break;
+		case DRAW_QUADLIST:
+			{
+				unsigned int index = 4 * start / 2;
+
+				for(unsigned int i = 0; i < triangleCount; i += 2)
+				{
+					batch[i+0][0] = index + 0;
+					batch[i+0][1] = index + 1;
+					batch[i+0][2] = index + 2;
+
+					batch[i+1][0] = index + 0;
+					batch[i+1][1] = index + 2;
+					batch[i+1][2] = index + 3;
+
+					index += 4;
+				}
+			}
+			break;
+		default:
+			ASSERT(false);
+			return;
+		}
+
+		task->primitiveStart = start;
+		task->vertexCount = triangleCount * 3;
+		vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
+	}
+
+	// Runs the draw call's setup routine over the first 'count' triangles of
+	// the batch belonging to 'unit', clipping where the vertex clip flags
+	// require it. Each accepted triangle consumes state.multiSample entries of
+	// the unit's primitive batch. Returns the number of accepted triangles.
+	int Renderer::setupSolidTriangles(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
+		const DrawData *data = draw.data;
+
+		int ms = state.multiSample;
+		int pos = state.positionRegister;
+
+		Triangle *batch = triangleBatch[unit];
+		Primitive *primitive = primitiveBatch[unit];
+		int visible = 0;
+
+		for(int n = 0; n < count; n++)
+		{
+			Triangle *triangle = &batch[n];
+
+			Vertex &v0 = triangle->v0;
+			Vertex &v1 = triangle->v1;
+			Vertex &v2 = triangle->v2;
+
+			// Only triangles whose combined (AND-ed) clip flags equal CLIP_FINITE are processed.
+			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) != Clipper::CLIP_FINITE)
+			{
+				continue;
+			}
+
+			Polygon polygon(&v0.v[pos], &v1.v[pos], &v2.v[pos]);
+
+			int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags | draw.clipFlags;
+
+			// Clip only when some flag beyond CLIP_FINITE is set; drop the triangle if clipping rejects it.
+			if(clipFlagsOr != Clipper::CLIP_FINITE && !clipper->clip(polygon, clipFlagsOr, draw))
+			{
+				continue;
+			}
+
+			if(setupRoutine(primitive, triangle, &polygon, data))
+			{
+				primitive += ms;
+				visible++;
+			}
+		}
+
+		return visible;
+	}
+
+	// Sets up the first triangle of the unit's batch as three lines
+	// (v0-v1, v1-v2, v2-v0). Batch entries 1 and 2 are overwritten to hold the
+	// second and third edge. Returns the number of edges that produced a
+	// primitive, or 0 when the triangle is culled. 'count' is not used.
+	int Renderer::setupWireframeTriangle(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+
+		Triangle *edge = triangleBatch[unit];
+		Primitive *primitive = primitiveBatch[unit];
+
+		const Vertex &v0 = edge[0].v0;
+		const Vertex &v1 = edge[0].v1;
+		const Vertex &v2 = edge[0].v2;
+
+		// Signed value used for the culling test and, halved, stored as the primitive area.
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		const bool culled = (state.cullMode == CULL_CLOCKWISE && d >= 0) ||
+		                    (state.cullMode == CULL_COUNTERCLOCKWISE && d <= 0);
+
+		if(culled)
+		{
+			return 0;
+		}
+
+		// Copy attributes: entry 1 becomes edge v1-v2, entry 2 becomes edge v2-v0.
+		edge[1].v0 = v1;
+		edge[1].v1 = v2;
+		edge[2].v0 = v2;
+		edge[2].v1 = v0;
+
+		if(state.color[0][0].flat)   // FIXME
+		{
+			// Flat shading: every edge vertex takes both colors of the first vertex.
+			for(int c = 0; c < 2; c++)
+			{
+				edge[1].v0.C[c] = edge[0].v0.C[c];
+				edge[1].v1.C[c] = edge[0].v0.C[c];
+				edge[2].v0.C[c] = edge[0].v0.C[c];
+				edge[2].v1.C[c] = edge[0].v0.C[c];
+			}
+		}
+
+		int visible = 0;
+
+		for(int e = 0; e < 3; e++)
+		{
+			if(!setupLine(*primitive, edge[e], draw))
+			{
+				continue;
+			}
+
+			primitive->area = 0.5f * d;
+
+			primitive++;
+			visible++;
+		}
+
+		return visible;
+	}
+
+	// Sets up the first triangle of the unit's batch as three points, one per
+	// vertex. Batch entries 1 and 2 receive v1 and v2 as their v0. Returns the
+	// number of points that produced a primitive, or 0 when the triangle is
+	// culled. 'count' is not used.
+	int Renderer::setupVertexTriangle(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+
+		Triangle *corner = triangleBatch[unit];
+		Primitive *primitive = primitiveBatch[unit];
+
+		const Vertex &v0 = corner[0].v0;
+		const Vertex &v1 = corner[0].v1;
+		const Vertex &v2 = corner[0].v2;
+
+		// Signed value used for the culling test and, halved, stored as the primitive area.
+		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w + (v0.x * v2.y - v0.y * v2.x) * v1.w + (v2.x * v1.y - v1.x * v2.y) * v0.w;
+
+		const bool culled = (state.cullMode == CULL_CLOCKWISE && d >= 0) ||
+		                    (state.cullMode == CULL_COUNTERCLOCKWISE && d <= 0);
+
+		if(culled)
+		{
+			return 0;
+		}
+
+		// Copy attributes: setupPoint() reads only v0 of each entry.
+		corner[1].v0 = v1;
+		corner[2].v0 = v2;
+
+		int visible = 0;
+
+		for(int p = 0; p < 3; p++)
+		{
+			if(!setupPoint(*primitive, corner[p], draw))
+			{
+				continue;
+			}
+
+			primitive->area = 0.5f * d;
+
+			primitive++;
+			visible++;
+		}
+
+		return visible;
+	}
+
+	// Calls setupLine() for the first 'count' entries of the unit's triangle
+	// batch. Each accepted line consumes state.multiSample entries of the
+	// primitive batch. Returns the number of accepted lines.
+	int Renderer::setupLines(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+		int ms = state.multiSample;
+
+		Triangle *line = triangleBatch[unit];
+		Primitive *primitive = primitiveBatch[unit];
+		int visible = 0;
+
+		for(int n = 0; n < count; n++)
+		{
+			if(!setupLine(*primitive, line[n], draw))
+			{
+				continue;
+			}
+
+			primitive += ms;
+			visible++;
+		}
+
+		return visible;
+	}
+
+	// Calls setupPoint() for the first 'count' entries of the unit's triangle
+	// batch. Each accepted point consumes state.multiSample entries of the
+	// primitive batch. Returns the number of accepted points.
+	int Renderer::setupPoints(int unit, int count)
+	{
+		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
+		SetupProcessor::State &state = draw.setupState;
+		int ms = state.multiSample;
+
+		Triangle *point = triangleBatch[unit];
+		Primitive *primitive = primitiveBatch[unit];
+		int visible = 0;
+
+		for(int n = 0; n < count; n++)
+		{
+			if(!setupPoint(*primitive, point[n], draw))
+			{
+				continue;
+			}
+
+			primitive += ms;
+			visible++;
+		}
+
+		return visible;
+	}
+
+	// Builds a polygon covering the line from triangle.v0 to triangle.v1 and
+	// passes it to the draw call's setup routine.
+	//
+	// primitive - output primitive handed to the setup routine.
+	// triangle  - supplies the two end points (v0 and v1); v2 is not read here.
+	// draw      - supplies the setup routine, setup state, draw data and clip flags.
+	//
+	// Returns the setup routine's result, or false when the line is rejected
+	// beforehand: both end points have w <= 0, the projected end points
+	// coincide, the corner clip flags do not AND to CLIP_FINITE, or clipping
+	// rejects the polygon.
+	bool Renderer::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+	{
+		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
+		const SetupProcessor::State &state = draw.setupState;
+		const DrawData &data = *draw.data;
+
+		float lineWidth = data.lineWidth;
+
+		Vertex &v0 = triangle.v0;
+		Vertex &v1 = triangle.v1;
+
+		int pos = state.positionRegister;
+
+		const float4 &P0 = v0.v[pos];
+		const float4 &P1 = v1.v[pos];
+
+		// Reject only when both end points have non-positive w.
+		if(P0.w <= 0 && P1.w <= 0)
+		{
+			return false;
+		}
+
+		// Wx16/Hx16 divided by 16.
+		// NOTE(review): presumably the viewport scale in pixels - confirm against where Wx16/Hx16 are written.
+		const float W = data.Wx16[0] * (1.0f / 16.0f);
+		const float H = data.Hx16[0] * (1.0f / 16.0f);
+
+		// Difference of the perspective-divided end points, scaled by W and H.
+		float dx = W * (P1.x / P1.w - P0.x / P0.w);
+		float dy = H * (P1.y / P1.w - P0.y / P0.w);
+
+		// Zero-length line: nothing to draw (this also guards the division by the length below).
+		if(dx == 0 && dy == 0)
+		{
+			return false;
+		}
+
+		if(state.multiSample > 1)   // Rectangle
+		{
+			float4 P[4];
+			int C[4];
+
+			P[0] = P0;
+			P[1] = P1;
+			P[2] = P1;
+			P[3] = P0;
+
+			// Rescale (dx, dy) to a length of half the line width.
+			float scale = lineWidth * 0.5f / sqrt(dx*dx + dy*dy);
+
+			dx *= scale;
+			dy *= scale;
+
+			// Per-end-point offsets: multiplied by that end point's w and divided by H or W.
+			float dx0h = dx * P0.w / H;
+			float dy0w = dy * P0.w / W;
+
+			float dx1h = dx * P1.w / H;
+			float dy1w = dy * P1.w / W;
+
+			// Displace the four corners to either side of the line and compute each corner's clip flags.
+			P[0].x += -dy0w;
+			P[0].y += +dx0h;
+			C[0] = clipper->computeClipFlags(P[0]);
+
+			P[1].x += -dy1w;
+			P[1].y += +dx1h;
+			C[1] = clipper->computeClipFlags(P[1]);
+
+			P[2].x += +dy1w;
+			P[2].y += -dx1h;
+			C[2] = clipper->computeClipFlags(P[2]);
+
+			P[3].x += +dy0w;
+			P[3].y += -dx0h;
+			C[3] = clipper->computeClipFlags(P[3]);
+
+			// Continue only when the AND of all corner flags is exactly CLIP_FINITE.
+			if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
+			{
+				Polygon polygon(P, 4);
+
+				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | draw.clipFlags;
+
+				// Clip only when some flag beyond CLIP_FINITE is set.
+				if(clipFlagsOr != Clipper::CLIP_FINITE)
+				{
+					if(!clipper->clip(polygon, clipFlagsOr, draw))
+					{
+						return false;
+					}
+				}
+
+				return setupRoutine(&primitive, &triangle, &polygon, &data);
+			}
+		}
+		else   // Diamond test convention
+		{
+			float4 P[8];
+			int C[8];
+
+			// P[0..3] start at end point 0, P[4..7] at end point 1.
+			P[0] = P0;
+			P[1] = P0;
+			P[2] = P0;
+			P[3] = P0;
+			P[4] = P1;
+			P[5] = P1;
+			P[6] = P1;
+			P[7] = P1;
+
+			// Half the line width, multiplied by each end point's w and divided by W or H.
+			float dx0 = lineWidth * 0.5f * P0.w / W;
+			float dy0 = lineWidth * 0.5f * P0.w / H;
+
+			float dx1 = lineWidth * 0.5f * P1.w / W;
+			float dy1 = lineWidth * 0.5f * P1.w / H;
+
+			// Each end point gets four diamond corners: -x, +y, +x, -y, in that index order.
+			P[0].x += -dx0;
+			C[0] = clipper->computeClipFlags(P[0]);
+
+			P[1].y += +dy0;
+			C[1] = clipper->computeClipFlags(P[1]);
+
+			P[2].x += +dx0;
+			C[2] = clipper->computeClipFlags(P[2]);
+
+			P[3].y += -dy0;
+			C[3] = clipper->computeClipFlags(P[3]);
+
+			P[4].x += -dx1;
+			C[4] = clipper->computeClipFlags(P[4]);
+
+			P[5].y += +dy1;
+			C[5] = clipper->computeClipFlags(P[5]);
+
+			P[6].x += +dx1;
+			C[6] = clipper->computeClipFlags(P[6]);
+
+			P[7].y += -dy1;
+			C[7] = clipper->computeClipFlags(P[7]);
+
+			// Continue only when the AND of all corner flags is exactly CLIP_FINITE.
+			if((C[0] & C[1] & C[2] & C[3] & C[4] & C[5] & C[6] & C[7]) == Clipper::CLIP_FINITE)
+			{
+				// Six of the eight diamond corners form the polygon; which six depends on
+				// the line's dominant direction, decided by comparing dx against -dy and dy.
+				float4 L[6];
+
+				if(dx > -dy)
+				{
+					if(dx > dy)   // Right
+					{
+						L[0] = P[0];
+						L[1] = P[1];
+						L[2] = P[5];
+						L[3] = P[6];
+						L[4] = P[7];
+						L[5] = P[3];
+					}
+					else   // Down
+					{
+						L[0] = P[0];
+						L[1] = P[4];
+						L[2] = P[5];
+						L[3] = P[6];
+						L[4] = P[2];
+						L[5] = P[3];
+					}
+				}
+				else
+				{
+					if(dx > dy)   // Up
+					{
+						L[0] = P[0];
+						L[1] = P[1];
+						L[2] = P[2];
+						L[3] = P[6];
+						L[4] = P[7];
+						L[5] = P[4];
+					}
+					else   // Left
+					{
+						L[0] = P[1];
+						L[1] = P[2];
+						L[2] = P[3];
+						L[3] = P[7];
+						L[4] = P[4];
+						L[5] = P[5];
+					}
+				}
+
+				Polygon polygon(L, 6);
+
+				// The OR uses the flags of all eight corners, including the two left out of L.
+				int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | C[4] | C[5] | C[6] | C[7] | draw.clipFlags;
+
+				if(clipFlagsOr != Clipper::CLIP_FINITE)
+				{
+					if(!clipper->clip(polygon, clipFlagsOr, draw))
+					{
+						return false;
+					}
+				}
+
+				return setupRoutine(&primitive, &triangle, &polygon, &data);
+			}
+		}
+
+		// Reached when the combined corner clip flags were not CLIP_FINITE.
+		return false;
+	}
+
+	// Builds a four-corner polygon centered on triangle.v0 and passes it to
+	// the draw call's setup routine.
+	//
+	// primitive - output primitive handed to the setup routine.
+	// triangle  - v0 is the point; v1 and v2 are overwritten here (see below),
+	//             even when the point ends up rejected.
+	// draw      - supplies the setup routine, setup state, draw data and clip flags.
+	//
+	// Returns the setup routine's result, or false when the corner clip flags
+	// do not AND to CLIP_FINITE or clipping rejects the polygon.
+	bool Renderer::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+	{
+		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
+		const SetupProcessor::State &state = draw.setupState;
+		const DrawData &data = *draw.data;
+
+		Vertex &v = triangle.v0;
+
+		float pSize;
+
+		int pts = state.pointSizeRegister;
+
+		// Point size comes from the y component of the vertex's point size register
+		// when one is in use, otherwise from the draw data.
+		if(state.pointSizeRegister != Unused)
+		{
+			pSize = v.v[pts].y;
+		}
+		else
+		{
+			pSize = data.point.pointSize[0];
+		}
+
+		pSize = clamp(pSize, data.point.pointSizeMin, data.point.pointSizeMax);
+
+		float4 P[4];
+		int C[4];
+
+		int pos = state.positionRegister;
+
+		// All four corners start at the vertex position.
+		P[0] = v.v[pos];
+		P[1] = v.v[pos];
+		P[2] = v.v[pos];
+		P[3] = v.v[pos];
+
+		// Corner offsets: point size times w times the halfPixelX/halfPixelY factors.
+		const float X = pSize * P[0].w * data.halfPixelX[0];
+		const float Y = pSize * P[0].w * data.halfPixelY[0];
+
+		// Corners in order (-X,+Y), (+X,+Y), (+X,-Y), (-X,-Y), each with its clip flags.
+		P[0].x -= X;
+		P[0].y += Y;
+		C[0] = clipper->computeClipFlags(P[0]);
+
+		P[1].x += X;
+		P[1].y += Y;
+		C[1] = clipper->computeClipFlags(P[1]);
+
+		P[2].x += X;
+		P[2].y -= Y;
+		C[2] = clipper->computeClipFlags(P[2]);
+
+		P[3].x -= X;
+		P[3].y -= Y;
+		C[3] = clipper->computeClipFlags(P[3]);
+
+		// v1 and v2 become copies of v0, displaced along X and Y respectively by
+		// iround(16 * 0.5f * pSize). The Y direction follows the sign of Hx16.
+		// NOTE(review): the factor 16 suggests X/Y hold 1/16-pixel fixed-point coordinates - confirm.
+		triangle.v1 = triangle.v0;
+		triangle.v2 = triangle.v0;
+
+		triangle.v1.X += iround(16 * 0.5f * pSize);
+		triangle.v2.Y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
+
+		Polygon polygon(P, 4);
+
+		// Continue only when the AND of all corner flags is exactly CLIP_FINITE.
+		if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
+		{
+			int clipFlagsOr = C[0] | C[1] | C[2] | C[3] | draw.clipFlags;
+
+			// Clip only when some flag beyond CLIP_FINITE is set.
+			if(clipFlagsOr != Clipper::CLIP_FINITE)
+			{
+				if(!clipper->clip(polygon, clipFlagsOr, draw))
+				{
+					return false;
+				}
+			}
+
+			return setupRoutine(&primitive, &triangle, &polygon, &data);
+		}
+
+		return false;
+	}
+
+	// Allocates the per-unit triangle/primitive batches and, for each of the
+	// threadCount workers, its vertex task, events and thread. The unit and
+	// cluster counts are threadCount passed through ceilPow2().
+	void Renderer::initializeThreads()
+	{
+		unitCount = ceilPow2(threadCount);
+		clusterCount = ceilPow2(threadCount);
+
+		for(int unit = 0; unit < unitCount; unit++)
+		{
+			triangleBatch[unit] = (Triangle*)allocate(batchSize * sizeof(Triangle));
+			primitiveBatch[unit] = (Primitive*)allocate(batchSize * sizeof(Primitive));
+		}
+
+		for(int index = 0; index < threadCount; index++)
+		{
+			// A draw call of -1 marks the vertex cache as not belonging to any draw yet.
+			vertexTask[index] = (VertexTask*)allocate(sizeof(VertexTask));
+			vertexTask[index]->vertexCache.drawCall = -1;
+
+			task[index].type = Task::SUSPEND;
+
+			resume[index] = new Event();
+			suspend[index] = new Event();
+
+			// 'parameters' lives only for this iteration; the wait below keeps it
+			// in scope until the worker has signalled 'suspend'.
+			Parameters parameters;
+			parameters.threadIndex = index;
+			parameters.renderer = this;
+
+			exitThreads = false;
+			worker[index] = new Thread(threadFunction, &parameters);
+
+			// Wait for the worker's signal, then re-signal the event.
+			suspend[index]->wait();
+			suspend[index]->signal();
+		}
+	}
+
+	// Waits until no threads are awake, then stops and destroys every worker
+	// together with its events and vertex task, and finally releases all 16
+	// triangle/primitive batch slots.
+	void Renderer::terminateThreads()
+	{
+		// Poll in 1-unit sleeps until threadsAwake reaches zero.
+		while(threadsAwake != 0)
+		{
+			Thread::sleep(1);
+		}
+
+		for(int index = 0; index < threadCount; index++)
+		{
+			if(worker[index])
+			{
+				// Set the exit flag, wake the worker through 'resume', and wait for it to finish.
+				exitThreads = true;
+				resume[index]->signal();
+				worker[index]->join();
+
+				delete worker[index];
+				worker[index] = 0;
+
+				delete resume[index];
+				resume[index] = 0;
+
+				delete suspend[index];
+				suspend[index] = 0;
+			}
+
+			deallocate(vertexTask[index]);
+			vertexTask[index] = 0;
+		}
+
+		// All 16 slots are released, regardless of how many were allocated.
+		for(int unit = 0; unit < 16; unit++)
+		{
+			deallocate(triangleBatch[unit]);
+			triangleBatch[unit] = 0;
+
+			deallocate(primitiveBatch[unit]);
+			primitiveBatch[unit] = 0;
+		}
+	}
+
+	// Walks the vertex shader's instructions and, for every DEF, DEFI or DEFB
+	// instruction, stores the defined constant through the matching
+	// setVertexShaderConstant* call. A null shader is ignored.
+	void Renderer::loadConstants(const VertexShader *vertexShader)
+	{
+		if(!vertexShader) return;
+
+		const size_t length = vertexShader->getLength();
+
+		for(size_t n = 0; n < length; n++)
+		{
+			const Shader::Instruction *instruction = vertexShader->getInstruction(n);
+
+			switch(instruction->opcode)
+			{
+			case Shader::OPCODE_DEF:
+				{
+					// Float constant: four components taken from the first source operand.
+					int index = instruction->dst.index;
+					float value[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						value[c] = instruction->src[0].value[c];
+					}
+
+					setVertexShaderConstantF(index, value);
+				}
+				break;
+			case Shader::OPCODE_DEFI:
+				{
+					// Integer constant: four components taken from the first source operand.
+					int index = instruction->dst.index;
+					int integer[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						integer[c] = instruction->src[0].integer[c];
+					}
+
+					setVertexShaderConstantI(index, integer);
+				}
+				break;
+			case Shader::OPCODE_DEFB:
+				{
+					// Boolean constant: only the first component is used.
+					int index = instruction->dst.index;
+					int boolean = instruction->src[0].boolean[0];
+
+					setVertexShaderConstantB(index, &boolean);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	// Walks the pixel shader's instructions and, for every DEF, DEFI or DEFB
+	// instruction, stores the defined constant through the matching
+	// setPixelShaderConstant* call. A null shader is ignored.
+	void Renderer::loadConstants(const PixelShader *pixelShader)
+	{
+		if(!pixelShader) return;
+
+		const size_t length = pixelShader->getLength();
+
+		for(size_t n = 0; n < length; n++)
+		{
+			const Shader::Instruction *instruction = pixelShader->getInstruction(n);
+
+			switch(instruction->opcode)
+			{
+			case Shader::OPCODE_DEF:
+				{
+					// Float constant: four components taken from the first source operand.
+					int index = instruction->dst.index;
+					float value[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						value[c] = instruction->src[0].value[c];
+					}
+
+					setPixelShaderConstantF(index, value);
+				}
+				break;
+			case Shader::OPCODE_DEFI:
+				{
+					// Integer constant: four components taken from the first source operand.
+					int index = instruction->dst.index;
+					int integer[4];
+
+					for(int c = 0; c < 4; c++)
+					{
+						integer[c] = instruction->src[0].integer[c];
+					}
+
+					setPixelShaderConstantI(index, integer);
+				}
+				break;
+			case Shader::OPCODE_DEFB:
+				{
+					// Boolean constant: only the first component is used.
+					int index = instruction->dst.index;
+					int boolean = instruction->src[0].boolean[0];
+
+					setPixelShaderConstantB(index, &boolean);
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	// Stores the given resource as the context's index buffer.
+	void Renderer::setIndexBuffer(Resource *indexBuffer)
+	{
+		context->indexBuffer = indexBuffer;
+	}
+
+	// Stores the given mask as the context's sample mask.
+	void Renderer::setMultiSampleMask(unsigned int mask)
+	{
+		context->sampleMask = mask;
+	}
+
+	// Stores the mode in the namespace-scope variable sw::transparencyAntialiasing
+	// (not in a member of this renderer or its context).
+	void Renderer::setTransparencyAntialiasing(TransparencyAntialiasing transparencyAntialiasing)
+	{
+		sw::transparencyAntialiasing = transparencyAntialiasing;
+	}
+
+	// Returns true when the texture bound to 'sampler' is the same resource as
+	// any bound render target or the bound depth buffer.
+	bool Renderer::isReadWriteTexture(int sampler)
+	{
+		for(int target = 0; target < RENDERTARGETS; target++)
+		{
+			// Unbound render target slots cannot match.
+			if(!context->renderTarget[target])
+			{
+				continue;
+			}
+
+			if(context->texture[sampler] == context->renderTarget[target]->getResource())
+			{
+				return true;
+			}
+		}
+
+		return context->depthBuffer && context->texture[sampler] == context->depthBuffer->getResource();
+	}
+
+	// Recomputes the clip planes from the user planes when updateClipPlanes is
+	// set, then clears that flag. Only planes whose CLIP_PLANEn bit is set in
+	// clipFlags are written.
+	void Renderer::updateClipper()
+	{
+		if(!updateClipPlanes)
+		{
+			return;
+		}
+
+		const int planeFlag[6] =
+		{
+			Clipper::CLIP_PLANE0,
+			Clipper::CLIP_PLANE1,
+			Clipper::CLIP_PLANE2,
+			Clipper::CLIP_PLANE3,
+			Clipper::CLIP_PLANE4,
+			Clipper::CLIP_PLANE5
+		};
+
+		if(VertexProcessor::isFixedFunction())   // User plane in world space
+		{
+			// Fixed function: each enabled user plane is multiplied by the view transform.
+			const Matrix &viewTransform = getViewTransform();
+
+			for(int plane = 0; plane < 6; plane++)
+			{
+				if(clipFlags & planeFlag[plane]) clipPlane[plane] = viewTransform * userPlane[plane];
+			}
+		}
+		else   // User plane in clip space
+		{
+			// Otherwise the enabled user planes are copied unchanged.
+			for(int plane = 0; plane < 6; plane++)
+			{
+				if(clipFlags & planeFlag[plane]) clipPlane[plane] = userPlane[plane];
+			}
+		}
+
+		updateClipPlanes = false;
+	}
+
+	// Binds 'resource' as the texture of the given sampler slot in the context.
+	// The slot index is asserted to be below TOTAL_IMAGE_UNITS.
+	void Renderer::setTextureResource(unsigned int sampler, Resource *resource)
+	{
+		ASSERT(sampler < TOTAL_IMAGE_UNITS);
+
+		context->texture[sampler] = resource;
+	}
+
+	// Forwards one surface (identified by face and mipmap level) to the
+	// context's sampler at the given slot. Asserts sampler < TOTAL_IMAGE_UNITS,
+	// face < 6 and level < MIPMAP_LEVELS.
+	void Renderer::setTextureLevel(unsigned int sampler, unsigned int face, unsigned int level, Surface *surface, TextureType type)
+	{
+		ASSERT(sampler < TOTAL_IMAGE_UNITS && face < 6 && level < MIPMAP_LEVELS);
+
+		context->sampler[sampler].setTextureLevel(face, level, surface, type);
+	}
+
+	// Sets the texture filter of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setTextureFilter(SamplerType type, int sampler, FilterType textureFilter)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setTextureFilter(sampler, textureFilter);
+			return;
+		}
+
+		PixelProcessor::setTextureFilter(sampler, textureFilter);
+	}
+
+	// Sets the mipmap filter of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setMipmapFilter(SamplerType type, int sampler, MipmapType mipmapFilter)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMipmapFilter(sampler, mipmapFilter);
+			return;
+		}
+
+		PixelProcessor::setMipmapFilter(sampler, mipmapFilter);
+	}
+
+	// Sets the gather-enable flag of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setGatherEnable(SamplerType type, int sampler, bool enable)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setGatherEnable(sampler, enable);
+			return;
+		}
+
+		PixelProcessor::setGatherEnable(sampler, enable);
+	}
+
+	// Sets the U addressing mode of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setAddressingModeU(SamplerType type, int sampler, AddressingMode addressMode)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setAddressingModeU(sampler, addressMode);
+			return;
+		}
+
+		PixelProcessor::setAddressingModeU(sampler, addressMode);
+	}
+
+	// Sets the V addressing mode of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setAddressingModeV(SamplerType type, int sampler, AddressingMode addressMode)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setAddressingModeV(sampler, addressMode);
+			return;
+		}
+
+		PixelProcessor::setAddressingModeV(sampler, addressMode);
+	}
+
+	// Sets the W addressing mode of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setAddressingModeW(SamplerType type, int sampler, AddressingMode addressMode)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setAddressingModeW(sampler, addressMode);
+			return;
+		}
+
+		PixelProcessor::setAddressingModeW(sampler, addressMode);
+	}
+
+	// Sets the sRGB-read flag of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setReadSRGB(SamplerType type, int sampler, bool sRGB)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setReadSRGB(sampler, sRGB);
+			return;
+		}
+
+		PixelProcessor::setReadSRGB(sampler, sRGB);
+	}
+
+	// Passes 'bias' to setMipmapLOD of PixelProcessor when 'type' is
+	// SAMPLER_PIXEL, of VertexProcessor for any other type.
+	void Renderer::setMipmapLOD(SamplerType type, int sampler, float bias)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMipmapLOD(sampler, bias);
+			return;
+		}
+
+		PixelProcessor::setMipmapLOD(sampler, bias);
+	}
+
+	// Sets the border color of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setBorderColor(SamplerType type, int sampler, const Color<float> &borderColor)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setBorderColor(sampler, borderColor);
+			return;
+		}
+
+		PixelProcessor::setBorderColor(sampler, borderColor);
+	}
+
+	// Sets the maximum anisotropy of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setMaxAnisotropy(SamplerType type, int sampler, float maxAnisotropy)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
+			return;
+		}
+
+		PixelProcessor::setMaxAnisotropy(sampler, maxAnisotropy);
+	}
+
+	// Sets the high-precision-filtering flag of a sampler: routed to
+	// PixelProcessor when 'type' is SAMPLER_PIXEL, to VertexProcessor for any
+	// other type.
+	void Renderer::setHighPrecisionFiltering(SamplerType type, int sampler, bool highPrecisionFiltering)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
+			return;
+		}
+
+		PixelProcessor::setHighPrecisionFiltering(sampler, highPrecisionFiltering);
+	}
+
+	// Sets the red-channel swizzle of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setSwizzleR(SamplerType type, int sampler, SwizzleType swizzleR)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleR(sampler, swizzleR);
+			return;
+		}
+
+		PixelProcessor::setSwizzleR(sampler, swizzleR);
+	}
+
+	// Sets the green-channel swizzle of a sampler: routed to PixelProcessor
+	// when 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setSwizzleG(SamplerType type, int sampler, SwizzleType swizzleG)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleG(sampler, swizzleG);
+			return;
+		}
+
+		PixelProcessor::setSwizzleG(sampler, swizzleG);
+	}
+
+	// Sets the blue-channel swizzle of a sampler: routed to PixelProcessor
+	// when 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setSwizzleB(SamplerType type, int sampler, SwizzleType swizzleB)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleB(sampler, swizzleB);
+			return;
+		}
+
+		PixelProcessor::setSwizzleB(sampler, swizzleB);
+	}
+
+	// Sets the alpha-channel swizzle of a sampler: routed to PixelProcessor
+	// when 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setSwizzleA(SamplerType type, int sampler, SwizzleType swizzleA)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSwizzleA(sampler, swizzleA);
+			return;
+		}
+
+		PixelProcessor::setSwizzleA(sampler, swizzleA);
+	}
+
+	// Sets the compare function of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setCompareFunc(SamplerType type, int sampler, CompareFunc compFunc)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setCompareFunc(sampler, compFunc);
+			return;
+		}
+
+		PixelProcessor::setCompareFunc(sampler, compFunc);
+	}
+
+	// Sets the base level of a sampler: routed to PixelProcessor when 'type'
+	// is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setBaseLevel(SamplerType type, int sampler, int baseLevel)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setBaseLevel(sampler, baseLevel);
+			return;
+		}
+
+		PixelProcessor::setBaseLevel(sampler, baseLevel);
+	}
+
+	// Sets the maximum level of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setMaxLevel(SamplerType type, int sampler, int maxLevel)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMaxLevel(sampler, maxLevel);
+			return;
+		}
+
+		PixelProcessor::setMaxLevel(sampler, maxLevel);
+	}
+
+	// Sets the minimum LOD of a sampler: routed to PixelProcessor when 'type'
+	// is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setMinLod(SamplerType type, int sampler, float minLod)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMinLod(sampler, minLod);
+			return;
+		}
+
+		PixelProcessor::setMinLod(sampler, minLod);
+	}
+
+	// Sets the maximum LOD of a sampler: routed to PixelProcessor when 'type'
+	// is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setMaxLod(SamplerType type, int sampler, float maxLod)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setMaxLod(sampler, maxLod);
+			return;
+		}
+
+		PixelProcessor::setMaxLod(sampler, maxLod);
+	}
+
+	// Sets the sync-required flag of a sampler: routed to PixelProcessor when
+	// 'type' is SAMPLER_PIXEL, to VertexProcessor for any other type.
+	void Renderer::setSyncRequired(SamplerType type, int sampler, bool syncRequired)
+	{
+		if(type != SAMPLER_PIXEL)
+		{
+			VertexProcessor::setSyncRequired(sampler, syncRequired);
+			return;
+		}
+
+		PixelProcessor::setSyncRequired(sampler, syncRequired);
+	}
+
+	// Forwards the point sprite enable flag to the context.
+	void Renderer::setPointSpriteEnable(bool pointSpriteEnable)
+	{
+		context->setPointSpriteEnable(pointSpriteEnable);
+	}
+
+	// Forwards the point scale enable flag to the context.
+	void Renderer::setPointScaleEnable(bool pointScaleEnable)
+	{
+		context->setPointScaleEnable(pointScaleEnable);
+	}
+
+	// Stores the line width in the context.
+	void Renderer::setLineWidth(float width)
+	{
+		context->lineWidth = width;
+	}
+
+	// Stores the depth bias in the context.
+	void Renderer::setDepthBias(float bias)
+	{
+		context->depthBias = bias;
+	}
+
+	// Stores the slope depth bias in the context.
+	void Renderer::setSlopeDepthBias(float slopeBias)
+	{
+		context->slopeDepthBias = slopeBias;
+	}
+
+	// Stores the rasterizer discard flag in the context.
+	void Renderer::setRasterizerDiscard(bool rasterizerDiscard)
+	{
+		context->rasterizerDiscard = rasterizerDiscard;
+	}
+
+	// Makes 'shader' the context's pixel shader and loads the constants the
+	// shader itself defines (loadConstants() ignores a null shader).
+	void Renderer::setPixelShader(const PixelShader *shader)
+	{
+		context->pixelShader = shader;
+
+		loadConstants(shader);
+	}
+
+	// Makes 'shader' the context's vertex shader and loads the constants the
+	// shader itself defines (loadConstants() ignores a null shader).
+	void Renderer::setVertexShader(const VertexShader *shader)
+	{
+		context->vertexShader = shader;
+
+		loadConstants(shader);
+	}
+
+	// Writes 'count' float constants (four floats each, read consecutively
+	// from 'value') to the pixel processor, starting at register 'index'.
+	// Every draw call's psDirtyConstF is first raised to at least index + count.
+	void Renderer::setPixelShaderConstantF(unsigned int index, const float value[4], unsigned int count)
+	{
+		const unsigned int end = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->psDirtyConstF < end)
+			{
+				drawCall[d]->psDirtyConstF = end;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			PixelProcessor::setFloatConstant(index + c, &value[4 * c]);
+		}
+	}
+
+	// Writes 'count' integer constants (four ints each, read consecutively
+	// from 'value') to the pixel processor, starting at register 'index'.
+	// Every draw call's psDirtyConstI is first raised to at least index + count.
+	void Renderer::setPixelShaderConstantI(unsigned int index, const int value[4], unsigned int count)
+	{
+		const unsigned int end = index + count;
+
+		for(unsigned int d = 0; d < DRAW_COUNT; d++)
+		{
+			if(drawCall[d]->psDirtyConstI < end)
+			{
+				drawCall[d]->psDirtyConstI = end;
+			}
+		}
+
+		for(unsigned int c = 0; c < count; c++)
+		{
+			PixelProcessor::setIntegerConstant(index + c, &value[4 * c]);
+		}
+	}
+
+	void Renderer::setPixelShaderConstantB(unsigned int index, const int *boolean, unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;   // One past the last register written
+
+		// Raise the dirty high-water mark of every buffered draw call.
+		for(unsigned int draw = 0; draw < DRAW_COUNT; draw++)
+		{
+			unsigned int &dirty = drawCall[draw]->psDirtyConstB;
+			if(dirty < dirtyEnd) dirty = dirtyEnd;
+		}
+
+		for(unsigned int n = 0; n < count; n++)
+		{
+			PixelProcessor::setBooleanConstant(index + n, boolean[n]);   // One int per boolean register
+		}
+	}
+
+	void Renderer::setVertexShaderConstantF(unsigned int index, const float value[4], unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;   // One past the last register written
+
+		// Raise the dirty high-water mark of every buffered draw call.
+		for(unsigned int draw = 0; draw < DRAW_COUNT; draw++)
+		{
+			unsigned int &dirty = drawCall[draw]->vsDirtyConstF;
+			if(dirty < dirtyEnd) dirty = dirtyEnd;
+		}
+
+		for(unsigned int n = 0; n < count; n++)
+		{
+			VertexProcessor::setFloatConstant(index + n, value + 4 * n);   // Four floats per register
+		}
+	}
+
+	void Renderer::setVertexShaderConstantI(unsigned int index, const int value[4], unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;   // One past the last register written
+
+		// Raise the dirty high-water mark of every buffered draw call.
+		for(unsigned int draw = 0; draw < DRAW_COUNT; draw++)
+		{
+			unsigned int &dirty = drawCall[draw]->vsDirtyConstI;
+			if(dirty < dirtyEnd) dirty = dirtyEnd;
+		}
+
+		for(unsigned int n = 0; n < count; n++)
+		{
+			VertexProcessor::setIntegerConstant(index + n, value + 4 * n);   // Four ints per register
+		}
+	}
+
+	void Renderer::setVertexShaderConstantB(unsigned int index, const int *boolean, unsigned int count)
+	{
+		const unsigned int dirtyEnd = index + count;   // One past the last register written
+
+		// Raise the dirty high-water mark of every buffered draw call.
+		for(unsigned int draw = 0; draw < DRAW_COUNT; draw++)
+		{
+			unsigned int &dirty = drawCall[draw]->vsDirtyConstB;
+			if(dirty < dirtyEnd) dirty = dirtyEnd;
+		}
+
+		for(unsigned int n = 0; n < count; n++)
+		{
+			VertexProcessor::setBooleanConstant(index + n, boolean[n]);   // One int per boolean register
+		}
+	}
+
+	void Renderer::setModelMatrix(const Matrix &M, int i)   // Forwarded to the VertexProcessor base
+	{
+		this->VertexProcessor::setModelMatrix(M, i);
+	}
+
+	void Renderer::setViewMatrix(const Matrix &V)   // Also raises the updateClipPlanes flag
+	{
+		this->VertexProcessor::setViewMatrix(V);
+		this->updateClipPlanes = true;
+	}
+
+	void Renderer::setBaseMatrix(const Matrix &B)   // Also raises the updateClipPlanes flag
+	{
+		this->VertexProcessor::setBaseMatrix(B);
+		this->updateClipPlanes = true;
+	}
+
+	void Renderer::setProjectionMatrix(const Matrix &P)   // Also raises the updateClipPlanes flag
+	{
+		this->VertexProcessor::setProjectionMatrix(P);
+		this->updateClipPlanes = true;
+	}
+
+	void Renderer::addQuery(Query *query)   // Appends to the renderer's query list
+	{
+		queries.insert(queries.end(), query);
+	}
+
+	void Renderer::removeQuery(Query *query)   // Drops every list entry equal to this pointer
+	{
+		this->queries.remove(query);
+	}
+
+	#if PERF_HUD
+		int Renderer::getThreadCount()   // Current value of threadCount
+		{
+			return threadCount;
+		}
+
+		int64_t Renderer::getVertexTime(int thread)   // No range check on 'thread'
+		{
+			return this->vertexTime[thread];
+		}
+
+		int64_t Renderer::getSetupTime(int thread)   // No range check on 'thread'
+		{
+			return this->setupTime[thread];
+		}
+
+		int64_t Renderer::getPixelTime(int thread)   // No range check on 'thread'
+		{
+			return this->pixelTime[thread];
+		}
+
+		void Renderer::resetTimers()   // Zeroes the timers of the first threadCount threads
+		{
+			for(int t = 0; t < threadCount; ++t)
+			{
+				this->vertexTime[t] = 0;
+				this->setupTime[t] = 0;
+				this->pixelTime[t] = 0;
+			}
+		}
+	#endif
+
+	void Renderer::setViewport(const Viewport &viewport)   // Copies the viewport into the renderer
+	{
+		Renderer::viewport = viewport;
+	}
+
+	void Renderer::setScissor(const Rect &scissor)   // Copies the scissor rectangle into the renderer
+	{
+		Renderer::scissor = scissor;
+	}
+
+	void Renderer::setClipFlags(int flags)   // Stores the flags shifted above the low byte
+	{
+		this->clipFlags = flags << 8;   // Bottom 8 bits used by legacy frustum
+	}
+
+	void Renderer::setClipPlane(unsigned int index, const float plane[4])
+	{
+		if(index >= MAX_CLIP_PLANES)
+		{
+			ASSERT(false);   // Invalid plane index; nothing is stored
+		}
+		else userPlane[index] = plane;
+
+		this->updateClipPlanes = true;
+	}
+
+	void Renderer::updateConfiguration(bool initialUpdate)   // Applies the settings reported by swiftConfig
+	{
+		bool newConfiguration = swiftConfig->hasNewConfiguration();
+
+		if(newConfiguration || initialUpdate)
+		{
+			terminateThreads();   // Called before any of the settings below are changed
+
+			SwiftConfig::Configuration configuration = {};
+			swiftConfig->getConfiguration(configuration);
+
+			precacheVertex = !newConfiguration && configuration.precache;   // Can only be true on an initial update without a new configuration
+			precacheSetup = !newConfiguration && configuration.precache;
+			precachePixel = !newConfiguration && configuration.precache;
+
+			VertexProcessor::setRoutineCacheSize(configuration.vertexRoutineCacheSize);
+			PixelProcessor::setRoutineCacheSize(configuration.pixelRoutineCacheSize);
+			SetupProcessor::setRoutineCacheSize(configuration.setupRoutineCacheSize);
+
+			switch(configuration.textureSampleQuality)
+			{
+			case 0:  Sampler::setFilterQuality(FILTER_POINT);       break;
+			case 1:  Sampler::setFilterQuality(FILTER_LINEAR);      break;
+			case 2:  Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;
+			default: Sampler::setFilterQuality(FILTER_ANISOTROPIC); break;   // Unrecognized values behave like 2
+			}
+
+			switch(configuration.mipmapQuality)
+			{
+			case 0:  Sampler::setMipmapQuality(MIPMAP_POINT);  break;
+			case 1:  Sampler::setMipmapQuality(MIPMAP_LINEAR); break;
+			default: Sampler::setMipmapQuality(MIPMAP_LINEAR); break;   // Unrecognized values behave like 1
+			}
+
+			setPerspectiveCorrection(configuration.perspectiveCorrection);
+
+			switch(configuration.transcendentalPrecision)   // One setting drives all four precision variables
+			{
+			case 0:
+				logPrecision = APPROXIMATE;
+				expPrecision = APPROXIMATE;
+				rcpPrecision = APPROXIMATE;
+				rsqPrecision = APPROXIMATE;
+				break;
+			case 1:
+				logPrecision = PARTIAL;
+				expPrecision = PARTIAL;
+				rcpPrecision = PARTIAL;
+				rsqPrecision = PARTIAL;
+				break;
+			case 2:
+				logPrecision = ACCURATE;
+				expPrecision = ACCURATE;
+				rcpPrecision = ACCURATE;
+				rsqPrecision = ACCURATE;
+				break;
+			case 3:
+				logPrecision = WHQL;
+				expPrecision = WHQL;
+				rcpPrecision = WHQL;
+				rsqPrecision = WHQL;
+				break;
+			case 4:
+				logPrecision = IEEE;
+				expPrecision = IEEE;
+				rcpPrecision = IEEE;
+				rsqPrecision = IEEE;
+				break;
+			default:   // Unrecognized values behave like 2 (ACCURATE)
+				logPrecision = ACCURATE;
+				expPrecision = ACCURATE;
+				rcpPrecision = ACCURATE;
+				rsqPrecision = ACCURATE;
+				break;
+			}
+
+			switch(configuration.transparencyAntialiasing)
+			{
+			case 0:  transparencyAntialiasing = TRANSPARENCY_NONE;              break;
+			case 1:  transparencyAntialiasing = TRANSPARENCY_ALPHA_TO_COVERAGE; break;
+			default: transparencyAntialiasing = TRANSPARENCY_NONE;              break;   // Unrecognized values behave like 0
+			}
+
+			switch(configuration.threadCount)
+			{
+			case -1: threadCount = CPUID::coreCount();        break;
+			case 0:  threadCount = CPUID::processAffinity();  break;
+			default: threadCount = configuration.threadCount; break;   // Any other value is used as-is
+			}
+
+			CPUID::setEnableSSE4_1(configuration.enableSSE4_1);
+			CPUID::setEnableSSSE3(configuration.enableSSSE3);
+			CPUID::setEnableSSE3(configuration.enableSSE3);
+			CPUID::setEnableSSE2(configuration.enableSSE2);
+			CPUID::setEnableSSE(configuration.enableSSE);
+
+			for(int pass = 0; pass < 10; pass++)   // Copies the first 10 optimization entries
+			{
+				optimization[pass] = configuration.optimization[pass];
+			}
+
+			forceWindowed = configuration.forceWindowed;
+			complementaryDepthBuffer = configuration.complementaryDepthBuffer;
+			postBlendSRGB = configuration.postBlendSRGB;
+			exactColorRounding = configuration.exactColorRounding;
+			forceClearRegisters = configuration.forceClearRegisters;
+
+		#ifndef NDEBUG
+			minPrimitives = configuration.minPrimitives;   // Primitive limits are only copied when NDEBUG is not defined
+			maxPrimitives = configuration.maxPrimitives;
+		#endif
+		}
+
+		if(!initialUpdate && !worker[0])   // Threads are only (re)created on non-initial updates, and only when worker[0] is null
+		{
+			initializeThreads();
+		}
+	}
+}
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
new file mode 100644
index 0000000..ce22866
--- /dev/null
+++ b/src/Device/Renderer.hpp
@@ -0,0 +1,508 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Renderer_hpp
+#define sw_Renderer_hpp
+
+#include "VertexProcessor.hpp"
+#include "PixelProcessor.hpp"
+#include "SetupProcessor.hpp"
+#include "Plane.hpp"
+#include "Blitter.hpp"
+#include "Common/MutexLock.hpp"
+#include "Common/Thread.hpp"
+#include "Main/Config.hpp"
+
+#include <list>
+
+namespace sw
+{
+	class Clipper;
+	struct DrawCall;
+	class PixelShader;
+	class VertexShader;
+	class SwiftConfig;
+	struct Task;   // NOTE(review): Renderer also declares a nested struct Task; this namespace-scope name is a distinct type
+	class Resource;
+	struct Constants;
+
+	enum TranscendentalPrecision   // Value type of logPrecision, expPrecision, rcpPrecision and rsqPrecision
+	{
+		APPROXIMATE,
+		PARTIAL,	// 2^-10
+		ACCURATE,
+		WHQL,		// 2^-21
+		IEEE		// 2^-23
+	};
+
+	extern TranscendentalPrecision logPrecision;   // Assigned in Renderer::updateConfiguration()
+	extern TranscendentalPrecision expPrecision;   // Assigned in Renderer::updateConfiguration()
+	extern TranscendentalPrecision rcpPrecision;   // Assigned in Renderer::updateConfiguration()
+	extern TranscendentalPrecision rsqPrecision;   // Assigned in Renderer::updateConfiguration()
+	extern bool perspectiveCorrection;   // Presumably written through setPerspectiveCorrection() - TODO confirm
+
+	struct Conventions   // Set of boolean switches passed to the Renderer constructor; OpenGL and Direct3D presets follow
+	{
+		bool halfIntegerCoordinates;
+		bool symmetricNormalizedDepth;
+		bool booleanFaceRegister;
+		bool fullPixelPositionRegister;
+		bool leadingVertexFirst;
+		bool secondaryColor;
+		bool colorsDefaultToZero;
+	};
+
+	static const Conventions OpenGL =   // Aggregate initializer: order must match the member order of Conventions
+	{
+		true,    // halfIntegerCoordinates
+		true,    // symmetricNormalizedDepth
+		true,    // booleanFaceRegister
+		true,    // fullPixelPositionRegister
+		false,   // leadingVertexFirst
+		false,   // secondaryColor
+		true,    // colorsDefaultToZero
+	};
+
+	static const Conventions Direct3D =   // Aggregate initializer: order must match the member order of Conventions
+	{
+		false,   // halfIntegerCoordinates
+		false,   // symmetricNormalizedDepth
+		false,   // booleanFaceRegister
+		false,   // fullPixelPositionRegister
+		true,    // leadingVertexFirst
+		true,    // secondaryColor
+		false,   // colorsDefaultToZero
+	};
+
+	struct Query   // Counter object registered with the renderer via addQuery()/removeQuery()
+	{
+		enum Type { FRAGMENTS_PASSED, TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN };
+
+		Query(Type type) : building(false), reference(0), data(0), type(type)   // Starts idle with zeroed counters
+		{
+		}
+
+		void begin()   // Marks the query active and resets its accumulated data
+		{
+			building = true;
+			data = 0;
+		}
+
+		void end()   // Marks the query inactive; data is left untouched
+		{
+			building = false;
+		}
+
+		bool building;   // True between begin() and end()
+		AtomicInt reference;
+		AtomicInt data;
+
+		const Type type;   // Fixed at construction
+	};
+
+	struct DrawData   // Per-draw data block, referenced from DrawCall::data
+	{
+		const Constants *constants;
+
+		const void *input[MAX_VERTEX_INPUTS];
+		unsigned int stride[MAX_VERTEX_INPUTS];
+		Texture mipmap[TOTAL_IMAGE_UNITS];
+		const void *indices;
+
+		struct VS   // Vertex shader constants and buffer bindings
+		{
+			float4 c[VERTEX_UNIFORM_VECTORS + 1];   // One extra for indices out of range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}
+			byte* u[MAX_UNIFORM_BUFFER_BINDINGS];
+			byte* t[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS];
+			unsigned int reg[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Offset used when reading from registers, in components
+			unsigned int row[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Number of rows to read
+			unsigned int col[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Number of columns to read
+			unsigned int str[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS]; // Number of components between each varying in output buffer
+			int4 i[16];
+			bool b[16];
+		};
+
+		struct PS   // Pixel shader constants and buffer bindings
+		{
+			word4 cW[8][4];
+			float4 c[FRAGMENT_UNIFORM_VECTORS];
+			byte* u[MAX_UNIFORM_BUFFER_BINDINGS];
+			int4 i[16];
+			bool b[16];
+		};
+
+		union   // vs and ff share the same storage
+		{
+			VS vs;
+			VertexProcessor::FixedFunction ff;
+		};
+
+		PS ps;
+
+		int instanceID;
+
+		VertexProcessor::PointSprite point;
+		float lineWidth;
+
+		PixelProcessor::Stencil stencil[2];   // clockwise, counterclockwise
+		PixelProcessor::Stencil stencilCCW;   // NOTE(review): possibly redundant with stencil[1] - confirm
+		PixelProcessor::Fog fog;
+		PixelProcessor::Factor factor;
+		unsigned int occlusion[16];   // Number of pixels passing depth test
+
+		#if PERF_PROFILE
+			int64_t cycles[PERF_TIMERS][16];
+		#endif
+
+		TextureStage::Uniforms textureStage[8];
+
+		float4 Wx16;
+		float4 Hx16;
+		float4 X0x16;
+		float4 Y0x16;
+		float4 XXXX;
+		float4 YYYY;
+		float4 halfPixelX;
+		float4 halfPixelY;
+		float viewportHeight;
+		float slopeDepthBias;
+		float depthRange;
+		float depthNear;
+		Plane clipPlane[6];
+
+		unsigned int *colorBuffer[RENDERTARGETS];
+		int colorPitchB[RENDERTARGETS];
+		int colorSliceB[RENDERTARGETS];
+		float *depthBuffer;
+		int depthPitchB;
+		int depthSliceB;
+		unsigned char *stencilBuffer;
+		int stencilPitchB;
+		int stencilSliceB;
+
+		int scissorX0;
+		int scissorX1;
+		int scissorY0;
+		int scissorY1;
+
+		float4 a2c0;
+		float4 a2c1;
+		float4 a2c2;
+		float4 a2c3;
+	};
+
+	struct Viewport   // Viewport description stored by Renderer::setViewport()
+	{
+		float x0;
+		float y0;
+		float width;
+		float height;
+		float minZ;
+		float maxZ;
+	};
+
+	class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor   // Draw entry point; state setters forward to the three processor bases and the Context
+	{
+		struct Task   // Unit of work held in taskQueue and task[]
+		{
+			enum Type
+			{
+				PRIMITIVES,
+				PIXELS,
+
+				RESUME,
+				SUSPEND
+			};
+
+			AtomicInt type;
+			AtomicInt primitiveUnit;
+			AtomicInt pixelCluster;
+		};
+
+		struct PrimitiveProgress
+		{
+			void init()   // Resets every counter to zero
+			{
+				drawCall = 0;
+				firstPrimitive = 0;
+				primitiveCount = 0;
+				visible = 0;
+				references = 0;
+			}
+
+			AtomicInt drawCall;
+			AtomicInt firstPrimitive;
+			AtomicInt primitiveCount;
+			AtomicInt visible;
+			AtomicInt references;
+		};
+
+		struct PixelProgress
+		{
+			void init()   // Resets the counters and clears the executing flag
+			{
+				drawCall = 0;
+				processedPrimitives = 0;
+				executing = false;
+			}
+
+			AtomicInt drawCall;
+			AtomicInt processedPrimitives;
+			AtomicInt executing;
+		};
+
+	public:
+		Renderer(Context *context, Conventions conventions, bool exactColorRounding);
+
+		virtual ~Renderer();
+
+		void *operator new(size_t size);
+		void operator delete(void * mem);
+
+		void draw(DrawType drawType, unsigned int indexOffset, unsigned int count, bool update = true);
+
+		void clear(void *value, Format format, Surface *dest, const Rect &rect, unsigned int rgbaMask);
+		void blit(Surface *source, const SliceRectF &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil = false, bool sRGBconversion = true);
+		void blit3D(Surface *source, Surface *dest);
+
+		void setIndexBuffer(Resource *indexBuffer);
+
+		void setMultiSampleMask(unsigned int mask);
+		void setTransparencyAntialiasing(TransparencyAntialiasing transparencyAntialiasing);
+
+		void setTextureResource(unsigned int sampler, Resource *resource);
+		void setTextureLevel(unsigned int sampler, unsigned int face, unsigned int level, Surface *surface, TextureType type);
+
+		void setTextureFilter(SamplerType type, int sampler, FilterType textureFilter);
+		void setMipmapFilter(SamplerType type, int sampler, MipmapType mipmapFilter);
+		void setGatherEnable(SamplerType type, int sampler, bool enable);
+		void setAddressingModeU(SamplerType type, int sampler, AddressingMode addressingMode);
+		void setAddressingModeV(SamplerType type, int sampler, AddressingMode addressingMode);
+		void setAddressingModeW(SamplerType type, int sampler, AddressingMode addressingMode);
+		void setReadSRGB(SamplerType type, int sampler, bool sRGB);
+		void setMipmapLOD(SamplerType type, int sampler, float bias);
+		void setBorderColor(SamplerType type, int sampler, const Color<float> &borderColor);
+		void setMaxAnisotropy(SamplerType type, int sampler, float maxAnisotropy);
+		void setHighPrecisionFiltering(SamplerType type, int sampler, bool highPrecisionFiltering);
+		void setSwizzleR(SamplerType type, int sampler, SwizzleType swizzleR);
+		void setSwizzleG(SamplerType type, int sampler, SwizzleType swizzleG);
+		void setSwizzleB(SamplerType type, int sampler, SwizzleType swizzleB);
+		void setSwizzleA(SamplerType type, int sampler, SwizzleType swizzleA);
+		void setCompareFunc(SamplerType type, int sampler, CompareFunc compare);
+		void setBaseLevel(SamplerType type, int sampler, int baseLevel);
+		void setMaxLevel(SamplerType type, int sampler, int maxLevel);
+		void setMinLod(SamplerType type, int sampler, float minLod);
+		void setMaxLod(SamplerType type, int sampler, float maxLod);
+		void setSyncRequired(SamplerType type, int sampler, bool syncRequired);
+
+		void setPointSpriteEnable(bool pointSpriteEnable);
+		void setPointScaleEnable(bool pointScaleEnable);
+		void setLineWidth(float width);
+
+		void setDepthBias(float bias);
+		void setSlopeDepthBias(float slopeBias);
+
+		void setRasterizerDiscard(bool rasterizerDiscard);
+
+		// Programmable pipelines
+		void setPixelShader(const PixelShader *shader);
+		void setVertexShader(const VertexShader *shader);
+
+		void setPixelShaderConstantF(unsigned int index, const float value[4], unsigned int count = 1);
+		void setPixelShaderConstantI(unsigned int index, const int value[4], unsigned int count = 1);
+		void setPixelShaderConstantB(unsigned int index, const int *boolean, unsigned int count = 1);
+
+		void setVertexShaderConstantF(unsigned int index, const float value[4], unsigned int count = 1);
+		void setVertexShaderConstantI(unsigned int index, const int value[4], unsigned int count = 1);
+		void setVertexShaderConstantB(unsigned int index, const int *boolean, unsigned int count = 1);
+
+		// Viewport & Clipper
+		void setViewport(const Viewport &viewport);
+		void setScissor(const Rect &scissor);
+		void setClipFlags(int flags);
+		void setClipPlane(unsigned int index, const float plane[4]);
+
+		// Partial transform
+		void setModelMatrix(const Matrix &M, int i = 0);
+		void setViewMatrix(const Matrix &V);
+		void setBaseMatrix(const Matrix &B);
+		void setProjectionMatrix(const Matrix &P);
+
+		void addQuery(Query *query);
+		void removeQuery(Query *query);
+
+		void synchronize();
+
+		#if PERF_HUD
+			// Performance timers
+			int getThreadCount();
+			int64_t getVertexTime(int thread);
+			int64_t getSetupTime(int thread);
+			int64_t getPixelTime(int thread);
+			void resetTimers();
+		#endif
+
+		static int getClusterCount() { return clusterCount; }
+
+	private:
+		static void threadFunction(void *parameters);
+		void threadLoop(int threadIndex);
+		void taskLoop(int threadIndex);
+		void findAvailableTasks();
+		void scheduleTask(int threadIndex);
+		void executeTask(int threadIndex);
+		void finishRendering(Task &pixelTask);
+
+		void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
+
+		int setupSolidTriangles(int batch, int count);
+		int setupWireframeTriangle(int batch, int count);
+		int setupVertexTriangle(int batch, int count);
+		int setupLines(int batch, int count);
+		int setupPoints(int batch, int count);
+
+		bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+		bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+
+		bool isReadWriteTexture(int sampler);
+		void updateClipper();
+		void updateConfiguration(bool initialUpdate = false);
+		void initializeThreads();
+		void terminateThreads();
+
+		void loadConstants(const VertexShader *vertexShader);
+		void loadConstants(const PixelShader *pixelShader);
+
+		Context *context;
+		Clipper *clipper;
+		Blitter *blitter;
+		Viewport viewport;
+		Rect scissor;
+		int clipFlags;   // Written by setClipFlags() as flags << 8
+
+		Triangle *triangleBatch[16];
+		Primitive *primitiveBatch[16];
+
+		// User-defined clipping planes
+		Plane userPlane[MAX_CLIP_PLANES];
+		Plane clipPlane[MAX_CLIP_PLANES];   // Transformed to clip space
+		bool updateClipPlanes;   // Raised by setClipPlane() and the view/base/projection matrix setters
+
+		AtomicInt exitThreads;
+		AtomicInt threadsAwake;
+		Thread *worker[16];
+		Event *resume[16];         // Events for resuming threads
+		Event *suspend[16];        // Events for suspending threads
+		Event *resumeApp;          // Event for resuming the application thread
+
+		PrimitiveProgress primitiveProgress[16];
+		PixelProgress pixelProgress[16];
+		Task task[16];   // Current tasks for threads
+
+		enum {
+			DRAW_COUNT = 16,   // Number of draw calls buffered (must be power of 2)
+			DRAW_COUNT_BITS = DRAW_COUNT - 1,   // NOTE(review): despite the name this is a mask value (all low bits set)
+		};
+		DrawCall *drawCall[DRAW_COUNT];
+		DrawCall *drawList[DRAW_COUNT];
+
+		AtomicInt currentDraw;
+		AtomicInt nextDraw;
+
+		enum {
+			TASK_COUNT = 32,   // Size of the task queue (must be power of 2)
+			TASK_COUNT_BITS = TASK_COUNT - 1,   // NOTE(review): despite the name this is a mask value (all low bits set)
+		};
+		Task taskQueue[TASK_COUNT];
+		AtomicInt qHead;
+		AtomicInt qSize;
+
+		static AtomicInt unitCount;
+		static AtomicInt clusterCount;
+
+		MutexLock schedulerMutex;
+
+		#if PERF_HUD
+			int64_t vertexTime[16];
+			int64_t setupTime[16];
+			int64_t pixelTime[16];
+		#endif
+
+		VertexTask *vertexTask[16];
+
+		SwiftConfig *swiftConfig;
+
+		std::list<Query*> queries;   // Maintained by addQuery()/removeQuery()
+		Resource *sync;
+
+		VertexProcessor::State vertexState;
+		SetupProcessor::State setupState;
+		PixelProcessor::State pixelState;
+
+		Routine *vertexRoutine;
+		Routine *setupRoutine;
+		Routine *pixelRoutine;
+	};
+
+	struct DrawCall   // One slot of Renderer::drawCall[]; holds routines, resources and progress for a buffered draw
+	{
+		DrawCall();
+
+		~DrawCall();
+
+		AtomicInt drawType;
+		AtomicInt batchSize;
+
+		Routine *vertexRoutine;
+		Routine *setupRoutine;
+		Routine *pixelRoutine;
+
+		VertexProcessor::RoutinePointer vertexPointer;
+		SetupProcessor::RoutinePointer setupPointer;
+		PixelProcessor::RoutinePointer pixelPointer;
+
+		int (Renderer::*setupPrimitives)(int batch, int count);   // Same signature as Renderer's setupSolidTriangles/setupLines/setupPoints family
+		SetupProcessor::State setupState;
+
+		Resource *vertexStream[MAX_VERTEX_INPUTS];
+		Resource *indexBuffer;
+		Surface *renderTarget[RENDERTARGETS];
+		Surface *depthBuffer;
+		Surface *stencilBuffer;
+		Resource *texture[TOTAL_IMAGE_UNITS];
+		Resource* pUniformBuffers[MAX_UNIFORM_BUFFER_BINDINGS];
+		Resource* vUniformBuffers[MAX_UNIFORM_BUFFER_BINDINGS];
+		Resource* transformFeedbackBuffers[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS];
+
+		unsigned int vsDirtyConstF;   // Raised to index + count by Renderer::setVertexShaderConstantF()
+		unsigned int vsDirtyConstI;   // Raised to index + count by Renderer::setVertexShaderConstantI()
+		unsigned int vsDirtyConstB;   // Raised to index + count by Renderer::setVertexShaderConstantB()
+
+		unsigned int psDirtyConstF;   // Raised to index + count by Renderer::setPixelShaderConstantF()
+		unsigned int psDirtyConstI;   // Raised to index + count by Renderer::setPixelShaderConstantI()
+		unsigned int psDirtyConstB;   // Raised to index + count by Renderer::setPixelShaderConstantB()
+
+		std::list<Query*> *queries;
+
+		AtomicInt clipFlags;
+
+		AtomicInt primitive;    // Current primitive to enter pipeline
+		AtomicInt count;        // Number of primitives to render
+		AtomicInt references;   // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
+
+		DrawData *data;
+	};
+}
+
+#endif   // sw_Renderer_hpp
diff --git a/src/Device/RoutineCache.hpp b/src/Device/RoutineCache.hpp
new file mode 100644
index 0000000..74dd842
--- /dev/null
+++ b/src/Device/RoutineCache.hpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_RoutineCache_hpp
+#define sw_RoutineCache_hpp
+
+#include "LRUCache.hpp"
+
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+	template<class State>
+	class RoutineCache : public LRUCache<State, Routine>   // LRU cache of Routine objects keyed by State
+	{
+	public:
+		RoutineCache(int n, const char *precache = 0);   // n is forwarded to the LRUCache constructor
+		~RoutineCache();
+
+	private:
+		const char *precache;   // Stored as given; not read anywhere in this header
+		#if defined(_WIN32)
+		HMODULE precacheDLL;   // NOTE(review): never initialized or used in this header
+		#endif
+	};
+
+	template<class State>
+	RoutineCache<State>::RoutineCache(int n, const char *precache) : LRUCache<State, Routine>(n), precache(precache)   // NOTE(review): precacheDLL is left uninitialized on _WIN32
+	{
+	}
+
+	template<class State>
+	RoutineCache<State>::~RoutineCache()   // Empty body: nothing is released here
+	{
+	}
+}
+
+#endif   // sw_RoutineCache_hpp
diff --git a/src/Device/Sampler.cpp b/src/Device/Sampler.cpp
new file mode 100644
index 0000000..efac4c6
--- /dev/null
+++ b/src/Device/Sampler.cpp
@@ -0,0 +1,514 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Sampler.hpp"
+
+#include "Context.hpp"
+#include "Surface.hpp"
+#include "Shader/PixelRoutine.hpp"
+#include "Common/Debug.hpp"
+
+#include <memory.h>
+#include <string.h>
+
+namespace sw
+{
+	FilterType Sampler::maximumTextureFilterQuality = FILTER_LINEAR;   // Upper bound applied by setTextureFilter()
+	MipmapType Sampler::maximumMipmapFilterQuality = MIPMAP_POINT;   // Upper bound applied by setMipmapFilter()
+
+	Sampler::State::State()   // Clears every byte of the state, padding included
+	{
+		memset(this, 0, sizeof(*this));
+	}
+
+	Sampler::Sampler()
+	{
+		// FIXME: Mipmap::init
+		static const unsigned int zero = 0x00FF00FF;
+
+		for(int i = 0; i < MIPMAP_LEVELS; i++)
+		{
+			Mipmap *mip = &texture.mipmap[i];
+
+			memset(mip, 0, sizeof(*mip));
+
+			for(int f = 0; f < 6; f++)
+			{
+				mip->buffer[f] = &zero;   // Every face starts out pointing at the shared placeholder value
+			}
+		}
+
+		this->externalTextureFormat = FORMAT_NULL;
+		this->internalTextureFormat = FORMAT_NULL;
+		this->textureType = TEXTURE_NULL;   // samplerState() returns a zeroed State while this holds
+
+		this->textureFilter = FILTER_LINEAR;
+		this->addressingModeU = ADDRESSING_WRAP;
+		this->addressingModeV = ADDRESSING_WRAP;
+		this->addressingModeW = ADDRESSING_WRAP;
+		this->mipmapFilterState = MIPMAP_NONE;
+		this->sRGB = false;
+		this->gather = false;
+		this->highPrecisionFiltering = false;
+		this->border = 0;
+
+		this->swizzleR = SWIZZLE_RED;   // Identity swizzle
+		this->swizzleG = SWIZZLE_GREEN;
+		this->swizzleB = SWIZZLE_BLUE;
+		this->swizzleA = SWIZZLE_ALPHA;
+
+		this->compare = COMPARE_BYPASS;
+
+		this->texture.LOD = 0.0f;
+		this->exp2LOD = 1.0f;
+
+		this->texture.baseLevel = 0;
+		this->texture.maxLevel = 1000;
+		this->texture.maxLod = MAX_TEXTURE_LOD;
+		this->texture.minLod = 0;
+	}
+
+	Sampler::~Sampler()   // Empty body: nothing is released here
+	{
+	}
+
+	Sampler::State Sampler::samplerState() const
+	{
+		State state;
+
+		// With no texture bound, the state is returned exactly as State's constructor left it.
+		if(textureType == TEXTURE_NULL) return state;
+
+		state.textureType = textureType;
+		state.textureFormat = internalTextureFormat;
+		state.textureFilter = getTextureFilter();
+		state.addressingModeU = getAddressingModeU();
+		state.addressingModeV = getAddressingModeV();
+		state.addressingModeW = getAddressingModeW();
+		state.mipmapFilter = mipmapFilter();
+		state.sRGB = (sRGB && Surface::isSRGBreadable(externalTextureFormat)) || Surface::isSRGBformat(internalTextureFormat);
+		state.swizzleR = swizzleR;
+		state.swizzleG = swizzleG;
+		state.swizzleB = swizzleB;
+		state.swizzleA = swizzleA;
+		state.highPrecisionFiltering = highPrecisionFiltering;
+		state.compare = getCompareFunc();
+
+		#if PERF_PROFILE
+			state.compressedFormat = Surface::isCompressed(externalTextureFormat);
+		#endif
+
+		return state;
+	}
+
+	void Sampler::setTextureLevel(int face, int level, Surface *surface, TextureType type)   // Binds one face of one mip level; a null surface only updates textureType
+	{
+		if(surface)
+		{
+			Mipmap &mipmap = texture.mipmap[level];
+
+			border = surface->getBorder();
+			mipmap.buffer[face] = surface->lockInternal(-border, -border, 0, LOCK_UNLOCKED, PRIVATE);
+
+			if(face == 0)   // Formats and dimensions are only (re)computed for face 0
+			{
+				externalTextureFormat = surface->getExternalFormat();
+				internalTextureFormat = surface->getInternalFormat();
+
+				int width = surface->getWidth();
+				int height = surface->getHeight();
+				int depth = surface->getDepth();
+				int pitchP = surface->getInternalPitchP();
+				int sliceP = surface->getInternalSliceP();
+
+				if(level == 0)   // The *LOD scale vectors are derived from the level 0 dimensions only
+				{
+					texture.widthHeightLOD[0] = width * exp2LOD;   // Layout: {width, width, height, height}
+					texture.widthHeightLOD[1] = width * exp2LOD;
+					texture.widthHeightLOD[2] = height * exp2LOD;
+					texture.widthHeightLOD[3] = height * exp2LOD;
+
+					texture.widthLOD[0] = width * exp2LOD;
+					texture.widthLOD[1] = width * exp2LOD;
+					texture.widthLOD[2] = width * exp2LOD;
+					texture.widthLOD[3] = width * exp2LOD;
+
+					texture.heightLOD[0] = height * exp2LOD;
+					texture.heightLOD[1] = height * exp2LOD;
+					texture.heightLOD[2] = height * exp2LOD;
+					texture.heightLOD[3] = height * exp2LOD;
+
+					texture.depthLOD[0] = depth * exp2LOD;
+					texture.depthLOD[1] = depth * exp2LOD;
+					texture.depthLOD[2] = depth * exp2LOD;
+					texture.depthLOD[3] = depth * exp2LOD;
+				}
+
+				if(Surface::isFloatFormat(internalTextureFormat))   // Float dimensions are only filled in for float formats
+				{
+					mipmap.fWidth[0] = (float)width / 65536.0f;
+					mipmap.fWidth[1] = (float)width / 65536.0f;
+					mipmap.fWidth[2] = (float)width / 65536.0f;
+					mipmap.fWidth[3] = (float)width / 65536.0f;
+
+					mipmap.fHeight[0] = (float)height / 65536.0f;
+					mipmap.fHeight[1] = (float)height / 65536.0f;
+					mipmap.fHeight[2] = (float)height / 65536.0f;
+					mipmap.fHeight[3] = (float)height / 65536.0f;
+
+					mipmap.fDepth[0] = (float)depth / 65536.0f;
+					mipmap.fDepth[1] = (float)depth / 65536.0f;
+					mipmap.fDepth[2] = (float)depth / 65536.0f;
+					mipmap.fDepth[3] = (float)depth / 65536.0f;
+				}
+
+				short halfTexelU = 0x8000 / width;   // NOTE(review): 0x8000 when width == 1, which exceeds a 16-bit short's maximum; divides by zero if width == 0
+				short halfTexelV = 0x8000 / height;   // NOTE(review): same concerns for height
+				short halfTexelW = 0x8000 / depth;   // NOTE(review): same concerns for depth
+
+				mipmap.uHalf[0] = halfTexelU;
+				mipmap.uHalf[1] = halfTexelU;
+				mipmap.uHalf[2] = halfTexelU;
+				mipmap.uHalf[3] = halfTexelU;
+
+				mipmap.vHalf[0] = halfTexelV;
+				mipmap.vHalf[1] = halfTexelV;
+				mipmap.vHalf[2] = halfTexelV;
+				mipmap.vHalf[3] = halfTexelV;
+
+				mipmap.wHalf[0] = halfTexelW;
+				mipmap.wHalf[1] = halfTexelW;
+				mipmap.wHalf[2] = halfTexelW;
+				mipmap.wHalf[3] = halfTexelW;
+
+				mipmap.width[0] = width;
+				mipmap.width[1] = width;
+				mipmap.width[2] = width;
+				mipmap.width[3] = width;
+
+				mipmap.height[0] = height;
+				mipmap.height[1] = height;
+				mipmap.height[2] = height;
+				mipmap.height[3] = height;
+
+				mipmap.depth[0] = depth;
+				mipmap.depth[1] = depth;
+				mipmap.depth[2] = depth;
+				mipmap.depth[3] = depth;
+
+				mipmap.onePitchP[0] = 1;   // Layout: {1, pitchP, 1, pitchP}
+				mipmap.onePitchP[1] = pitchP;
+				mipmap.onePitchP[2] = 1;
+				mipmap.onePitchP[3] = pitchP;
+
+				mipmap.pitchP[0] = pitchP;
+				mipmap.pitchP[1] = pitchP;
+				mipmap.pitchP[2] = pitchP;
+				mipmap.pitchP[3] = pitchP;
+
+				mipmap.sliceP[0] = sliceP;
+				mipmap.sliceP[1] = sliceP;
+				mipmap.sliceP[2] = sliceP;
+				mipmap.sliceP[3] = sliceP;
+
+				if(internalTextureFormat == FORMAT_YV12_BT601 ||
+				   internalTextureFormat == FORMAT_YV12_BT709 ||
+				   internalTextureFormat == FORMAT_YV12_JFIF)
+				{
+					unsigned int YStride = pitchP;
+					unsigned int YSize = YStride * height;
+					unsigned int CStride = align<16>(YStride / 2);   // C stride is half the Y stride, passed through align<16>
+					unsigned int CSize = CStride * height / 2;
+
+					mipmap.buffer[1] = (byte*)mipmap.buffer[0] + YSize;   // Faces 1 and 2 address the two C-sized planes stored after the Y plane
+					mipmap.buffer[2] = (byte*)mipmap.buffer[1] + CSize;
+
+					texture.mipmap[1].width[0] = width / 2;   // Level 1 holds the half-size plane dimensions, whatever 'level' is
+					texture.mipmap[1].width[1] = width / 2;
+					texture.mipmap[1].width[2] = width / 2;
+					texture.mipmap[1].width[3] = width / 2;
+					texture.mipmap[1].height[0] = height / 2;
+					texture.mipmap[1].height[1] = height / 2;
+					texture.mipmap[1].height[2] = height / 2;
+					texture.mipmap[1].height[3] = height / 2;
+					texture.mipmap[1].onePitchP[0] = 1;
+					texture.mipmap[1].onePitchP[1] = CStride;
+					texture.mipmap[1].onePitchP[2] = 1;
+					texture.mipmap[1].onePitchP[3] = CStride;
+				}
+			}
+		}
+
+		textureType = type;   // Updated even when surface is null
+	}
+
+	void Sampler::setTextureFilter(FilterType textureFilter)   // Limited to maximumTextureFilterQuality
+	{
+		Sampler::textureFilter = static_cast<FilterType>(min(textureFilter, maximumTextureFilterQuality));
+	}
+
+	void Sampler::setMipmapFilter(MipmapType mipmapFilter)   // Limited to maximumMipmapFilterQuality
+	{
+		this->mipmapFilterState = static_cast<MipmapType>(min(mipmapFilter, maximumMipmapFilterQuality));
+	}
+
+	void Sampler::setGatherEnable(bool enable)   // Stores the gather flag
+	{
+		this->gather = enable;
+	}
+
+	void Sampler::setAddressingModeU(AddressingMode addressingMode)   // Stores the U addressing mode
+	{
+		this->addressingModeU = addressingMode;
+	}
+
+	void Sampler::setAddressingModeV(AddressingMode addressingMode)   // Stores the V addressing mode
+	{
+		this->addressingModeV = addressingMode;
+	}
+
+	void Sampler::setAddressingModeW(AddressingMode addressingMode)   // Stores the W addressing mode
+	{
+		this->addressingModeW = addressingMode;
+	}
+
+	void Sampler::setReadSRGB(bool sRGB)   // Stored as given; samplerState() combines it with the texture formats
+	{
+		Sampler::sRGB = sRGB;
+	}
+
+	void Sampler::setBorderColor(const Color<float> &borderColor)
+	{
+		// FIXME: Compact into generic function   // FIXME: Clamp
+		const float channelF[4] = {borderColor.r, borderColor.g, borderColor.b, borderColor.a};
+
+		for(int channel = 0; channel < 4; channel++)
+		{
+			// Fixed-point copy of the channel: scaled by 0xFFFF, rounded, not clamped.
+			const short channel4 = iround(0xFFFF * channelF[channel]);
+
+			// Replicate both representations across all four lanes.
+			for(int lane = 0; lane < 4; lane++)
+			{
+				texture.borderColor4[channel][lane] = channel4;
+				texture.borderColorF[channel][lane] = channelF[channel];
+			}
+		}
+	}
+
+	void Sampler::setMaxAnisotropy(float maxAnisotropy)   // Stores the maximum anisotropy in the texture data.
+	{
+		texture.maxAnisotropy = maxAnisotropy;   // getTextureFilter() caps the filter at FILTER_LINEAR when this equals 1.0f.
+	}
+
+	void Sampler::setHighPrecisionFiltering(bool highPrecisionFiltering)   // Stores the high-precision filtering flag.
+	{
+		this->highPrecisionFiltering = highPrecisionFiltering;   // 'this->' is required because the parameter shadows the member.
+	}
+
+	void Sampler::setSwizzleR(SwizzleType swizzleR)   // Stores the swizzle selection for the red channel.
+	{
+		this->swizzleR = swizzleR;   // 'this->' is required because the parameter shadows the member.
+	}
+
+	void Sampler::setSwizzleG(SwizzleType swizzleG)   // Stores the swizzle selection for the green channel.
+	{
+		this->swizzleG = swizzleG;   // 'this->' is required because the parameter shadows the member.
+	}
+
+	void Sampler::setSwizzleB(SwizzleType swizzleB)   // Stores the swizzle selection for the blue channel.
+	{
+		this->swizzleB = swizzleB;   // 'this->' is required because the parameter shadows the member.
+	}
+
+	void Sampler::setSwizzleA(SwizzleType swizzleA)   // Stores the swizzle selection for the alpha channel.
+	{
+		this->swizzleA = swizzleA;   // 'this->' is required because the parameter shadows the member.
+	}
+
+	void Sampler::setCompareFunc(CompareFunc compare)   // Stores the requested comparison function.
+	{
+		this->compare = compare;   // getCompareFunc() overrides it for gather filtering and for FORMAT_D32FS8_SHADOW.
+	}
+
+	void Sampler::setBaseLevel(int baseLevel)   // Stores baseLevel in the texture data exposed by getTextureData().
+	{
+		texture.baseLevel = baseLevel;   // Stored as given; no range check is applied here.
+	}
+
+	void Sampler::setMaxLevel(int maxLevel)   // Stores maxLevel in the texture data exposed by getTextureData().
+	{
+		texture.maxLevel = maxLevel;   // Stored as given; no range check is applied here.
+	}
+
+	void Sampler::setMinLod(float minLod)   // Stores the minimum LOD, clamped to [0, MAX_TEXTURE_LOD].
+	{
+		texture.minLod = clamp(minLod, 0.0f, (float)(MAX_TEXTURE_LOD));   // Not checked against maxLod.
+	}
+
+	void Sampler::setMaxLod(float maxLod)   // Stores the maximum LOD, clamped to [0, MAX_TEXTURE_LOD].
+	{
+		texture.maxLod = clamp(maxLod, 0.0f, (float)(MAX_TEXTURE_LOD));   // Not checked against minLod.
+	}
+
+	void Sampler::setFilterQuality(FilterType maximumFilterQuality)   // Static: sets the ceiling applied by later setTextureFilter() calls on any sampler.
+	{
+		Sampler::maximumTextureFilterQuality = maximumFilterQuality;   // Filters already stored in samplers are not re-limited.
+	}
+
+	void Sampler::setMipmapQuality(MipmapType maximumFilterQuality)   // Static: sets the ceiling applied by later setMipmapFilter() calls on any sampler.
+	{
+		Sampler::maximumMipmapFilterQuality = maximumFilterQuality;   // Mipmap filters already stored in samplers are not re-limited.
+	}
+
+	void Sampler::setMipmapLOD(float LOD)   // Stores the LOD value and caches its power of two.
+	{
+		texture.LOD = LOD;
+		exp2LOD = exp2(LOD);   // Kept alongside texture.LOD so 2^LOD need not be recomputed.
+	}
+
+	bool Sampler::hasTexture() const   // True when the sampler's texture type is anything other than TEXTURE_NULL.
+	{
+		return textureType != TEXTURE_NULL;
+	}
+
+	bool Sampler::hasUnsignedTexture() const   // True only if all four components of the internal format are unsigned.
+	{
+		for(int component = 0; component < 4; component++)
+			if(!Surface::isUnsignedComponent(internalTextureFormat, component))
+				return false;   // Components are tested in order 0..3 and the first signed one ends the check.
+		return true;
+	}
+
+	bool Sampler::hasCubeTexture() const   // True when the sampler's texture type is TEXTURE_CUBE.
+	{
+		return textureType == TEXTURE_CUBE;
+	}
+
+	bool Sampler::hasVolumeTexture() const   // True for 3D textures and for 2D array textures.
+	{
+		return textureType == TEXTURE_3D || textureType == TEXTURE_2D_ARRAY;   // 2D arrays are reported as volumes too.
+	}
+
+	void Sampler::setSyncRequired(bool isSyncRequired)   // Stores the flag later reported by requiresSync().
+	{
+		syncRequired = isSyncRequired;
+	}
+
+	bool Sampler::requiresSync() const   // Returns the flag last stored by setSyncRequired().
+	{
+		return syncRequired;
+	}
+
+	const Texture &Sampler::getTextureData()   // Returns a read-only reference to the sampler's texture data.
+	{
+		return texture;   // A reference to the member itself, not a copy.
+	}
+
+	MipmapType Sampler::mipmapFilter() const   // Effective mipmap filter: the stored one, or MIPMAP_NONE when no level differs from level 0.
+	{
+		const bool enabled = (mipmapFilterState != MIPMAP_NONE);
+		const void *const base = texture.mipmap[0].buffer[0];
+
+		for(int level = 1; enabled && level < MIPMAP_LEVELS; level++)
+		{
+			if(texture.mipmap[level].buffer[0] != base)
+			{
+				return mipmapFilterState;   // At least one level has its own buffer.
+			}
+		}
+
+		// Only one mipmap level, or mipmapping is disabled
+		return MIPMAP_NONE;
+	}
+
+	TextureType Sampler::getTextureType() const   // Returns the stored texture type unchanged.
+	{
+		return textureType;
+	}
+
+	FilterType Sampler::getTextureFilter() const   // Resolves the texel filter actually used, starting from the stored state.
+	{
+		const Mipmap &base = texture.mipmap[0];
+		const bool singleTexel = (base.width[0] == 1) && (base.height[0] == 1) && (base.depth[0] == 1);
+
+		// Don't filter 1x1 textures, unless mipmapping is in effect.
+		if(singleTexel && mipmapFilter() == MIPMAP_NONE)
+		{
+			return FILTER_POINT;
+		}
+
+		// Gather replaces the stored filter, but only for single-component formats.
+		const bool useGather = gather && Surface::componentCount(internalTextureFormat) == 1;
+		FilterType filter = useGather ? FILTER_GATHER : textureFilter;
+
+		// Filters above FILTER_LINEAR are only kept for 2D textures whose maxAnisotropy is not 1.
+		const bool allowAnisotropy = (textureType == TEXTURE_2D) && !(texture.maxAnisotropy == 1.0f);
+
+		if(!allowAnisotropy)
+		{
+			return (FilterType)min(filter, FILTER_LINEAR);
+		}
+
+		return filter;
+	}
+
+	AddressingMode Sampler::getAddressingModeU() const   // Effective U addressing mode; cube textures ignore the stored mode.
+	{
+		if(textureType != TEXTURE_CUBE)
+		{
+			return addressingModeU;
+		}
+		// Cube textures: a non-zero border selects seamless addressing, otherwise clamp.
+		return (border != 0) ? ADDRESSING_SEAMLESS : ADDRESSING_CLAMP;
+	}
+
+	AddressingMode Sampler::getAddressingModeV() const   // Effective V addressing mode; cube textures ignore the stored mode.
+	{
+		if(textureType != TEXTURE_CUBE)
+		{
+			return addressingModeV;
+		}
+		// Cube textures: a non-zero border selects seamless addressing, otherwise clamp.
+		return (border != 0) ? ADDRESSING_SEAMLESS : ADDRESSING_CLAMP;
+	}
+
+	AddressingMode Sampler::getAddressingModeW() const   // Effective W addressing mode.
+	{
+		switch(textureType)
+		{
+		case TEXTURE_2D:
+		case TEXTURE_RECTANGLE:
+		case TEXTURE_CUBE:
+		case TEXTURE_2D_ARRAY:
+			return ADDRESSING_LAYER;   // These types never use the stored W mode.
+		default: return addressingModeW;   // TEXTURE_NULL and TEXTURE_3D keep the stored mode.
+		}
+	}
+
+	CompareFunc Sampler::getCompareFunc() const   // Effective comparison function after the gather and shadow-format overrides.
+	{
+		const bool gathering = (getTextureFilter() == FILTER_GATHER);
+
+		if(gathering)
+		{
+			return COMPARE_BYPASS;   // Gather takes priority over every other rule.
+		}
+
+		// FORMAT_D32FS8_SHADOW always compares with less-or-equal, whatever was requested.
+		const bool shadowFormat = (internalTextureFormat == FORMAT_D32FS8_SHADOW);
+
+		return shadowFormat ? COMPARE_LESSEQUAL : compare;
+	}
+}
diff --git a/src/Device/Sampler.hpp b/src/Device/Sampler.hpp
new file mode 100644
index 0000000..af225c5
--- /dev/null
+++ b/src/Device/Sampler.hpp
@@ -0,0 +1,248 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Sampler_hpp
+#define sw_Sampler_hpp
+
+#include "Main/Config.hpp"
+#include "Renderer/Surface.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	struct Mipmap   // Data describing one mipmap level of a texture (see Texture::mipmap).
+	{
+		const void *buffer[6];   // Level data pointers; YV12 formats use [1] and [2] for the planes following [0]. NOTE(review): six slots presumably match cube faces - confirm.
+
+		float4 fWidth;
+		float4 fHeight;
+		float4 fDepth;
+
+		short uHalf[4];
+		short vHalf[4];
+		short wHalf[4];
+		short width[4];    // Sampler::setTextureLevel() writes the same value to all four elements for the YV12 chroma level.
+		short height[4];   // Replicated like width for the YV12 chroma level.
+		short depth[4];
+		short onePitchP[4];   // For the YV12 chroma level this is filled as {1, pitch, 1, pitch}.
+		int4 pitchP;
+		int4 sliceP;
+	};
+
+	struct Texture   // Texture data owned by a Sampler and exposed read-only through Sampler::getTextureData().
+	{
+		Mipmap mipmap[MIPMAP_LEVELS];   // One entry per mipmap level.
+
+		float LOD;   // Set by Sampler::setMipmapLOD().
+		float4 widthHeightLOD;
+		float4 widthLOD;
+		float4 heightLOD;
+		float4 depthLOD;
+
+		word4 borderColor4[4];    // [channel][lane]: 16-bit border color, each channel replicated across four lanes by setBorderColor().
+		float4 borderColorF[4];   // [channel][lane]: float border color, replicated the same way.
+		float maxAnisotropy;      // Set by Sampler::setMaxAnisotropy().
+		int baseLevel;            // Set by Sampler::setBaseLevel().
+		int maxLevel;             // Set by Sampler::setMaxLevel().
+		float minLod;             // Clamped to [0, MAX_TEXTURE_LOD] by Sampler::setMinLod().
+		float maxLod;             // Clamped to [0, MAX_TEXTURE_LOD] by Sampler::setMaxLod().
+	};
+
+	enum SamplerType   // Distinguishes pixel samplers from vertex samplers.
+	{
+		SAMPLER_PIXEL,
+		SAMPLER_VERTEX
+	};
+
+	enum TextureType ENUM_UNDERLYING_TYPE_UNSIGNED_INT   // Kind of texture attached to a sampler.
+	{
+		TEXTURE_NULL,   // Sampler::hasTexture() returns false for this value.
+		TEXTURE_2D,
+		TEXTURE_RECTANGLE,
+		TEXTURE_CUBE,
+		TEXTURE_3D,
+		TEXTURE_2D_ARRAY,
+
+		TEXTURE_LAST = TEXTURE_2D_ARRAY   // Highest value; sizes the Sampler::State bit-field via BITS().
+	};
+
+	enum FilterType ENUM_UNDERLYING_TYPE_UNSIGNED_INT   // Texel filters; the order matters because Sampler.cpp limits filters with min().
+	{
+		FILTER_POINT,
+		FILTER_GATHER,   // Below FILTER_LINEAR, so the min(filter, FILTER_LINEAR) cap in getTextureFilter() leaves it unchanged.
+		FILTER_MIN_POINT_MAG_LINEAR,
+		FILTER_MIN_LINEAR_MAG_POINT,
+		FILTER_LINEAR,
+		FILTER_ANISOTROPIC,
+
+		FILTER_LAST = FILTER_ANISOTROPIC   // Highest value; sizes the Sampler::State bit-field via BITS().
+	};
+
+	enum MipmapType ENUM_UNDERLYING_TYPE_UNSIGNED_INT   // Mipmap filters; the order matters because setMipmapFilter() limits them with min().
+	{
+		MIPMAP_NONE,   // Also returned by Sampler::mipmapFilter() when no level differs from level 0.
+		MIPMAP_POINT,
+		MIPMAP_LINEAR,
+
+		MIPMAP_LAST = MIPMAP_LINEAR   // Highest value.
+	};
+
+	enum AddressingMode ENUM_UNDERLYING_TYPE_UNSIGNED_INT   // Texture coordinate addressing modes.
+	{
+		ADDRESSING_WRAP,
+		ADDRESSING_CLAMP,      // Used for cube textures without a border (see getAddressingModeU/V).
+		ADDRESSING_MIRROR,
+		ADDRESSING_MIRRORONCE,
+		ADDRESSING_BORDER,     // Single color
+		ADDRESSING_SEAMLESS,   // Border of pixels
+		ADDRESSING_LAYER,      // Forced on W for 2D, rectangle, cube and 2D array textures.
+		ADDRESSING_TEXELFETCH,
+
+		ADDRESSING_LAST = ADDRESSING_TEXELFETCH   // Highest value; sizes the Sampler::State bit-fields via BITS().
+	};
+
+	enum CompareFunc ENUM_UNDERLYING_TYPE_UNSIGNED_INT   // Comparison functions selectable on a sampler.
+	{
+		COMPARE_BYPASS,      // Returned by getCompareFunc() whenever gather filtering is in effect.
+		COMPARE_LESSEQUAL,   // Forced by getCompareFunc() for FORMAT_D32FS8_SHADOW.
+		COMPARE_GREATEREQUAL,
+		COMPARE_LESS,
+		COMPARE_GREATER,
+		COMPARE_EQUAL,
+		COMPARE_NOTEQUAL,
+		COMPARE_ALWAYS,
+		COMPARE_NEVER,
+
+		COMPARE_LAST = COMPARE_NEVER   // Highest value; sizes the Sampler::State bit-field via BITS().
+	};
+
+	enum SwizzleType ENUM_UNDERLYING_TYPE_UNSIGNED_INT   // Per-channel swizzle selections stored by Sampler::setSwizzleR/G/B/A().
+	{
+		SWIZZLE_RED,
+		SWIZZLE_GREEN,
+		SWIZZLE_BLUE,
+		SWIZZLE_ALPHA,
+		SWIZZLE_ZERO,
+		SWIZZLE_ONE,
+
+		SWIZZLE_LAST = SWIZZLE_ONE   // Highest value; sizes the Sampler::State bit-fields via BITS().
+	};
+
+	class Sampler   // Holds the texture data and sampling parameters of one sampler.
+	{
+	public:
+		struct State   // Bit-field snapshot of the effective sampling parameters, returned by samplerState().
+		{
+			State();
+
+			TextureType textureType        : BITS(TEXTURE_LAST);
+			Format textureFormat           : BITS(FORMAT_LAST);
+			FilterType textureFilter       : BITS(FILTER_LAST);
+			AddressingMode addressingModeU : BITS(ADDRESSING_LAST);
+			AddressingMode addressingModeV : BITS(ADDRESSING_LAST);
+			AddressingMode addressingModeW : BITS(ADDRESSING_LAST);
+			MipmapType mipmapFilter        : BITS(MIPMAP_LAST);   // Sized from MipmapType's own range rather than FilterType's.
+			bool sRGB                      : 1;
+			SwizzleType swizzleR           : BITS(SWIZZLE_LAST);
+			SwizzleType swizzleG           : BITS(SWIZZLE_LAST);
+			SwizzleType swizzleB           : BITS(SWIZZLE_LAST);
+			SwizzleType swizzleA           : BITS(SWIZZLE_LAST);
+			bool highPrecisionFiltering    : 1;
+			CompareFunc compare            : BITS(COMPARE_LAST);
+
+			#if PERF_PROFILE
+			bool compressedFormat          : 1;
+			#endif
+		};
+
+		Sampler();
+
+		~Sampler();
+
+		State samplerState() const;
+
+		void setTextureLevel(int face, int level, Surface *surface, TextureType type);
+
+		// Setters store the requested value; the private getters below decide what is actually used.
+		void setTextureFilter(FilterType textureFilter);   // Limited to the ceiling set by setFilterQuality().
+		void setMipmapFilter(MipmapType mipmapFilter);     // Limited to the ceiling set by setMipmapQuality().
+		void setGatherEnable(bool enable);
+		void setAddressingModeU(AddressingMode addressingMode);
+		void setAddressingModeV(AddressingMode addressingMode);
+		void setAddressingModeW(AddressingMode addressingMode);
+		void setReadSRGB(bool sRGB);
+		void setBorderColor(const Color<float> &borderColor);
+		void setMaxAnisotropy(float maxAnisotropy);
+		void setHighPrecisionFiltering(bool highPrecisionFiltering);
+		void setSwizzleR(SwizzleType swizzleR);
+		void setSwizzleG(SwizzleType swizzleG);
+		void setSwizzleB(SwizzleType swizzleB);
+		void setSwizzleA(SwizzleType swizzleA);
+		void setCompareFunc(CompareFunc compare);
+		void setBaseLevel(int baseLevel);
+		void setMaxLevel(int maxLevel);
+		void setMinLod(float minLod);   // Clamped to [0, MAX_TEXTURE_LOD].
+		void setMaxLod(float maxLod);   // Clamped to [0, MAX_TEXTURE_LOD].
+		void setSyncRequired(bool isSyncRequired);
+
+		static void setFilterQuality(FilterType maximumFilterQuality);   // Shared by all samplers.
+		static void setMipmapQuality(MipmapType maximumFilterQuality);   // Shared by all samplers.
+		void setMipmapLOD(float LOD);
+
+		bool hasTexture() const;           // Texture type is not TEXTURE_NULL.
+		bool hasUnsignedTexture() const;   // All four components of the internal format are unsigned.
+		bool hasCubeTexture() const;
+		bool hasVolumeTexture() const;     // 3D or 2D array texture.
+		bool requiresSync() const;
+
+		const Texture &getTextureData();
+
+	private:
+		MipmapType mipmapFilter() const;            // MIPMAP_NONE when no level differs from level 0.
+		TextureType getTextureType() const;
+		FilterType getTextureFilter() const;        // Applies the 1x1, gather and anisotropy rules.
+		AddressingMode getAddressingModeU() const;  // Overridden for cube textures.
+		AddressingMode getAddressingModeV() const;  // Overridden for cube textures.
+		AddressingMode getAddressingModeW() const;  // ADDRESSING_LAYER for 2D, rectangle, cube and 2D array textures.
+		CompareFunc getCompareFunc() const;         // Overridden for gather filtering and FORMAT_D32FS8_SHADOW.
+
+		Format externalTextureFormat;
+		Format internalTextureFormat;
+		TextureType textureType;
+
+		FilterType textureFilter;
+		AddressingMode addressingModeU;
+		AddressingMode addressingModeV;
+		AddressingMode addressingModeW;
+		MipmapType mipmapFilterState;
+		bool sRGB;
+		bool gather;
+		bool highPrecisionFiltering;
+		bool syncRequired;
+		int border;   // Non-zero selects seamless addressing for cube textures.
+
+		SwizzleType swizzleR;
+		SwizzleType swizzleG;
+		SwizzleType swizzleB;
+		SwizzleType swizzleA;
+		CompareFunc compare;
+
+		Texture texture;
+		float exp2LOD;   // 2^LOD, refreshed by setMipmapLOD().
+
+		static FilterType maximumTextureFilterQuality;
+		static MipmapType maximumMipmapFilterQuality;
+	};
+}
+
+#endif   // sw_Sampler_hpp
diff --git a/src/Device/SetupProcessor.cpp b/src/Device/SetupProcessor.cpp
new file mode 100644
index 0000000..c5c2a16
--- /dev/null
+++ b/src/Device/SetupProcessor.cpp
@@ -0,0 +1,248 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SetupProcessor.hpp"
+
+#include "Primitive.hpp"
+#include "Polygon.hpp"
+#include "Context.hpp"
+#include "Renderer.hpp"
+#include "Shader/SetupRoutine.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	extern bool complementaryDepthBuffer;    // Defined in another translation unit.
+	extern bool fullPixelPositionRegister;   // Defined in another translation unit; read by SetupProcessor::update().
+
+	bool precacheSetup = false;   // When true, setRoutineCacheSize() passes the name "sw-setup" to the routine cache.
+
+	unsigned int SetupProcessor::States::computeHash()   // XOR-folds the raw 32-bit words of this States object into one value.
+	{
+		const unsigned int *cursor = (const unsigned int*)this;
+		const unsigned int *const end = cursor + sizeof(States) / 4;   // Whole words only; any trailing partial word is left out.
+		unsigned int result = 0;
+
+		while(cursor != end)
+		{
+			result ^= *cursor++;
+		}
+		return result;
+	}
+
+	SetupProcessor::State::State(int i)   // Zero-fills the whole object; the parameter is not used.
+	{
+		memset(this, 0, sizeof(State));   // Also clears bits not covered by any field, which computeHash() and the memcmp() in operator== read.
+	}
+
+	bool SetupProcessor::State::operator==(const State &state) const   // Equality: matching hash and byte-identical States base.
+	{
+		const States *lhs = this;     // Base-class views, so the trailing hash member
+		const States *rhs = &state;   // is not part of the byte comparison.
+
+		// A hash mismatch rules out equality without comparing the full state.
+		const bool sameHash = (hash == state.hash);
+		return sameHash && memcmp(lhs, rhs, sizeof(States)) == 0;
+	}
+
+	SetupProcessor::SetupProcessor(Context *context) : context(context), routineCache(0)   // Binds the context and creates a 1024-entry routine cache.
+	{
+		// routineCache has to be null at this point: setRoutineCacheSize() deletes the previous cache first.
+		setRoutineCacheSize(1024);
+	}
+
+	SetupProcessor::~SetupProcessor()   // Releases the routine cache.
+	{
+		delete routineCache;
+		routineCache = 0;   // Leave no dangling pointer behind.
+	}
+
+	SetupProcessor::State SetupProcessor::update() const   // Builds the setup State (including its hash) from the current context.
+	{
+		State state;   // Starts zero-filled (see State::State).
+
+		bool vPosZW = (context->pixelShader && context->pixelShader->isVPosDeclared() && fullPixelPositionRegister);   // Pixel shader declares vPos and the full position register is enabled.
+
+		state.isDrawPoint = context->isDrawPoint(true);
+		state.isDrawLine = context->isDrawLine(true);
+		state.isDrawTriangle = context->isDrawTriangle(false);
+		state.isDrawSolidTriangle = context->isDrawTriangle(true);
+		state.interpolateZ = context->depthBufferActive() || context->pixelFogActive() != FOG_NONE || vPosZW;
+		state.interpolateW = context->perspectiveActive() || vPosZW;
+		state.perspective = context->perspectiveActive();
+		state.pointSprite = context->pointSpriteActive();
+		state.cullMode = context->cullMode;
+		state.twoSidedStencil = context->stencilActive() && context->twoSidedStencil;
+		state.slopeDepthBias = context->slopeDepthBias != 0.0f;   // Only whether a bias is set is recorded, not its value.
+		state.vFace = context->pixelShader && context->pixelShader->isVFaceDeclared();
+
+		state.positionRegister = Pos;        // Defaults; replaced below when a vertex shader is present.
+		state.pointSizeRegister = Unused;
+
+		state.multiSample = context->getMultiSampleCount();
+		state.rasterizerDiscard = context->rasterizerDiscard;
+
+		if(context->vertexShader)
+		{
+			state.positionRegister = context->vertexShader->getPositionRegister();
+			state.pointSizeRegister = context->vertexShader->getPointSizeRegister();
+		}
+		else if(context->pointSizeActive())
+		{
+			state.pointSizeRegister = Pts;
+		}
+
+		for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)   // Start with every gradient unused.
+		{
+			for(int component = 0; component < 4; component++)
+			{
+				state.gradient[interpolant][component].attribute = Unused;
+				state.gradient[interpolant][component].flat = false;
+				state.gradient[interpolant][component].wrap = false;
+			}
+		}
+
+		state.fog.attribute = Unused;
+		state.fog.flat = false;
+		state.fog.wrap = false;
+
+		const bool point = context->isDrawPoint(true);
+		const bool sprite = context->pointSpriteActive();
+		const bool flatShading = (context->shadingMode == SHADING_FLAT) || point;   // Points are always treated as flat shaded.
+
+		if(context->vertexShader && context->pixelShader)   // Both shaders present: match pixel inputs to vertex outputs by semantic.
+		{
+			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					int project = context->isProjectionComponent(interpolant - 2, component) ? 1 : 0;   // NOTE(review): 'interpolant - 2' presumably maps to texture coordinates (States::texture follows color[2] in the union) - confirm.
+					const Shader::Semantic& semantic = context->pixelShader->getInput(interpolant, component - project);
+
+					if(semantic.active())
+					{
+						int input = interpolant;   // Fallback when no vertex output carries this semantic.
+						for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+						{
+							if(semantic == context->vertexShader->getOutput(i, component - project))
+							{
+								input = i;
+								break;
+							}
+						}
+
+						bool flat = point;   // Used for usages not handled by the switch below.
+
+						switch(semantic.usage)
+						{
+						case Shader::USAGE_TEXCOORD: flat = point && !sprite;             break;
+						case Shader::USAGE_COLOR:    flat = semantic.flat || flatShading; break;
+						}
+
+						state.gradient[interpolant][component].attribute = input;
+						state.gradient[interpolant][component].flat = flat;
+					}
+				}
+			}
+		}
+		else if(context->preTransformed && context->pixelShader)   // Pre-transformed vertices: inputs come from the T0/C0 based attribute slots.
+		{
+			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					const Shader::Semantic& semantic = context->pixelShader->getInput(interpolant, component);
+
+					switch(semantic.usage)
+					{
+					case 0xFF:   // NOTE(review): 0xFF presumably marks an unused semantic - confirm.
+						break;
+					case Shader::USAGE_TEXCOORD:
+						state.gradient[interpolant][component].attribute = T0 + semantic.index;
+						state.gradient[interpolant][component].flat = semantic.flat || (point && !sprite);
+						break;
+					case Shader::USAGE_COLOR:
+						state.gradient[interpolant][component].attribute = C0 + semantic.index;
+						state.gradient[interpolant][component].flat = semantic.flat || flatShading;
+						break;
+					default:
+						ASSERT(false);   // Only texture coordinate and color inputs are expected here.
+					}
+				}
+			}
+		}
+		else if(context->pixelShaderModel() < 0x0300)   // Pixel shader model below 3.0: use the named texture/color view of the union.
+		{
+			for(int coordinate = 0; coordinate < 8; coordinate++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					if(context->textureActive(coordinate, component))
+					{
+						state.texture[coordinate][component].attribute = T0 + coordinate;
+						state.texture[coordinate][component].flat = point && !sprite;
+						state.texture[coordinate][component].wrap = (context->textureWrap[coordinate] & (1 << component)) != 0;   // One wrap bit per component.
+					}
+				}
+			}
+
+			for(int color = 0; color < 2; color++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					if(context->colorActive(color, component))
+					{
+						state.color[color][component].attribute = C0 + color;
+						state.color[color][component].flat = flatShading;
+					}
+				}
+			}
+		}
+		else ASSERT(false);   // No other shader configuration is expected.
+
+		if(context->fogActive())
+		{
+			state.fog.attribute = Fog;
+			state.fog.flat = point;
+		}
+
+		state.hash = state.computeHash();   // Computed last, once every field has its final value.
+
+		return state;
+	}
+
+	Routine *SetupProcessor::routine(const State &state)   // Returns the setup routine for 'state', generating and caching it on a cache miss.
+	{
+		Routine *cached = routineCache->query(state);
+
+		if(cached)
+		{
+			return cached;
+		}
+
+		SetupRoutine *generator = new SetupRoutine(state);
+		generator->generate();
+		Routine *generated = generator->getRoutine();
+		delete generator;   // The routine is fetched before the generator is destroyed.
+		routineCache->add(state, generated);
+		return generated;
+	}
+
+	void SetupProcessor::setRoutineCacheSize(int cacheSize)   // Replaces the routine cache with a new one of the given size, clamped to [1, 65536].
+	{
+		delete routineCache;   // The previous cache object is destroyed; safe when routineCache is null.
+		routineCache = new RoutineCache<State>(clamp(cacheSize, 1, 65536), precacheSetup ? "sw-setup" : 0);   // The name is only passed when precacheSetup is set.
+	}
+}
diff --git a/src/Device/SetupProcessor.hpp b/src/Device/SetupProcessor.hpp
new file mode 100644
index 0000000..be0adc7
--- /dev/null
+++ b/src/Device/SetupProcessor.hpp
@@ -0,0 +1,105 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SetupProcessor_hpp
+#define sw_SetupProcessor_hpp
+
+#include "Context.hpp"
+#include "RoutineCache.hpp"
+#include "Shader/VertexShader.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	struct Primitive;
+	struct Triangle;
+	struct Polygon;
+	struct Vertex;
+	struct DrawCall;
+	struct DrawData;
+
+	class SetupProcessor   // Derives the setup state from a Context and caches one generated routine per distinct state.
+	{
+	public:
+		struct States   // The hashed and compared part of the state; treated as raw bytes by computeHash() and operator==.
+		{
+			unsigned int computeHash();   // XOR of the object's 32-bit words.
+
+			bool isDrawPoint               : 1;
+			bool isDrawLine                : 1;
+			bool isDrawTriangle            : 1;
+			bool isDrawSolidTriangle       : 1;
+			bool interpolateZ              : 1;
+			bool interpolateW              : 1;
+			bool perspective               : 1;
+			bool pointSprite               : 1;
+			unsigned int positionRegister  : BITS(VERTEX_OUTPUT_LAST);
+			unsigned int pointSizeRegister : BITS(VERTEX_OUTPUT_LAST);
+			CullMode cullMode              : BITS(CULL_LAST);
+			bool twoSidedStencil           : 1;
+			bool slopeDepthBias            : 1;
+			bool vFace                     : 1;
+			unsigned int multiSample       : 3;   // 1, 2 or 4
+			bool rasterizerDiscard         : 1;
+
+			struct Gradient   // How one component of one interpolant is set up.
+			{
+				unsigned char attribute : BITS(VERTEX_OUTPUT_LAST);   // Source attribute index, or Unused.
+				bool flat               : 1;
+				bool wrap               : 1;
+			};
+
+			union   // Two views of the same storage: named color/texture/fog slots, or a flat per-interpolant array.
+			{
+				struct
+				{
+					Gradient color[2][4];
+					Gradient texture[8][4];
+					Gradient fog;
+				};
+
+				Gradient gradient[MAX_FRAGMENT_INPUTS][4];   // update() writes through this view when a pixel shader is present.
+			};
+		};
+
+		struct State : States   // States plus its cached hash.
+		{
+			State(int i = 0);   // Zero-fills the object; the argument is not used.
+
+			bool operator==(const State &states) const;   // Hash comparison first, then a byte comparison of the States part.
+
+			unsigned int hash;   // Set by update() from computeHash().
+		};
+
+		typedef bool (*RoutinePointer)(Primitive *primitive, const Triangle *triangle, const Polygon *polygon, const DrawData *draw);
+
+		SetupProcessor(Context *context);
+
+		~SetupProcessor();
+
+	protected:
+		State update() const;                   // Builds the State for the current context.
+		Routine *routine(const State &state);   // Looks up, or generates and caches, the routine for a State.
+
+		void setRoutineCacheSize(int cacheSize);   // Recreates the cache; the size is clamped to [1, 65536].
+
+	private:
+		Context *const context;
+
+		RoutineCache<State> *routineCache;   // Allocated in setRoutineCacheSize(), deleted in the destructor.
+	};
+}
+
+#endif   // sw_SetupProcessor_hpp
diff --git a/src/Device/Stream.hpp b/src/Device/Stream.hpp
new file mode 100644
index 0000000..969d8b2
--- /dev/null
+++ b/src/Device/Stream.hpp
@@ -0,0 +1,105 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Stream_hpp
+#define sw_Stream_hpp
+
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	class Resource;
+
+	enum StreamType ENUM_UNDERLYING_TYPE_UNSIGNED_INT   // Element encodings a Stream can be defined with.
+	{
+		STREAMTYPE_COLOR,     // 4 normalized unsigned bytes, ZYXW order
+		STREAMTYPE_UDEC3,     // 3 unsigned 10-bit fields
+		STREAMTYPE_DEC3N,     // 3 normalized signed 10-bit fields
+		STREAMTYPE_INDICES,   // 4 unsigned bytes, stored unconverted into X component
+		STREAMTYPE_FLOAT,     // Normalization ignored; also the type assigned by Stream::defaults()
+		STREAMTYPE_BYTE,
+		STREAMTYPE_SBYTE,
+		STREAMTYPE_SHORT,
+		STREAMTYPE_USHORT,
+		STREAMTYPE_INT,
+		STREAMTYPE_UINT,
+		STREAMTYPE_FIXED,     // Normalization ignored (16.16 format)
+		STREAMTYPE_HALF,      // Normalization ignored
+		STREAMTYPE_2_10_10_10_INT,
+		STREAMTYPE_2_10_10_10_UINT,
+
+		STREAMTYPE_LAST = STREAMTYPE_2_10_10_10_UINT   // Highest value.
+	};
+
+	struct StreamResource   // Where a stream's data lives; base of Stream.
+	{
+		Resource *resource;
+		const void *buffer;    // Start of the stream data.
+		unsigned int stride;   // NOTE(review): presumably the byte distance between consecutive elements - confirm.
+	};
+
+	struct Stream : public StreamResource   // A data stream: where the data lives (StreamResource) plus how its elements are encoded.
+	{
+		Stream(Resource *resource = 0, const void *buffer = 0, unsigned int stride = 0) : type(STREAMTYPE_FLOAT), count(0), normalized(false)
+		{   // The format members start with the values defaults() assigns, so operator bool() never reads an indeterminate count.
+			this->resource = resource;
+			this->buffer = buffer;
+			this->stride = stride;
+		}
+
+		Stream &define(StreamType type, unsigned int count, bool normalized = false)   // Sets the element format; returns *this for chaining.
+		{
+			this->type = type;
+			this->count = count;   // Stored in an unsigned char.
+			this->normalized = normalized;
+
+			return *this;
+		}
+
+		Stream &define(const void *buffer, StreamType type, unsigned int count, bool normalized = false)   // As above, and also replaces the buffer pointer.
+		{
+			this->buffer = buffer;
+			this->type = type;
+			this->count = count;   // Stored in an unsigned char.
+			this->normalized = normalized;
+
+			return *this;
+		}
+
+		Stream &defaults()   // Resets to an empty stream that still points at readable data.
+		{
+			static const float4 null = {0, 0, 0, 1};   // Shared constant the empty stream's buffer points to.
+
+			resource = 0;
+			buffer = &null;
+			stride = 0;   // Zero stride: every element reads the same constant.
+			type = STREAMTYPE_FLOAT;
+			count = 0;
+			normalized = false;
+
+			return *this;
+		}
+
+		operator bool() const   // Returns true if stream contains data
+		{
+			return count != 0;
+		}
+
+		StreamType type;
+		unsigned char count;   // Zero means the stream holds no data (see operator bool).
+		bool normalized;
+	};
+}
+
+#endif   // sw_Stream_hpp
diff --git a/src/Device/Surface.cpp b/src/Device/Surface.cpp
new file mode 100644
index 0000000..e06f2bd
--- /dev/null
+++ b/src/Device/Surface.cpp
@@ -0,0 +1,6217 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Surface.hpp"
+
+#include "Color.hpp"
+#include "Context.hpp"
+#include "ETC_Decoder.hpp"
+#include "Renderer.hpp"
+#include "Common/Half.hpp"
+#include "Common/Memory.hpp"
+#include "Common/CPUID.hpp"
+#include "Common/Resource.hpp"
+#include "Common/Debug.hpp"
+#include "Reactor/Reactor.hpp"
+
+#if defined(__i386__) || defined(__x86_64__)
+	#include <xmmintrin.h>
+	#include <emmintrin.h>
+#endif
+
+#undef min
+#undef max
+
+namespace sw
+{
+	// Settings defined in another translation unit.
+	extern bool quadLayoutEnabled;
+	extern bool complementaryDepthBuffer;
+	extern TranscendentalPrecision logPrecision;
+
+	// Palette shared by all surfaces; Buffer::read() indexes it to decode
+	// FORMAT_P8 and FORMAT_A8P8 texels. Null until one is installed.
+	unsigned int *Surface::palette = 0;
+	// Presumably identifies the currently installed palette - TODO confirm against its users.
+	unsigned int Surface::paletteID = 0;
+
+	// Stores 'color' at texel (x, y) of slice z, writing it to every sample of
+	// that texel. x and y may reach into the border region.
+	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
+	{
+		ASSERT((x >= -border) && (x < (width + border)));
+		ASSERT((y >= -border) && (y < (height + border)));
+		ASSERT((z >= 0) && (z < depth));
+
+		// Locate the texel's first sample: column, then row, then slice.
+		byte *texel = (byte*)buffer;
+		texel += (x + border) * bytes;
+		texel += (y + border) * pitchB;
+		texel += z * samples * sliceB;
+
+		// Consecutive samples of the same texel are sliceB bytes apart.
+		for(int sampleIndex = 0; sampleIndex < samples; ++sampleIndex)
+		{
+			write(texel + sampleIndex * sliceB, color);
+		}
+	}
+
+	// Stores 'color' at texel (x, y) of the first slice, writing it to every
+	// sample of that texel. x and y may reach into the border region.
+	void Surface::Buffer::write(int x, int y, const Color<float> &color)
+	{
+		ASSERT((x >= -border) && (x < (width + border)));
+		ASSERT((y >= -border) && (y < (height + border)));
+
+		// Locate the texel's first sample: column, then row.
+		byte *texel = (byte*)buffer;
+		texel += (x + border) * bytes;
+		texel += (y + border) * pitchB;
+
+		// Consecutive samples of the same texel are sliceB bytes apart.
+		for(int sampleIndex = 0; sampleIndex < samples; ++sampleIndex)
+		{
+			write(texel + sampleIndex * sliceB, color);
+		}
+	}
+
+	// Encodes 'color' into the single texel at 'element' according to this
+	// buffer's format.
+	//
+	// For sRGB formats the red, green and blue channels are converted from
+	// linear to sRGB before being packed; alpha is stored unmodified. Formats
+	// without a case below trigger ASSERT(false) and leave the texel untouched.
+	inline void Surface::Buffer::write(void *element, const Color<float> &color)
+	{
+		float r = color.r;
+		float g = color.g;
+		float b = color.b;
+		float a = color.a;
+
+		if(isSRGBformat(format))
+		{
+			r = linearToSRGB(r);
+			g = linearToSRGB(g);
+			b = linearToSRGB(b);
+		}
+
+		switch(format)
+		{
+		case FORMAT_A8:
+			*(unsigned char*)element = unorm<8>(a);
+			break;
+		case FORMAT_R8_SNORM:
+			*(char*)element = snorm<8>(r);
+			break;
+		case FORMAT_R8:
+			*(unsigned char*)element = unorm<8>(r);
+			break;
+		case FORMAT_R8I:
+			*(char*)element = scast<8>(r);
+			break;
+		case FORMAT_R8UI:
+			*(unsigned char*)element = ucast<8>(r);
+			break;
+		case FORMAT_R16I:
+			*(short*)element = scast<16>(r);
+			break;
+		case FORMAT_R16UI:
+			*(unsigned short*)element = ucast<16>(r);
+			break;
+		case FORMAT_R32I:
+			*(int*)element = static_cast<int>(r);
+			break;
+		case FORMAT_R32UI:
+			*(unsigned int*)element = static_cast<unsigned int>(r);
+			break;
+		case FORMAT_R3G3B2:
+			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
+			break;
+		case FORMAT_A8R3G3B2:
+			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
+			break;
+		case FORMAT_X4R4G4B4:
+			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
+			break;
+		case FORMAT_A4R4G4B4:
+			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
+			break;
+		case FORMAT_R4G4B4A4:
+			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
+			break;
+		case FORMAT_R5G6B5:
+			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
+			break;
+		case FORMAT_A1R5G5B5:
+			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
+			break;
+		case FORMAT_R5G5B5A1:
+			// Alpha occupies only bit 0 (read() decodes it with mask 0x0001), so it
+			// must be quantized to 1 bit; a 5-bit value would spill into blue.
+			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<1>(a) << 0);
+			break;
+		case FORMAT_X1R5G5B5:
+			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
+			break;
+		case FORMAT_A8R8G8B8:
+			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
+			break;
+		case FORMAT_X8R8G8B8:
+			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
+			break;
+		case FORMAT_A8B8G8R8_SNORM:
+			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
+			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
+			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
+			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_A8:
+			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
+			break;
+		case FORMAT_A8B8G8R8I:
+			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
+			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
+			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
+			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
+			break;
+		case FORMAT_A8B8G8R8UI:
+			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
+			break;
+		case FORMAT_X8B8G8R8_SNORM:
+			*(unsigned int*)element = 0x7F000000 |
+			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
+			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
+			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_X8:
+			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
+			break;
+		case FORMAT_X8B8G8R8I:
+			*(unsigned int*)element = 0x7F000000 |
+			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
+			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
+			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
+			// Must not fall through: the unsigned case below would overwrite the
+			// signed encoding that was just stored.
+			break;
+		case FORMAT_X8B8G8R8UI:
+			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
+			break;
+		case FORMAT_A2R10G10B10:
+			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
+			break;
+		case FORMAT_A2B10G10R10:
+		case FORMAT_A2B10G10R10UI:
+			// NOTE(review): read() decodes FORMAT_A2B10G10R10UI as raw integers, whereas
+			// this stores normalized values for it - confirm which one is intended.
+			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
+			break;
+		case FORMAT_G8R8_SNORM:
+			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
+			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
+			break;
+		case FORMAT_G8R8:
+			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
+			break;
+		case FORMAT_G8R8I:
+			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
+			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
+			break;
+		case FORMAT_G8R8UI:
+			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
+			break;
+		case FORMAT_G16R16:
+			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
+			break;
+		case FORMAT_G16R16I:
+			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
+			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
+			break;
+		case FORMAT_G16R16UI:
+			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
+			break;
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
+			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
+			break;
+		case FORMAT_A16B16G16R16:
+			((unsigned short*)element)[0] = unorm<16>(r);
+			((unsigned short*)element)[1] = unorm<16>(g);
+			((unsigned short*)element)[2] = unorm<16>(b);
+			((unsigned short*)element)[3] = unorm<16>(a);
+			break;
+		case FORMAT_A16B16G16R16I:
+			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
+			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
+			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
+			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
+			break;
+		case FORMAT_A16B16G16R16UI:
+			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
+			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
+			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
+			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
+			break;
+		case FORMAT_X16B16G16R16I:
+			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
+			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
+			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
+			break;
+		case FORMAT_X16B16G16R16UI:
+			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
+			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
+			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
+			break;
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
+			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
+			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
+			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
+			break;
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
+			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
+			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
+			break;
+		case FORMAT_V8U8:
+			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
+			break;
+		case FORMAT_L6V5U5:
+			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
+			break;
+		case FORMAT_Q8W8V8U8:
+			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
+			break;
+		case FORMAT_X8L8V8U8:
+			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
+			break;
+		case FORMAT_V16U16:
+			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
+			break;
+		case FORMAT_A2W10V10U10:
+			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
+			break;
+		case FORMAT_A16W16V16U16:
+			((unsigned short*)element)[0] = snorm<16>(r);
+			((unsigned short*)element)[1] = snorm<16>(g);
+			((unsigned short*)element)[2] = snorm<16>(b);
+			((unsigned short*)element)[3] = unorm<16>(a);
+			break;
+		case FORMAT_Q16W16V16U16:
+			((unsigned short*)element)[0] = snorm<16>(r);
+			((unsigned short*)element)[1] = snorm<16>(g);
+			((unsigned short*)element)[2] = snorm<16>(b);
+			((unsigned short*)element)[3] = snorm<16>(a);
+			break;
+		case FORMAT_R8G8B8:
+			((unsigned char*)element)[0] = unorm<8>(b);
+			((unsigned char*)element)[1] = unorm<8>(g);
+			((unsigned char*)element)[2] = unorm<8>(r);
+			break;
+		case FORMAT_B8G8R8:
+			((unsigned char*)element)[0] = unorm<8>(r);
+			((unsigned char*)element)[1] = unorm<8>(g);
+			((unsigned char*)element)[2] = unorm<8>(b);
+			break;
+		case FORMAT_R16F:
+			*(half*)element = (half)r;
+			break;
+		case FORMAT_A16F:
+			*(half*)element = (half)a;
+			break;
+		case FORMAT_G16R16F:
+			((half*)element)[0] = (half)r;
+			((half*)element)[1] = (half)g;
+			break;
+		case FORMAT_X16B16G16R16F_UNSIGNED:
+			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
+			// Fall through to FORMAT_X16B16G16R16F.
+		case FORMAT_X16B16G16R16F:
+			((half*)element)[3] = 1.0f;
+			// Fall through to FORMAT_B16G16R16F.
+		case FORMAT_B16G16R16F:
+			((half*)element)[0] = (half)r;
+			((half*)element)[1] = (half)g;
+			((half*)element)[2] = (half)b;
+			break;
+		case FORMAT_A16B16G16R16F:
+			((half*)element)[0] = (half)r;
+			((half*)element)[1] = (half)g;
+			((half*)element)[2] = (half)b;
+			((half*)element)[3] = (half)a;
+			break;
+		case FORMAT_A32F:
+			*(float*)element = a;
+			break;
+		case FORMAT_R32F:
+			*(float*)element = r;
+			break;
+		case FORMAT_G32R32F:
+			((float*)element)[0] = r;
+			((float*)element)[1] = g;
+			break;
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
+			// Fall through to FORMAT_X32B32G32R32F.
+		case FORMAT_X32B32G32R32F:
+			((float*)element)[3] = 1.0f;
+			// Fall through to FORMAT_B32G32R32F.
+		case FORMAT_B32G32R32F:
+			((float*)element)[0] = r;
+			((float*)element)[1] = g;
+			((float*)element)[2] = b;
+			break;
+		case FORMAT_A32B32G32R32F:
+			((float*)element)[0] = r;
+			((float*)element)[1] = g;
+			((float*)element)[2] = b;
+			((float*)element)[3] = a;
+			break;
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+			*((float*)element) = r;
+			break;
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+			// Complementary depth formats store 1 - depth.
+			*((float*)element) = 1 - r;
+			break;
+		case FORMAT_S8:
+			*((unsigned char*)element) = unorm<8>(r);
+			break;
+		case FORMAT_L8:
+			*(unsigned char*)element = unorm<8>(r);
+			break;
+		case FORMAT_A4L4:
+			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
+			break;
+		case FORMAT_L16:
+			*(unsigned short*)element = unorm<16>(r);
+			break;
+		case FORMAT_A8L8:
+			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
+			break;
+		case FORMAT_L16F:
+			*(half*)element = (half)r;
+			break;
+		case FORMAT_A16L16F:
+			((half*)element)[0] = (half)r;
+			((half*)element)[1] = (half)a;
+			break;
+		case FORMAT_L32F:
+			*(float*)element = r;
+			break;
+		case FORMAT_A32L32F:
+			((float*)element)[0] = r;
+			((float*)element)[1] = a;
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Decodes the texel at (x, y) of slice z. Only the texel's first sample is
+	// read. x and y may reach into the border region.
+	Color<float> Surface::Buffer::read(int x, int y, int z) const
+	{
+		ASSERT((x >= -border) && (x < (width + border)));
+		ASSERT((y >= -border) && (y < (height + border)));
+		ASSERT((z >= 0) && (z < depth));
+
+		// Locate the texel: column, then row, then slice.
+		unsigned char *texel = (unsigned char*)buffer;
+		texel += (x + border) * bytes;
+		texel += (y + border) * pitchB;
+		texel += z * samples * sliceB;
+
+		return read(texel);
+	}
+
+	// Decodes the texel at (x, y) of the first slice. x and y may reach into
+	// the border region.
+	Color<float> Surface::Buffer::read(int x, int y) const
+	{
+		ASSERT((x >= -border) && (x < (width + border)));
+		ASSERT((y >= -border) && (y < (height + border)));
+
+		// Locate the texel: column, then row.
+		unsigned char *texel = (unsigned char*)buffer;
+		texel += (x + border) * bytes;
+		texel += (y + border) * pitchB;
+
+		return read(texel);
+	}
+
+	// Decodes the single texel at 'element' according to this buffer's format
+	// and returns it as a floating-point color.
+	//
+	// Channels the format does not store keep their defaults (r = g = b = 0,
+	// a = 1) unless a case assigns them explicitly. Most packed unsigned
+	// normalized formats are decoded by masking the field in place and dividing
+	// by the mask, which avoids a shift. For sRGB formats the red, green and
+	// blue channels are converted to linear space at the end. Formats without a
+	// case below trigger ASSERT(false).
+	inline Color<float> Surface::Buffer::read(void *element) const
+	{
+		float r = 0.0f;
+		float g = 0.0f;
+		float b = 0.0f;
+		float a = 1.0f;
+
+		switch(format)
+		{
+		case FORMAT_P8:
+			{
+				ASSERT(palette);
+
+				unsigned int abgr = palette[*(unsigned char*)element];
+
+				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
+				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
+			}
+			break;
+		case FORMAT_A8P8:
+			{
+				ASSERT(palette);
+
+				unsigned int bgr = palette[((unsigned char*)element)[0]];
+
+				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
+				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			}
+			break;
+		case FORMAT_A8:
+			r = 0;
+			g = 0;
+			b = 0;
+			a = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		case FORMAT_R8_SNORM:
+			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
+			break;
+		case FORMAT_R8:
+			r = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		case FORMAT_R8I:
+			r = *(signed char*)element;
+			break;
+		case FORMAT_R8UI:
+			r = *(unsigned char*)element;
+			break;
+		case FORMAT_R3G3B2:
+			{
+				unsigned char rgb = *(unsigned char*)element;
+
+				r = (rgb & 0xE0) * (1.0f / 0xE0);
+				g = (rgb & 0x1C) * (1.0f / 0x1C);
+				b = (rgb & 0x03) * (1.0f / 0x03);
+			}
+			break;
+		case FORMAT_A8R3G3B2:
+			{
+				unsigned short argb = *(unsigned short*)element;
+
+				a = (argb & 0xFF00) * (1.0f / 0xFF00);
+				r = (argb & 0x00E0) * (1.0f / 0x00E0);
+				g = (argb & 0x001C) * (1.0f / 0x001C);
+				b = (argb & 0x0003) * (1.0f / 0x0003);
+			}
+			break;
+		case FORMAT_X4R4G4B4:
+			{
+				unsigned short rgb = *(unsigned short*)element;
+
+				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
+				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
+				b = (rgb & 0x000F) * (1.0f / 0x000F);
+			}
+			break;
+		case FORMAT_A4R4G4B4:
+			{
+				unsigned short argb = *(unsigned short*)element;
+
+				a = (argb & 0xF000) * (1.0f / 0xF000);
+				r = (argb & 0x0F00) * (1.0f / 0x0F00);
+				g = (argb & 0x00F0) * (1.0f / 0x00F0);
+				b = (argb & 0x000F) * (1.0f / 0x000F);
+			}
+			break;
+		case FORMAT_R4G4B4A4:
+			{
+				unsigned short rgba = *(unsigned short*)element;
+
+				r = (rgba & 0xF000) * (1.0f / 0xF000);
+				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
+				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
+				a = (rgba & 0x000F) * (1.0f / 0x000F);
+			}
+			break;
+		case FORMAT_R5G6B5:
+			{
+				unsigned short rgb = *(unsigned short*)element;
+
+				r = (rgb & 0xF800) * (1.0f / 0xF800);
+				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
+				b = (rgb & 0x001F) * (1.0f / 0x001F);
+			}
+			break;
+		case FORMAT_A1R5G5B5:
+			{
+				unsigned short argb = *(unsigned short*)element;
+
+				a = (argb & 0x8000) * (1.0f / 0x8000);
+				r = (argb & 0x7C00) * (1.0f / 0x7C00);
+				g = (argb & 0x03E0) * (1.0f / 0x03E0);
+				b = (argb & 0x001F) * (1.0f / 0x001F);
+			}
+			break;
+		case FORMAT_R5G5B5A1:
+			{
+				unsigned short rgba = *(unsigned short*)element;
+
+				r = (rgba & 0xF800) * (1.0f / 0xF800);
+				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
+				b = (rgba & 0x003E) * (1.0f / 0x003E);
+				a = (rgba & 0x0001) * (1.0f / 0x0001);
+			}
+			break;
+		case FORMAT_X1R5G5B5:
+			{
+				unsigned short xrgb = *(unsigned short*)element;
+
+				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
+				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
+				b = (xrgb & 0x001F) * (1.0f / 0x001F);
+			}
+			break;
+		case FORMAT_A8R8G8B8:
+			{
+				unsigned int argb = *(unsigned int*)element;
+
+				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
+				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_X8R8G8B8:
+			{
+				unsigned int xrgb = *(unsigned int*)element;
+
+				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
+				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_A8B8G8R8_SNORM:
+			{
+				signed char* abgr = (signed char*)element;
+
+				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
+				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
+				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
+				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
+			}
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_A8:
+			{
+				unsigned int abgr = *(unsigned int*)element;
+
+				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
+				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_A8B8G8R8I:
+			{
+				signed char* abgr = (signed char*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_A8B8G8R8UI:
+			{
+				unsigned char* abgr = (unsigned char*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_X8B8G8R8_SNORM:
+			{
+				signed char* bgr = (signed char*)element;
+
+				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
+				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
+				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
+			}
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_X8:
+			{
+				unsigned int xbgr = *(unsigned int*)element;
+
+				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
+				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
+				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
+			}
+			break;
+		case FORMAT_X8B8G8R8I:
+			{
+				signed char* bgr = (signed char*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_X8B8G8R8UI:
+			{
+				unsigned char* bgr = (unsigned char*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_G8R8_SNORM:
+			{
+				signed char* gr = (signed char*)element;
+
+				// Each component is a signed byte; decode it the same way as
+				// FORMAT_R8_SNORM, clamping -128 to -1.0. Masking the promoted
+				// value (as done for packed unsigned formats) would only keep
+				// sign-extension bits.
+				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
+				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
+			}
+			break;
+		case FORMAT_G8R8:
+			{
+				unsigned short gr = *(unsigned short*)element;
+
+				g = (gr & 0xFF00) * (1.0f / 0xFF00);
+				r = (gr & 0x00FF) * (1.0f / 0x00FF);
+			}
+			break;
+		case FORMAT_G8R8I:
+			{
+				signed char* gr = (signed char*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_G8R8UI:
+			{
+				unsigned char* gr = (unsigned char*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_R16I:
+			r = *((short*)element);
+			break;
+		case FORMAT_R16UI:
+			r = *((unsigned short*)element);
+			break;
+		case FORMAT_G16R16I:
+			{
+				short* gr = (short*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_G16R16:
+			{
+				unsigned int gr = *(unsigned int*)element;
+
+				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
+				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
+			}
+			break;
+		case FORMAT_G16R16UI:
+			{
+				unsigned short* gr = (unsigned short*)element;
+
+				r = gr[0];
+				g = gr[1];
+			}
+			break;
+		case FORMAT_A2R10G10B10:
+			{
+				unsigned int argb = *(unsigned int*)element;
+
+				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
+				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
+				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
+				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
+			}
+			break;
+		case FORMAT_A2B10G10R10:
+			{
+				unsigned int abgr = *(unsigned int*)element;
+
+				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
+				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
+				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
+				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
+			}
+			break;
+		case FORMAT_A2B10G10R10UI:
+			{
+				unsigned int abgr = *(unsigned int*)element;
+
+				a = static_cast<float>((abgr & 0xC0000000) >> 30);
+				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
+				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
+				r = static_cast<float>(abgr & 0x000003FF);
+			}
+			break;
+		case FORMAT_A16B16G16R16I:
+			{
+				short* abgr = (short*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_A16B16G16R16:
+			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
+			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
+			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
+			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
+			break;
+		case FORMAT_A16B16G16R16UI:
+			{
+				unsigned short* abgr = (unsigned short*)element;
+
+				r = abgr[0];
+				g = abgr[1];
+				b = abgr[2];
+				a = abgr[3];
+			}
+			break;
+		case FORMAT_X16B16G16R16I:
+			{
+				short* bgr = (short*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_X16B16G16R16UI:
+			{
+				unsigned short* bgr = (unsigned short*)element;
+
+				r = bgr[0];
+				g = bgr[1];
+				b = bgr[2];
+			}
+			break;
+		case FORMAT_A32B32G32R32I:
+			{
+				int* abgr = (int*)element;
+
+				r = static_cast<float>(abgr[0]);
+				g = static_cast<float>(abgr[1]);
+				b = static_cast<float>(abgr[2]);
+				a = static_cast<float>(abgr[3]);
+			}
+			break;
+		case FORMAT_A32B32G32R32UI:
+			{
+				unsigned int* abgr = (unsigned int*)element;
+
+				r = static_cast<float>(abgr[0]);
+				g = static_cast<float>(abgr[1]);
+				b = static_cast<float>(abgr[2]);
+				a = static_cast<float>(abgr[3]);
+			}
+			break;
+		case FORMAT_X32B32G32R32I:
+			{
+				int* bgr = (int*)element;
+
+				r = static_cast<float>(bgr[0]);
+				g = static_cast<float>(bgr[1]);
+				b = static_cast<float>(bgr[2]);
+			}
+			break;
+		case FORMAT_X32B32G32R32UI:
+			{
+				unsigned int* bgr = (unsigned int*)element;
+
+				r = static_cast<float>(bgr[0]);
+				g = static_cast<float>(bgr[1]);
+				b = static_cast<float>(bgr[2]);
+			}
+			break;
+		case FORMAT_G32R32I:
+			{
+				int* gr = (int*)element;
+
+				r = static_cast<float>(gr[0]);
+				g = static_cast<float>(gr[1]);
+			}
+			break;
+		case FORMAT_G32R32UI:
+			{
+				unsigned int* gr = (unsigned int*)element;
+
+				r = static_cast<float>(gr[0]);
+				g = static_cast<float>(gr[1]);
+			}
+			break;
+		case FORMAT_R32I:
+			r = static_cast<float>(*((int*)element));
+			break;
+		case FORMAT_R32UI:
+			r = static_cast<float>(*((unsigned int*)element));
+			break;
+		case FORMAT_V8U8:
+			{
+				unsigned short vu = *(unsigned short*)element;
+
+				// Signed fields are shifted up to the top of a 32-bit int so that
+				// the field's sign bit becomes the int's sign bit.
+				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
+				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
+			}
+			break;
+		case FORMAT_L6V5U5:
+			{
+				unsigned short lvu = *(unsigned short*)element;
+
+				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
+				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
+				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
+			}
+			break;
+		case FORMAT_Q8W8V8U8:
+			{
+				unsigned int qwvu = *(unsigned int*)element;
+
+				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
+				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
+				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
+				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
+			}
+			break;
+		case FORMAT_X8L8V8U8:
+			{
+				unsigned int xlvu = *(unsigned int*)element;
+
+				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
+				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
+				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
+			}
+			break;
+		case FORMAT_R8G8B8:
+			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
+			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
+			break;
+		case FORMAT_B8G8R8:
+			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
+			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
+			break;
+		case FORMAT_V16U16:
+			{
+				unsigned int vu = *(unsigned int*)element;
+
+				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
+				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
+			}
+			break;
+		case FORMAT_A2W10V10U10:
+			{
+				unsigned int awvu = *(unsigned int*)element;
+
+				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
+				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
+				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
+				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
+			}
+			break;
+		case FORMAT_A16W16V16U16:
+			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
+			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
+			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
+			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
+			break;
+		case FORMAT_Q16W16V16U16:
+			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
+			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
+			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
+			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
+			break;
+		case FORMAT_L8:
+			// Luminance is replicated into all three color channels.
+			r =
+			g =
+			b = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		case FORMAT_A4L4:
+			{
+				unsigned char al = *(unsigned char*)element;
+
+				r =
+				g =
+				b = (al & 0x0F) * (1.0f / 0x0F);
+				a = (al & 0xF0) * (1.0f / 0xF0);
+			}
+			break;
+		case FORMAT_L16:
+			r =
+			g =
+			b = *(unsigned short*)element * (1.0f / 0xFFFF);
+			break;
+		case FORMAT_A8L8:
+			r =
+			g =
+			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
+			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
+			break;
+		case FORMAT_L16F:
+			r =
+			g =
+			b = *(half*)element;
+			break;
+		case FORMAT_A16L16F:
+			r =
+			g =
+			b = ((half*)element)[0];
+			a = ((half*)element)[1];
+			break;
+		case FORMAT_L32F:
+			r =
+			g =
+			b = *(float*)element;
+			break;
+		case FORMAT_A32L32F:
+			r =
+			g =
+			b = ((float*)element)[0];
+			a = ((float*)element)[1];
+			break;
+		case FORMAT_A16F:
+			a = *(half*)element;
+			break;
+		case FORMAT_R16F:
+			r = *(half*)element;
+			break;
+		case FORMAT_G16R16F:
+			r = ((half*)element)[0];
+			g = ((half*)element)[1];
+			break;
+		case FORMAT_X16B16G16R16F:
+		case FORMAT_X16B16G16R16F_UNSIGNED:
+		case FORMAT_B16G16R16F:
+			r = ((half*)element)[0];
+			g = ((half*)element)[1];
+			b = ((half*)element)[2];
+			break;
+		case FORMAT_A16B16G16R16F:
+			r = ((half*)element)[0];
+			g = ((half*)element)[1];
+			b = ((half*)element)[2];
+			a = ((half*)element)[3];
+			break;
+		case FORMAT_A32F:
+			a = *(float*)element;
+			break;
+		case FORMAT_R32F:
+			r = *(float*)element;
+			break;
+		case FORMAT_G32R32F:
+			r = ((float*)element)[0];
+			g = ((float*)element)[1];
+			break;
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_B32G32R32F:
+			r = ((float*)element)[0];
+			g = ((float*)element)[1];
+			b = ((float*)element)[2];
+			break;
+		case FORMAT_A32B32G32R32F:
+			r = ((float*)element)[0];
+			g = ((float*)element)[1];
+			b = ((float*)element)[2];
+			a = ((float*)element)[3];
+			break;
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+			// Depth is broadcast to all four channels.
+			r = *(float*)element;
+			g = r;
+			b = r;
+			a = r;
+			break;
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+			// Complementary depth formats store 1 - depth.
+			r = 1.0f - *(float*)element;
+			g = r;
+			b = r;
+			a = r;
+			break;
+		case FORMAT_S8:
+			r = *(unsigned char*)element * (1.0f / 0xFF);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		if(isSRGBformat(format))
+		{
+			r = sRGBtoLinear(r);
+			g = sRGBtoLinear(g);
+			b = sRGBtoLinear(b);
+		}
+
+		return Color<float>(r, g, b, a);
+	}
+
+	// Returns the trilinearly interpolated color at continuous coordinates
+	// (x, y, z). The eight neighboring texels are read with indices clamped to
+	// the buffer's width, height and depth.
+	Color<float> Surface::Buffer::sample(float x, float y, float z) const
+	{
+		// After this shift, integer coordinates coincide with texel centers.
+		x -= 0.5f;
+		y -= 0.5f;
+		z -= 0.5f;
+
+		// Lower neighbor along each axis, and the upper neighbor unless that
+		// would step outside the buffer.
+		const int x0 = clamp((int)x, 0, width - 1);
+		const int y0 = clamp((int)y, 0, height - 1);
+		const int z0 = clamp((int)z, 0, depth - 1);
+
+		const int x1 = (x0 + 1 < width) ? x0 + 1 : x0;
+		const int y1 = (y0 + 1 < height) ? y0 + 1 : y0;
+		const int z1 = (z0 + 1 < depth) ? z0 + 1 : z0;
+
+		// Fractional position between the lower and upper neighbors.
+		const float fx = x - x0;
+		const float fy = y - y0;
+		const float fz = z - z0;
+
+		// Weight of each corner; the suffix digits are the x, y, z neighbor picked.
+		const float w000 = (1 - fx) * (1 - fy) * (1 - fz);
+		const float w100 = fx * (1 - fy) * (1 - fz);
+		const float w010 = (1 - fx) * fy * (1 - fz);
+		const float w110 = fx * fy * (1 - fz);
+		const float w001 = (1 - fx) * (1 - fy) * fz;
+		const float w101 = fx * (1 - fy) * fz;
+		const float w011 = (1 - fx) * fy * fz;
+		const float w111 = fx * fy * fz;
+
+		Color<float> c000 = read(x0, y0, z0);
+		Color<float> c100 = read(x1, y0, z0);
+		Color<float> c010 = read(x0, y1, z0);
+		Color<float> c110 = read(x1, y1, z0);
+		Color<float> c001 = read(x0, y0, z1);
+		Color<float> c101 = read(x1, y0, z1);
+		Color<float> c011 = read(x0, y1, z1);
+		Color<float> c111 = read(x1, y1, z1);
+
+		c000 *= w000;
+		c100 *= w100;
+		c010 *= w010;
+		c110 *= w110;
+		c001 *= w001;
+		c101 *= w101;
+		c011 *= w011;
+		c111 *= w111;
+
+		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
+	}
+
+	// Fetches a bilinearly filtered color at the texel-space position (x, y) of
+	// the given layer. Coordinates are unnormalized with texel centers at
+	// half-integer positions. Neighbor indices are clamped to the buffer extent,
+	// and the interpolation weights are clamped to [0, 1] so that positions
+	// beyond the outermost texel centers return the edge texel rather than an
+	// extrapolated value. 'layer' is passed to read() unmodified.
+	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
+	{
+		// Move texel centers onto integer coordinates.
+		x -= 0.5f;
+		y -= 0.5f;
+
+		int x0 = clamp((int)x, 0, width - 1);
+		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
+
+		int y0 = clamp((int)y, 0, height - 1);
+		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
+
+		Color<float> c00 = read(x0, y0, layer);
+		Color<float> c10 = read(x1, y0, layer);
+		Color<float> c01 = read(x0, y1, layer);
+		Color<float> c11 = read(x1, y1, layer);
+
+		float fx = x - x0;
+		float fy = y - y0;
+
+		// The (int) casts truncate toward zero and the indices are clamped, so the
+		// raw fractions can leave [0, 1] near the edges (e.g. negative for a
+		// position less than half a texel from 0). Clamp them so no corner gets a
+		// negative weight.
+		fx = (fx < 0.0f) ? 0.0f : ((fx > 1.0f) ? 1.0f : fx);
+		fy = (fy < 0.0f) ? 0.0f : ((fy > 1.0f) ? 1.0f : fy);
+
+		c00 *= (1 - fx) * (1 - fy);
+		c10 *= fx * (1 - fy);
+		c01 *= (1 - fx) * fy;
+		c11 *= fx * fy;
+
+		return c00 + c10 + c01 + c11;
+	}
+
+	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)   // Records the lock mode and returns the address of the element (or compressed block) at (x, y, z); nullptr when no storage is attached.
+	{
+		this->lock = lock;   // Stored even when 'buffer' is null.
+
+		switch(lock)
+		{
+		case LOCK_UNLOCKED:
+		case LOCK_READONLY:
+		case LOCK_UPDATE:
+			break;
+		case LOCK_WRITEONLY:
+		case LOCK_READWRITE:
+		case LOCK_DISCARD:
+			dirty = true;   // These lock modes flag the buffer as dirty.
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		if(buffer)
+		{
+			x += border;   // Offset past the border texels.
+			y += border;
+
+			switch(format)
+			{
+			case FORMAT_DXT1:
+			case FORMAT_ATI1:   // NOTE(review): pitchB() describes the ATI1/ATI2 pitch as per-row, yet (y / 4) * pitchB is used here like the per-block-row formats; confirm for y != 0.
+			case FORMAT_ETC1:
+			case FORMAT_R11_EAC:
+			case FORMAT_SIGNED_R11_EAC:
+			case FORMAT_RGB8_ETC2:
+			case FORMAT_SRGB8_ETC2:
+			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;   // 8 bytes per 4x4 block
+			case FORMAT_RG11_EAC:
+			case FORMAT_SIGNED_RG11_EAC:
+			case FORMAT_RGBA8_ETC2_EAC:
+			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+			case FORMAT_RGBA_ASTC_4x4_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;   // 16 bytes per 4x4 block
+			case FORMAT_RGBA_ASTC_5x4_KHR:   // Remaining ASTC cases: 16 bytes per block, x and y divided by the block footprint.
+			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_5x5_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_6x5_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_6x6_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_8x5_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_8x6_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_8x8_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_10x5_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_10x6_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_10x8_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_10x10_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_12x10_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
+			case FORMAT_RGBA_ASTC_12x12_KHR:
+			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
+			case FORMAT_DXT3:
+			case FORMAT_DXT5:
+			case FORMAT_ATI2:
+				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;   // 16 bytes per 4x4 block
+			default:
+				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;   // Per-element addressing; each z step spans 'samples' slices.
+			}
+		}
+
+		return nullptr;   // No storage attached.
+	}
+
+	// Resets the recorded lock mode of this buffer to LOCK_UNLOCKED.
+	void Surface::Buffer::unlockRect()
+	{
+		this->lock = LOCK_UNLOCKED;
+	}
+
+	// Concrete Surface created by the Surface::create() factories. Its lock and
+	// unlock overrides forward straight to the base class implementations.
+	class SurfaceImplementation : public Surface
+	{
+	public:
+		// Surface over caller-provided pixel memory.
+		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
+			: Surface(width, height, depth, format, pixels, pitch, slice)
+		{
+		}
+
+		// Surface optionally attached to a parent resource ('texture').
+		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
+			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP)
+		{
+		}
+
+		~SurfaceImplementation() override
+		{
+		}
+
+		// Forwards to Surface::lockInternal().
+		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
+		{
+			void *locked = Surface::lockInternal(x, y, z, lock, client);
+
+			return locked;
+		}
+
+		// Forwards to Surface::unlockInternal().
+		void unlockInternal() override
+		{
+			Surface::unlockInternal();
+		}
+	};
+
+	// Factory: creates a surface over caller-provided pixel memory.
+	// The returned object is allocated with new.
+	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
+	{
+		Surface *surface = new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
+
+		return surface;
+	}
+
+	// Factory: creates a surface that is optionally attached to a parent
+	// resource ('texture'). The returned object is allocated with new.
+	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
+	{
+		Surface *surface = new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
+
+		return surface;
+	}
+
+	// Constructs a surface over caller-provided pixel memory. 'pitch' and 'slice'
+	// are the row and slice sizes of 'pixels' in bytes. The external buffer points
+	// at that memory and is flagged as not owned (ownExternal = false); internal
+	// and stencil storage start out unallocated.
+	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
+	{
+		resource = new Resource(0);
+		hasParent = false;
+		ownExternal = false;
+		depth = max(1, depth);   // Always at least one slice
+
+		// External buffer: the caller's pixels, initially marked dirty.
+		external.format = format;
+		external.bytes = bytes(external.format);
+		external.buffer = pixels;
+		external.width = width;
+		external.height = height;
+		external.depth = depth;
+		external.samples = 1;
+		external.border = 0;
+		external.pitchB = pitch;
+		external.sliceB = slice;
+
+		if(external.bytes != 0)
+		{
+			external.pitchP = pitch / external.bytes;
+			external.sliceP = slice / external.bytes;
+		}
+		else
+		{
+			// Element size is zero: no meaningful per-element pitch.
+			external.pitchP = 0;
+			external.sliceP = 0;
+		}
+
+		external.lock = LOCK_UNLOCKED;
+		external.dirty = true;
+
+		// Internal buffer: same dimensions, in the selected internal format.
+		internal.format = selectInternalFormat(format);
+		internal.bytes = bytes(internal.format);
+		internal.buffer = nullptr;
+		internal.width = width;
+		internal.height = height;
+		internal.depth = depth;
+		internal.samples = 1;
+		internal.border = 0;
+		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
+		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
+		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
+		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
+		internal.lock = LOCK_UNLOCKED;
+		internal.dirty = false;
+
+		// Stencil buffer: FORMAT_S8 only when 'format' has a stencil component.
+		const Format stencilFormat = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
+
+		stencil.format = stencilFormat;
+		stencil.bytes = bytes(stencil.format);
+		stencil.buffer = nullptr;
+		stencil.width = width;
+		stencil.height = height;
+		stencil.depth = depth;
+		stencil.samples = 1;
+		stencil.border = 0;
+		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
+		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
+		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
+		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
+		stencil.lock = LOCK_UNLOCKED;
+		stencil.dirty = false;
+
+		paletteUsed = 0;
+		dirtyContents = true;
+	}
+
+	// Constructs a surface whose storage is managed by the surface itself
+	// (ownExternal = true); all three buffers start out unallocated. When 'texture'
+	// is non-null it becomes the surface's resource, otherwise a new Resource is
+	// created. A non-zero 'pitchPprovided' overrides the computed row pitch (in
+	// elements) of the external and internal buffers.
+	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
+	{
+		resource = texture ? texture : new Resource(0);
+		hasParent = texture != nullptr;
+		ownExternal = true;
+		depth = max(1, depth);       // Always at least one slice
+		samples = max(1, samples);   // Always at least one sample
+
+		// The external buffer only gets render target padding when standalone.
+		const bool externalTarget = renderTarget && !texture;
+
+		// External buffer
+		external.format = format;
+		external.bytes = bytes(external.format);
+		external.buffer = nullptr;
+		external.width = width;
+		external.height = height;
+		external.depth = depth;
+		external.samples = (short)samples;
+		external.border = 0;
+
+		if(pitchPprovided)
+		{
+			external.pitchB = pitchPprovided * external.bytes;
+			external.pitchP = pitchPprovided;
+		}
+		else
+		{
+			external.pitchB = pitchB(external.width, 0, external.format, externalTarget);
+			external.pitchP = pitchP(external.width, 0, external.format, externalTarget);
+		}
+
+		external.sliceB = sliceB(external.width, external.height, 0, external.format, externalTarget);
+		external.sliceP = sliceP(external.width, external.height, 0, external.format, externalTarget);
+		external.lock = LOCK_UNLOCKED;
+		external.dirty = false;
+
+		// Internal buffer: the only one that carries the border.
+		internal.format = selectInternalFormat(format);
+		internal.bytes = bytes(internal.format);
+		internal.buffer = nullptr;
+		internal.width = width;
+		internal.height = height;
+		internal.depth = depth;
+		internal.samples = (short)samples;
+		internal.border = (short)border;
+
+		if(pitchPprovided)
+		{
+			internal.pitchB = pitchPprovided * internal.bytes;
+			internal.pitchP = pitchPprovided;
+		}
+		else
+		{
+			internal.pitchB = pitchB(internal.width, border, internal.format, renderTarget);
+			internal.pitchP = pitchP(internal.width, border, internal.format, renderTarget);
+		}
+
+		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
+		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
+		internal.lock = LOCK_UNLOCKED;
+		internal.dirty = false;
+
+		// Stencil buffer: FORMAT_S8 only when 'format' has a stencil component.
+		const Format stencilFormat = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
+
+		stencil.format = stencilFormat;
+		stencil.bytes = bytes(stencil.format);
+		stencil.buffer = nullptr;
+		stencil.width = width;
+		stencil.height = height;
+		stencil.depth = depth;
+		stencil.samples = (short)samples;
+		stencil.border = 0;
+		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
+		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
+		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
+		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
+		stencil.lock = LOCK_UNLOCKED;
+		stencil.dirty = false;
+
+		paletteUsed = 0;
+		dirtyContents = true;
+	}
+
+	// Releases the surface's resource (when it has no parent) and its buffers.
+	Surface::~Surface()
+	{
+		// Every lock has to be released by now: sync() is expected to have been
+		// called beforehand. It can't be called from here since the parent resource
+		// may already have been destroyed.
+		ASSERT(isUnlocked());
+
+		if(!hasParent)
+		{
+			resource->destruct();
+		}
+
+		// The internal buffer may alias the external one; free it only when distinct.
+		const bool separateInternal = (internal.buffer != external.buffer);
+
+		if(ownExternal)
+		{
+			deallocate(external.buffer);
+		}
+
+		if(separateInternal)
+		{
+			deallocate(internal.buffer);
+		}
+
+		deallocate(stencil.buffer);
+
+		stencil.buffer = nullptr;
+		internal.buffer = nullptr;
+		external.buffer = nullptr;
+	}
+
+	// Locks the resource on behalf of 'client' and returns a pointer into the
+	// external buffer at (x, y, z). The external storage is obtained on first use
+	// (shared with the internal buffer when identicalBuffers() holds), and pending
+	// internal changes are copied over first unless the lock discards the contents.
+	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
+	{
+		resource->lock(client);
+
+		if(external.buffer == nullptr)
+		{
+			const bool shareInternal = internal.buffer && identicalBuffers();
+
+			if(shareInternal)
+			{
+				external.buffer = internal.buffer;
+			}
+			else
+			{
+				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
+			}
+		}
+
+		if(internal.dirty)
+		{
+			const bool keepContents = (lock != LOCK_DISCARD);
+
+			if(keepContents)
+			{
+				update(external, internal);
+			}
+
+			internal.dirty = false;
+		}
+
+		if(lock == LOCK_WRITEONLY || lock == LOCK_READWRITE || lock == LOCK_DISCARD)
+		{
+			dirtyContents = true;
+		}
+		else if(lock != LOCK_READONLY)
+		{
+			// Any other lock mode is not expected here.
+			ASSERT(false);
+		}
+
+		void *address = external.lockRect(x, y, z, lock);
+
+		return address;
+	}
+
+	// Marks the external buffer as unlocked, then unlocks the resource.
+	void Surface::unlockExternal()
+	{
+		external.unlockRect();
+		resource->unlock();
+	}
+
+	// Returns a pointer into the internal buffer at (x, y, z). The resource is
+	// locked for 'client' unless 'lock' is LOCK_UNLOCKED. The internal storage is
+	// obtained on first use (shared with the external buffer when
+	// identicalBuffers() holds) and refreshed from the external buffer when that
+	// one is dirty or the palette changed, unless the lock discards the contents.
+	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
+	{
+		const bool discard = (lock == LOCK_DISCARD);
+
+		if(lock != LOCK_UNLOCKED)
+		{
+			resource->lock(client);
+		}
+
+		if(internal.buffer == nullptr)
+		{
+			const bool shareExternal = external.buffer && identicalBuffers();
+
+			if(shareExternal)
+			{
+				internal.buffer = external.buffer;
+			}
+			else
+			{
+				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
+			}
+		}
+
+		// FIXME: WHQL requires conversion to lower external precision and back
+		if(logPrecision >= WHQL && internal.dirty && renderTarget && internal.format != external.format && !discard)
+		{
+			switch(external.format)
+			{
+			case FORMAT_R3G3B2:
+			case FORMAT_A8R3G3B2:
+			case FORMAT_A1R5G5B5:
+			case FORMAT_A2R10G10B10:
+			case FORMAT_A2B10G10R10:
+				// Locking and unlocking the external buffer performs the round trip.
+				lockExternal(0, 0, 0, LOCK_READWRITE, client);
+				unlockExternal();
+				break;
+			default:
+				// Difference passes WHQL
+				break;
+			}
+		}
+
+		const bool paletteChanged = isPalette(external.format) && paletteUsed != Surface::paletteID;
+
+		if(external.dirty || paletteChanged)
+		{
+			if(!discard)
+			{
+				update(internal, external);
+			}
+
+			external.dirty = false;
+			paletteUsed = Surface::paletteID;
+		}
+
+		if(lock == LOCK_WRITEONLY || lock == LOCK_READWRITE || lock == LOCK_DISCARD)
+		{
+			dirtyContents = true;
+		}
+		else if(lock != LOCK_UNLOCKED && lock != LOCK_READONLY)
+		{
+			// Any other lock mode is not expected here.
+			ASSERT(false);
+		}
+
+		// Read-only locks from a PUBLIC accessor call resolve() first.
+		if(client == PUBLIC && lock == LOCK_READONLY)
+		{
+			resolve();
+		}
+
+		void *address = internal.lockRect(x, y, z, lock);
+
+		return address;
+	}
+
+	// Marks the internal buffer as unlocked, then unlocks the resource.
+	void Surface::unlockInternal()
+	{
+		internal.unlockRect();
+		resource->unlock();
+	}
+
+	// Locks the resource on behalf of 'client' and returns a pointer into the
+	// stencil buffer at (x, y, front), obtaining the storage on first use.
+	// Returns nullptr when the surface has no stencil component.
+	void *Surface::lockStencil(int x, int y, int front, Accessor client)
+	{
+		resource->lock(client);
+
+		if(stencil.format != FORMAT_NULL)
+		{
+			if(stencil.buffer == nullptr)
+			{
+				stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
+			}
+
+			return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
+		}
+
+		// No stencil plane. Note that the resource lock taken above is still held
+		// on this path.
+		return nullptr;
+	}
+
+	// Marks the stencil buffer as unlocked, then unlocks the resource.
+	void Surface::unlockStencil()
+	{
+		stencil.unlockRect();
+		resource->unlock();
+	}
+
+	int Surface::bytes(Format format)   // Returns the storage size in bytes of one element of 'format'; see the per-group notes for compressed and planar formats.
+	{
+		switch(format)
+		{
+		case FORMAT_NULL:				return 0;
+		case FORMAT_P8:					return 1;
+		case FORMAT_A8P8:				return 2;
+		case FORMAT_A8:					return 1;
+		case FORMAT_R8I:				return 1;
+		case FORMAT_R8:					return 1;
+		case FORMAT_R3G3B2:				return 1;
+		case FORMAT_R16I:				return 2;
+		case FORMAT_R16UI:				return 2;
+		case FORMAT_A8R3G3B2:			return 2;
+		case FORMAT_R5G6B5:				return 2;
+		case FORMAT_A1R5G5B5:			return 2;
+		case FORMAT_X1R5G5B5:			return 2;
+		case FORMAT_R5G5B5A1:           return 2;
+		case FORMAT_X4R4G4B4:			return 2;
+		case FORMAT_A4R4G4B4:			return 2;
+		case FORMAT_R4G4B4A4:           return 2;
+		case FORMAT_R8G8B8:				return 3;
+		case FORMAT_B8G8R8:             return 3;
+		case FORMAT_R32I:				return 4;
+		case FORMAT_R32UI:				return 4;
+		case FORMAT_X8R8G8B8:			return 4;
+	//	case FORMAT_X8G8R8B8Q:			return 4;
+		case FORMAT_A8R8G8B8:			return 4;
+	//	case FORMAT_A8G8R8B8Q:			return 4;
+		case FORMAT_X8B8G8R8I:			return 4;
+		case FORMAT_X8B8G8R8:			return 4;
+		case FORMAT_SRGB8_X8:			return 4;
+		case FORMAT_SRGB8_A8:			return 4;
+		case FORMAT_A8B8G8R8I:			return 4;
+		case FORMAT_R8UI:				return 1;
+		case FORMAT_G8R8UI:				return 2;
+		case FORMAT_X8B8G8R8UI:			return 4;
+		case FORMAT_A8B8G8R8UI:			return 4;
+		case FORMAT_A8B8G8R8:			return 4;
+		case FORMAT_R8_SNORM:			return 1;
+		case FORMAT_G8R8_SNORM:		return 2;
+		case FORMAT_X8B8G8R8_SNORM:	return 4;
+		case FORMAT_A8B8G8R8_SNORM:	return 4;
+		case FORMAT_A2R10G10B10:		return 4;
+		case FORMAT_A2B10G10R10:		return 4;
+		case FORMAT_A2B10G10R10UI:		return 4;
+		case FORMAT_G8R8I:				return 2;
+		case FORMAT_G8R8:				return 2;
+		case FORMAT_G16R16I:			return 4;
+		case FORMAT_G16R16UI:			return 4;
+		case FORMAT_G16R16:				return 4;
+		case FORMAT_G32R32I:			return 8;
+		case FORMAT_G32R32UI:			return 8;
+		case FORMAT_X16B16G16R16I:		return 8;
+		case FORMAT_X16B16G16R16UI:		return 8;
+		case FORMAT_A16B16G16R16I:		return 8;
+		case FORMAT_A16B16G16R16UI:		return 8;
+		case FORMAT_A16B16G16R16:		return 8;
+		case FORMAT_X32B32G32R32I:		return 16;
+		case FORMAT_X32B32G32R32UI:		return 16;
+		case FORMAT_A32B32G32R32I:		return 16;
+		case FORMAT_A32B32G32R32UI:		return 16;
+		// Compressed formats (values below are per column of four pixels, not per pixel)
+		case FORMAT_DXT1:				return 2;   // Column of four pixels
+		case FORMAT_DXT3:				return 4;   // Column of four pixels
+		case FORMAT_DXT5:				return 4;   // Column of four pixels
+		case FORMAT_ATI1:				return 2;   // Column of four pixels
+		case FORMAT_ATI2:				return 4;   // Column of four pixels
+		case FORMAT_ETC1:				return 2;   // Column of four pixels
+		case FORMAT_R11_EAC:			return 2;
+		case FORMAT_SIGNED_R11_EAC:		return 2;
+		case FORMAT_RG11_EAC:			return 4;
+		case FORMAT_SIGNED_RG11_EAC:	return 4;
+		case FORMAT_RGB8_ETC2:			return 2;
+		case FORMAT_SRGB8_ETC2:			return 2;
+		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
+		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
+		case FORMAT_RGBA8_ETC2_EAC:			return 4;
+		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
+		case FORMAT_RGBA_ASTC_4x4_KHR:
+		case FORMAT_RGBA_ASTC_5x4_KHR:
+		case FORMAT_RGBA_ASTC_5x5_KHR:
+		case FORMAT_RGBA_ASTC_6x5_KHR:
+		case FORMAT_RGBA_ASTC_6x6_KHR:
+		case FORMAT_RGBA_ASTC_8x5_KHR:
+		case FORMAT_RGBA_ASTC_8x6_KHR:
+		case FORMAT_RGBA_ASTC_8x8_KHR:
+		case FORMAT_RGBA_ASTC_10x5_KHR:
+		case FORMAT_RGBA_ASTC_10x6_KHR:
+		case FORMAT_RGBA_ASTC_10x8_KHR:
+		case FORMAT_RGBA_ASTC_10x10_KHR:
+		case FORMAT_RGBA_ASTC_12x10_KHR:
+		case FORMAT_RGBA_ASTC_12x12_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME: no element size is defined for ASTC; a zero size makes pitchP()/sliceP() return 0
+		// Bumpmap formats
+		case FORMAT_V8U8:				return 2;
+		case FORMAT_L6V5U5:				return 2;
+		case FORMAT_Q8W8V8U8:			return 4;
+		case FORMAT_X8L8V8U8:			return 4;
+		case FORMAT_A2W10V10U10:		return 4;
+		case FORMAT_V16U16:				return 4;
+		case FORMAT_A16W16V16U16:		return 8;
+		case FORMAT_Q16W16V16U16:		return 8;
+		// Luminance formats
+		case FORMAT_L8:					return 1;
+		case FORMAT_A4L4:				return 1;
+		case FORMAT_L16:				return 2;
+		case FORMAT_A8L8:				return 2;
+		case FORMAT_L16F:               return 2;
+		case FORMAT_A16L16F:            return 4;
+		case FORMAT_L32F:               return 4;
+		case FORMAT_A32L32F:            return 8;
+		// Floating-point formats
+		case FORMAT_A16F:				return 2;
+		case FORMAT_R16F:				return 2;
+		case FORMAT_G16R16F:			return 4;
+		case FORMAT_B16G16R16F:			return 6;
+		case FORMAT_X16B16G16R16F:		return 8;
+		case FORMAT_A16B16G16R16F:		return 8;
+		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
+		case FORMAT_A32F:				return 4;
+		case FORMAT_R32F:				return 4;
+		case FORMAT_G32R32F:			return 8;
+		case FORMAT_B32G32R32F:			return 12;
+		case FORMAT_X32B32G32R32F:		return 16;
+		case FORMAT_A32B32G32R32F:		return 16;
+		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
+		// Depth/stencil formats
+		case FORMAT_D16:				return 2;
+		case FORMAT_D32:				return 4;
+		case FORMAT_D24X8:				return 4;
+		case FORMAT_D24S8:				return 4;
+		case FORMAT_D24FS8:				return 4;
+		case FORMAT_D32F:				return 4;
+		case FORMAT_D32FS8:				return 4;
+		case FORMAT_D32F_COMPLEMENTARY:	return 4;
+		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
+		case FORMAT_D32F_LOCKABLE:		return 4;
+		case FORMAT_D32FS8_TEXTURE:		return 4;
+		case FORMAT_D32F_SHADOW:		return 4;
+		case FORMAT_D32FS8_SHADOW:		return 4;
+		case FORMAT_DF24S8:				return 4;
+		case FORMAT_DF16S8:				return 2;
+		case FORMAT_INTZ:				return 4;
+		case FORMAT_S8:					return 1;
+		case FORMAT_YV12_BT601:         return 1;   // Y plane only
+		case FORMAT_YV12_BT709:         return 1;   // Y plane only
+		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
+		default:   // Unrecognized format: assert, then fall out of the switch.
+			ASSERT(false);
+		}
+
+		return 0;   // Reached only through the default case.
+	}
+
+	int Surface::pitchB(int width, int border, Format format, bool target)   // Returns the row pitch in bytes for an image 'width' elements wide plus 'border' on each side; block formats state their own unit in the case comments.
+	{
+		width += 2 * border;   // The border is added on both the left and right.
+
+		// Render targets require 2x2 quads
+		if(target || isDepth(format) || isStencil(format))   // Depth and stencil formats get the same even-width padding.
+		{
+			width = align<2>(width);
+		}
+
+		switch(format)
+		{
+		case FORMAT_DXT1:
+		case FORMAT_ETC1:
+		case FORMAT_R11_EAC:
+		case FORMAT_SIGNED_R11_EAC:
+		case FORMAT_RGB8_ETC2:
+		case FORMAT_SRGB8_ETC2:
+		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
+		case FORMAT_RG11_EAC:
+		case FORMAT_SIGNED_RG11_EAC:
+		case FORMAT_RGBA8_ETC2_EAC:
+		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+		case FORMAT_RGBA_ASTC_4x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
+		case FORMAT_RGBA_ASTC_5x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+		case FORMAT_RGBA_ASTC_5x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+			return 16 * ((width + 4) / 5);   // 16 bytes per block, width rounded up to whole 5-wide blocks
+		case FORMAT_RGBA_ASTC_6x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+		case FORMAT_RGBA_ASTC_6x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+			return 16 * ((width + 5) / 6);   // 16 bytes per block, width rounded up to whole 6-wide blocks
+		case FORMAT_RGBA_ASTC_8x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+		case FORMAT_RGBA_ASTC_8x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+		case FORMAT_RGBA_ASTC_8x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+			return 16 * ((width + 7) / 8);   // 16 bytes per block, width rounded up to whole 8-wide blocks
+		case FORMAT_RGBA_ASTC_10x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+		case FORMAT_RGBA_ASTC_10x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+		case FORMAT_RGBA_ASTC_10x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+		case FORMAT_RGBA_ASTC_10x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+			return 16 * ((width + 9) / 10);   // 16 bytes per block, width rounded up to whole 10-wide blocks
+		case FORMAT_RGBA_ASTC_12x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+		case FORMAT_RGBA_ASTC_12x12_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+			return 16 * ((width + 11) / 12);   // 16 bytes per block, width rounded up to whole 12-wide blocks
+		case FORMAT_DXT3:
+		case FORMAT_DXT5:
+			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
+		case FORMAT_ATI1:
+			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
+		case FORMAT_ATI2:
+			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			return align<16>(width);   // One byte per element (see bytes()), rows padded to a multiple of 16
+		default:
+			return bytes(format) * width;   // Uncompressed: element size times padded width
+		}
+	}
+
+	// Returns the row pitch measured in elements: pitchB() divided by the
+	// element size of 'format', or 0 when that size is not positive.
+	int Surface::pitchP(int width, int border, Format format, bool target)
+	{
+		const int elementSize = bytes(format);
+
+		if(elementSize <= 0)
+		{
+			return 0;
+		}
+
+		return pitchB(width, border, format, target) / elementSize;
+	}
+
+	int Surface::sliceB(int width, int height, int border, Format format, bool target)   // Returns the size in bytes of one 2D slice: pitchB() times the number of rows (or block rows) after border and alignment padding.
+	{
+		height += 2 * border;   // The border is added on both the top and bottom.
+
+		// Render targets require 2x2 quads
+		if(target || isDepth(format) || isStencil(format))   // Depth and stencil formats get the same even-height padding.
+		{
+			height = align<2>(height);
+		}
+
+		switch(format)
+		{
+		case FORMAT_DXT1:
+		case FORMAT_DXT3:
+		case FORMAT_DXT5:
+		case FORMAT_ETC1:
+		case FORMAT_R11_EAC:
+		case FORMAT_SIGNED_R11_EAC:
+		case FORMAT_RG11_EAC:
+		case FORMAT_SIGNED_RG11_EAC:
+		case FORMAT_RGB8_ETC2:
+		case FORMAT_SRGB8_ETC2:
+		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+		case FORMAT_RGBA8_ETC2_EAC:
+		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+		case FORMAT_RGBA_ASTC_4x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+		case FORMAT_RGBA_ASTC_5x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
+		case FORMAT_RGBA_ASTC_5x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+		case FORMAT_RGBA_ASTC_6x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+		case FORMAT_RGBA_ASTC_8x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+		case FORMAT_RGBA_ASTC_10x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+			return pitchB(width, border, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
+		case FORMAT_RGBA_ASTC_6x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+		case FORMAT_RGBA_ASTC_8x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+		case FORMAT_RGBA_ASTC_10x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+			return pitchB(width, border, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
+		case FORMAT_RGBA_ASTC_8x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+		case FORMAT_RGBA_ASTC_10x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+			return pitchB(width, border, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
+		case FORMAT_RGBA_ASTC_10x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+		case FORMAT_RGBA_ASTC_12x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+			return pitchB(width, border, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
+		case FORMAT_RGBA_ASTC_12x12_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+			return pitchB(width, border, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
+		case FORMAT_ATI1:
+		case FORMAT_ATI2:
+			return pitchB(width, border, format, target) * align<4>(height);   // Pitch computed per row; height padded to a multiple of 4
+		default:
+			return pitchB(width, border, format, target) * height;   // Pitch computed per row
+		}
+	}
+
+	// Returns the slice size measured in elements: sliceB() divided by the
+	// element size of 'format', or 0 when that size is not positive.
+	int Surface::sliceP(int width, int height, int border, Format format, bool target)
+	{
+		const int elementSize = bytes(format);
+
+		if(elementSize <= 0)
+		{
+			return 0;
+		}
+
+		return sliceB(width, height, border, format, target) / elementSize;
+	}
+
+	void Surface::update(Buffer &destination, Buffer &source)   // Refreshes 'destination' from 'source', choosing a decoder by the source format; does nothing when both share the same storage.
+	{
+	//	ASSERT(source.lock != LOCK_UNLOCKED);
+	//	ASSERT(destination.lock != LOCK_UNLOCKED);
+
+		if(destination.buffer != source.buffer)   // Aliased buffers are already in sync.
+		{
+			ASSERT(source.dirty && !destination.dirty);   // Expected direction: the source has changes the destination lacks.
+
+			switch(source.format)
+			{
+			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
+			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
+			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
+			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
+			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
+			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
+			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
+			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
+			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
+			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
+			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
+			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
+			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
+			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
+			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
+			case FORMAT_ETC1:   // Shares the RGB8_ETC2 decode call below.
+			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
+			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
+			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
+			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
+			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
+			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
+			default:				genericUpdate(destination, source);		break;   // No dedicated decoder for this source format
+			}
+		}
+	}
+
+	// Copies the region common to both buffers (the minimum of their widths,
+	// heights and depths) from 'source' into 'destination'. Rows are copied with
+	// memcpy when the two formats match; otherwise each texel is read as a
+	// Color<float> from the source and written through the destination.
+	void Surface::genericUpdate(Buffer &destination, Buffer &source)
+	{
+		unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+		unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+		const int slices = min(destination.depth, source.depth);
+		const int rows = min(destination.height, source.height);
+		const int columns = min(destination.width, source.width);
+		const int copyBytes = columns * source.bytes;   // Row size for the same-format path
+		const bool sameFormat = (source.format == destination.format);
+
+		for(int z = 0; z < slices; z++)
+		{
+			unsigned char *srcRow = srcSlice;
+			unsigned char *dstRow = dstSlice;
+
+			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+			{
+				// Identical formats need no conversion.
+				if(sameFormat)
+				{
+					memcpy(dstRow, srcRow, copyBytes);
+					continue;
+				}
+
+				// Formats differ: convert texel by texel.
+				unsigned char *srcTexel = srcRow;
+				unsigned char *dstTexel = dstRow;
+
+				for(int x = 0; x < columns; x++, srcTexel += source.bytes, dstTexel += destination.bytes)
+				{
+					Color<float> texel = source.read(srcTexel);
+					destination.write(dstTexel, texel);
+				}
+			}
+
+			srcSlice += source.sliceB;
+			dstSlice += destination.sliceB;
+		}
+
+		source.unlockRect();
+		destination.unlockRect();
+	}
+
+	// Converts 3-byte texels (blue at byte 0, green at byte 1, red at byte 2) into
+	// 32-bit words of the form 0xFFRRGGBB, i.e. with the top byte forced to 0xFF.
+	// Only the region common to both buffers is processed.
+	// NOTE(review): assumes the destination holds 32-bit texels - confirm with the caller.
+	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
+	{
+		unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+		unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+		const int slices = min(destination.depth, source.depth);
+		const int rows = min(destination.height, source.height);
+		const int columns = min(destination.width, source.width);
+
+		for(int z = 0; z < slices; z++)
+		{
+			unsigned char *srcRow = srcSlice;
+			unsigned char *dstRow = dstSlice;
+
+			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+			{
+				unsigned char *srcTexel = srcRow;
+				unsigned char *dstTexel = dstRow;
+
+				for(int x = 0; x < columns; x++, srcTexel += source.bytes, dstTexel += destination.bytes)
+				{
+					// Channel order in the source bytes.
+					const unsigned int blue = srcTexel[0];
+					const unsigned int green = srcTexel[1];
+					const unsigned int red = srcTexel[2];
+
+					// Pack the channels below a constant 0xFF top byte.
+					*(unsigned int*)dstTexel = 0xFF000000 | (red << 16) | (green << 8) | blue;
+				}
+			}
+
+			srcSlice += source.sliceB;
+			dstSlice += destination.sliceB;
+		}
+
+		source.unlockRect();
+		destination.unlockRect();
+	}
+
+	// Expands 16-bit texels with 5 bits each of red (0x7C00), green (0x03E0) and
+	// blue (0x001F) into 32-bit words of the form 0xFFRRGGBB. The top source bit
+	// is ignored and the output top byte is always 0xFF.
+	// Only the region common to both buffers is processed.
+	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
+	{
+		unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+		unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+		const int slices = min(destination.depth, source.depth);
+		const int rows = min(destination.height, source.height);
+		const int columns = min(destination.width, source.width);
+
+		for(int z = 0; z < slices; z++)
+		{
+			unsigned char *srcRow = srcSlice;
+			unsigned char *dstRow = dstSlice;
+
+			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+			{
+				unsigned char *srcTexel = srcRow;
+				unsigned char *dstTexel = dstRow;
+
+				for(int x = 0; x < columns; x++, srcTexel += source.bytes, dstTexel += destination.bytes)
+				{
+					const unsigned int packed = *(unsigned short*)srcTexel;
+
+					// Each multiply rescales a 5-bit channel to 8 bits directly in its final
+					// bit position; the added constant rounds to nearest before the shift.
+					const unsigned int red = (((packed & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
+					const unsigned int green = (((packed & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
+					const unsigned int blue = (((packed & 0x001F) * 2106 + 0x80) >> 8);
+
+					*(unsigned int*)dstTexel = 0xFF000000 | red | green | blue;
+				}
+			}
+
+			srcSlice += source.sliceB;
+			dstSlice += destination.sliceB;
+		}
+
+		source.unlockRect();
+		destination.unlockRect();
+	}
+
+	// Expands 16-bit texels with a 1-bit alpha (0x8000) and 5 bits each of red,
+	// green and blue into 32-bit words of the form 0xAARRGGBB, where the alpha
+	// byte is either 0x00 or 0xFF.
+	// Only the region common to both buffers is processed.
+	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
+	{
+		unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+		unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+		const int slices = min(destination.depth, source.depth);
+		const int rows = min(destination.height, source.height);
+		const int columns = min(destination.width, source.width);
+
+		for(int z = 0; z < slices; z++)
+		{
+			unsigned char *srcRow = srcSlice;
+			unsigned char *dstRow = dstSlice;
+
+			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+			{
+				unsigned char *srcTexel = srcRow;
+				unsigned char *dstTexel = dstRow;
+
+				for(int x = 0; x < columns; x++, srcTexel += source.bytes, dstTexel += destination.bytes)
+				{
+					const unsigned int packed = *(unsigned short*)srcTexel;
+
+					// 0x8000 * 130560 equals 0xFF000000, so a set alpha bit becomes a full alpha
+					// byte. The color channels are rescaled from 5 to 8 bits with rounding.
+					const unsigned int alpha = (packed & 0x8000) * 130560;
+					const unsigned int red = (((packed & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
+					const unsigned int green = (((packed & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
+					const unsigned int blue = (((packed & 0x001F) * 2106 + 0x80) >> 8);
+
+					*(unsigned int*)dstTexel = alpha | red | green | blue;
+				}
+			}
+
+			srcSlice += source.sliceB;
+			dstSlice += destination.sliceB;
+		}
+
+		source.unlockRect();
+		destination.unlockRect();
+	}
+
+	// Expands 16-bit texels with 4 bits each of red (0x0F00), green (0x00F0) and
+	// blue (0x000F) into 32-bit words of the form 0xFFRRGGBB. The top source
+	// nibble is ignored and the output top byte is always 0xFF.
+	// Only the region common to both buffers is processed.
+	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
+	{
+		unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+		unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+		const int slices = min(destination.depth, source.depth);
+		const int rows = min(destination.height, source.height);
+		const int columns = min(destination.width, source.width);
+
+		for(int z = 0; z < slices; z++)
+		{
+			unsigned char *srcRow = srcSlice;
+			unsigned char *dstRow = dstSlice;
+
+			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+			{
+				unsigned char *srcTexel = srcRow;
+				unsigned char *dstTexel = dstRow;
+
+				for(int x = 0; x < columns; x++, srcTexel += source.bytes, dstTexel += destination.bytes)
+				{
+					const unsigned int packed = *(unsigned short*)srcTexel;
+
+					// Each multiply copies a nibble into both halves of its 8-bit output
+					// channel (0xN becomes 0xNN) at the channel's final bit position.
+					const unsigned int red = ((packed & 0x0F00) * 0x00001100) & 0x00FF0000;
+					const unsigned int green = ((packed & 0x00F0) * 0x00000110) & 0x0000FF00;
+					const unsigned int blue = (packed & 0x000F) * 0x00000011;
+
+					*(unsigned int*)dstTexel = 0xFF000000 | red | green | blue;
+				}
+			}
+
+			srcSlice += source.sliceB;
+			dstSlice += destination.sliceB;
+		}
+
+		source.unlockRect();
+		destination.unlockRect();
+	}
+
+	// Expands 16-bit texels with 4 bits each of alpha (0xF000), red (0x0F00),
+	// green (0x00F0) and blue (0x000F) into 32-bit words of the form 0xAARRGGBB.
+	// Only the region common to both buffers is processed.
+	// NOTE(review): assumes the destination holds 32-bit texels - confirm with the caller.
+	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
+	{
+		unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+		unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+		const int slices = min(destination.depth, source.depth);
+		const int rows = min(destination.height, source.height);
+		const int columns = min(destination.width, source.width);
+
+		for(int z = 0; z < slices; z++)
+		{
+			unsigned char *srcRow = srcSlice;
+			unsigned char *dstRow = dstSlice;
+
+			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+			{
+				unsigned char *srcTexel = srcRow;
+				unsigned char *dstTexel = dstRow;
+
+				for(int x = 0; x < columns; x++, srcTexel += source.bytes, dstTexel += destination.bytes)
+				{
+					const unsigned int packed = *(unsigned short*)srcTexel;
+
+					// Each multiply copies a nibble into both halves of its 8-bit output
+					// channel (0xN becomes 0xNN) at the channel's final bit position.
+					const unsigned int alpha = ((packed & 0xF000) * 0x00011000) & 0xFF000000;
+					const unsigned int red = ((packed & 0x0F00) * 0x00001100) & 0x00FF0000;
+					const unsigned int green = ((packed & 0x00F0) * 0x00000110) & 0x0000FF00;
+					const unsigned int blue = (packed & 0x000F) * 0x00000011;
+
+					*(unsigned int*)dstTexel = alpha | red | green | blue;
+				}
+			}
+
+			srcSlice += source.sliceB;
+			dstSlice += destination.sliceB;
+		}
+
+		source.unlockRect();
+		destination.unlockRect();
+	}
+
+	// Resolves 8-bit palette indices through 'palette' and stores the result as
+	// 32-bit words. The looked-up entry has its lowest byte and its third byte
+	// exchanged; the second and top bytes stay where they are.
+	// Only the region common to both buffers is processed.
+	void Surface::decodeP8(Buffer &destination, Buffer &source)
+	{
+		unsigned char *srcSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
+		unsigned char *dstSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
+
+		const int slices = min(destination.depth, source.depth);
+		const int rows = min(destination.height, source.height);
+		const int columns = min(destination.width, source.width);
+
+		for(int z = 0; z < slices; z++)
+		{
+			unsigned char *srcRow = srcSlice;
+			unsigned char *dstRow = dstSlice;
+
+			for(int y = 0; y < rows; y++, srcRow += source.pitchB, dstRow += destination.pitchB)
+			{
+				unsigned char *srcTexel = srcRow;
+				unsigned char *dstTexel = dstRow;
+
+				for(int x = 0; x < columns; x++, srcTexel += source.bytes, dstTexel += destination.bytes)
+				{
+					const unsigned int entry = palette[*(unsigned char*)srcTexel];
+
+					// The palette entry carries red in its lowest byte and blue in its
+					// third byte; the output word wants them the other way around.
+					const unsigned int red = (entry & 0x000000FF) << 16;
+					const unsigned int green = entry & 0x0000FF00;
+					const unsigned int blue = (entry & 0x00FF0000) >> 16;
+					const unsigned int alpha = entry & 0xFF000000;
+
+					*(unsigned int*)dstTexel = alpha | red | green | blue;
+				}
+			}
+
+			srcSlice += source.sliceB;
+			dstSlice += destination.sliceB;
+		}
+
+		source.unlockRect();
+		destination.unlockRect();
+	}
+
+	void Surface::decodeDXT1(Buffer &internal, Buffer &external)   // Decodes DXT1 blocks from 'external' into 32-bit texels in 'internal'
+	{
+		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+		for(int z = 0; z < external.depth; z++)
+		{
+			unsigned int *dest = destSlice;
+
+			for(int y = 0; y < external.height; y += 4)   // One block covers 4 rows...
+			{
+				for(int x = 0; x < external.width; x += 4)   // ...and 4 columns
+				{
+					Color<byte> c[4];   // Four-entry color table for this block
+
+					c[0] = source->c0;   // The two colors stored in the block
+					c[1] = source->c1;
+
+					if(source->c0 > source->c1)   // No transparency
+					{
+						// c2 = 2 / 3 * c0 + 1 / 3 * c1
+						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
+						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
+						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
+						c[2].a = 0xFF;
+
+						// c3 = 1 / 3 * c0 + 2 / 3 * c1
+						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
+						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
+						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
+						c[3].a = 0xFF;
+					}
+					else   // c3 transparent
+					{
+						// c2 = 1 / 2 * c0 + 1 / 2 * c1
+						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
+						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
+						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
+						c[2].a = 0xFF;
+
+						c[3].r = 0;
+						c[3].g = 0;
+						c[3].b = 0;
+						c[3].a = 0;
+					}
+
+					for(int j = 0; j < 4 && (y + j) < internal.height; j++)   // Clip the block against the destination height
+					{
+						for(int i = 0; i < 4 && (x + i) < internal.width; i++)   // Clip the block against the destination width
+						{   // Two bits of 'lut' per texel select the color table entry
+							dest[(x + i) + (y + j) * internal.pitchP] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
+						}
+					}
+
+					source++;   // Blocks are consumed sequentially
+				}
+			}
+
+			(byte*&)destSlice += internal.sliceB;   // Advance by one slice, measured in bytes
+		}
+
+		external.unlockRect();
+		internal.unlockRect();
+	}
+
+	void Surface::decodeDXT3(Buffer &internal, Buffer &external)   // Decodes DXT3 blocks from 'external' into 32-bit texels in 'internal'
+	{
+		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+		for(int z = 0; z < external.depth; z++)
+		{
+			unsigned int *dest = destSlice;
+
+			for(int y = 0; y < external.height; y += 4)   // One block covers 4 rows...
+			{
+				for(int x = 0; x < external.width; x += 4)   // ...and 4 columns
+				{
+					Color<byte> c[4];   // Four-entry color table; alpha comes from 'source->a' instead
+
+					c[0] = source->c0;
+					c[1] = source->c1;
+
+					// c2 = 2 / 3 * c0 + 1 / 3 * c1
+					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
+					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
+					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
+
+					// c3 = 1 / 3 * c0 + 2 / 3 * c1
+					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
+					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
+					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
+
+					for(int j = 0; j < 4 && (y + j) < internal.height; j++)   // Clip the block against the destination height
+					{
+						for(int i = 0; i < 4 && (x + i) < internal.width; i++)   // Clip the block against the destination width
+						{
+							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;   // Four alpha bits per texel
+							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));   // Alpha nibble fills both halves of the top byte
+
+							dest[(x + i) + (y + j) * internal.pitchP] = color;
+						}
+					}
+
+					source++;   // Blocks are consumed sequentially
+				}
+			}
+
+			(byte*&)destSlice += internal.sliceB;   // Advance by one slice, measured in bytes
+		}
+
+		external.unlockRect();
+		internal.unlockRect();
+	}
+
+	void Surface::decodeDXT5(Buffer &internal, Buffer &external)   // Decodes DXT5 blocks from 'external' into 32-bit texels in 'internal'
+	{
+		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+		for(int z = 0; z < external.depth; z++)
+		{
+			unsigned int *dest = destSlice;
+
+			for(int y = 0; y < external.height; y += 4)   // One block covers 4 rows...
+			{
+				for(int x = 0; x < external.width; x += 4)   // ...and 4 columns
+				{
+					Color<byte> c[4];   // Four-entry color table for this block
+
+					c[0] = source->c0;
+					c[1] = source->c1;
+
+					// c2 = 2 / 3 * c0 + 1 / 3 * c1
+					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
+					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
+					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
+
+					// c3 = 1 / 3 * c0 + 2 / 3 * c1
+					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
+					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
+					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
+
+					byte a[8];   // Eight-entry alpha table for this block
+
+					a[0] = source->a0;
+					a[1] = source->a1;
+
+					if(a[0] > a[1])   // Six interpolated alphas between the two endpoints
+					{
+						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
+						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
+						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
+						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
+						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
+						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
+					}
+					else   // Four interpolated alphas plus the constants 0 and 0xFF
+					{
+						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
+						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
+						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
+						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
+						a[6] = 0;
+						a[7] = 0xFF;
+					}
+
+					for(int j = 0; j < 4 && (y + j) < internal.height; j++)   // Clip the block against the destination height
+					{
+						for(int i = 0; i < 4 && (x + i) < internal.width; i++)   // Clip the block against the destination width
+						{
+							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;   // 3-bit selectors start at bit 16 of 'alut'
+							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;   // 2-bit color selector; table alpha replaced
+
+							dest[(x + i) + (y + j) * internal.pitchP] = color;
+						}
+					}
+
+					source++;   // Blocks are consumed sequentially
+				}
+			}
+
+			(byte*&)destSlice += internal.sliceB;   // Advance by one slice, measured in bytes
+		}
+
+		external.unlockRect();
+		internal.unlockRect();
+	}
+
+	void Surface::decodeATI1(Buffer &internal, Buffer &external)   // Decodes ATI1 blocks from 'external' into one byte per texel in 'internal'
+	{
+		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+		for(int z = 0; z < external.depth; z++)
+		{
+			byte *dest = destSlice;
+
+			for(int y = 0; y < external.height; y += 4)   // One block covers 4 rows...
+			{
+				for(int x = 0; x < external.width; x += 4)   // ...and 4 columns
+				{
+					byte r[8];   // Eight-entry value table for this block
+
+					r[0] = source->r0;
+					r[1] = source->r1;
+
+					if(r[0] > r[1])   // Six interpolated values between the two endpoints
+					{
+						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
+						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
+						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
+						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
+						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
+						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
+					}
+					else   // Four interpolated values plus the constants 0 and 0xFF
+					{
+						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
+						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
+						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
+						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
+						r[6] = 0;
+						r[7] = 0xFF;
+					}
+
+					for(int j = 0; j < 4 && (y + j) < internal.height; j++)   // Clip the block against the destination height
+					{
+						for(int i = 0; i < 4 && (x + i) < internal.width; i++)   // Clip the block against the destination width
+						{   // 3-bit selectors start at bit 16 of 'rlut'
+							dest[(x + i) + (y + j) * internal.pitchP] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
+						}
+					}
+
+					source++;   // Blocks are consumed sequentially
+				}
+			}
+
+			destSlice += internal.sliceB;   // 'destSlice' is a byte pointer, so this steps one slice
+		}
+
+		external.unlockRect();
+		internal.unlockRect();
+	}
+
+	void Surface::decodeATI2(Buffer &internal, Buffer &external)   // Decodes ATI2 blocks from 'external' into two bytes per texel in 'internal'
+	{
+		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
+		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
+
+		for(int z = 0; z < external.depth; z++)
+		{
+			word *dest = destSlice;
+
+			for(int y = 0; y < external.height; y += 4)   // One block covers 4 rows...
+			{
+				for(int x = 0; x < external.width; x += 4)   // ...and 4 columns
+				{
+					byte X[8];   // Eight-entry table for the first channel
+
+					X[0] = source->x0;
+					X[1] = source->x1;
+
+					if(X[0] > X[1])   // Six interpolated values between the two endpoints
+					{
+						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
+						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
+						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
+						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
+						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
+						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
+					}
+					else   // Four interpolated values plus the constants 0 and 0xFF
+					{
+						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
+						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
+						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
+						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
+						X[6] = 0;
+						X[7] = 0xFF;
+					}
+
+					byte Y[8];   // Eight-entry table for the second channel, built the same way
+
+					Y[0] = source->y0;
+					Y[1] = source->y1;
+
+					if(Y[0] > Y[1])   // Six interpolated values between the two endpoints
+					{
+						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
+						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
+						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
+						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
+						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
+						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
+					}
+					else   // Four interpolated values plus the constants 0 and 0xFF
+					{
+						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
+						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
+						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
+						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
+						Y[6] = 0;
+						Y[7] = 0xFF;
+					}
+
+					for(int j = 0; j < 4 && (y + j) < internal.height; j++)   // Clip the block against the destination height
+					{
+						for(int i = 0; i < 4 && (x + i) < internal.width; i++)   // Clip the block against the destination width
+						{
+							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];   // 3-bit selectors start at bit 16 of 'xlut'
+							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];   // Likewise for 'ylut'
+
+							dest[(x + i) + (y + j) * internal.pitchP] = (g << 8) + r;   // X value in the low byte, Y value in the high byte
+						}
+					}
+
+					source++;   // Blocks are consumed sequentially
+				}
+			}
+
+			(byte*&)destSlice += internal.sliceB;   // Advance by one slice, measured in bytes
+		}
+
+		external.unlockRect();
+		internal.unlockRect();
+	}
+
+	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)   // Decodes via ETC_Decoder, then optionally converts sRGB to linear in place
+	{
+		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
+		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));   // 8 alpha bits: RGBA; 1: punch-through; anything else: RGB
+		external.unlockRect();
+		internal.unlockRect();
+
+		if(isSRGB)
+		{
+			static byte sRGBtoLinearTable[256];   // Maps every 8-bit sRGB value to its 8-bit linear value
+			static bool sRGBtoLinearTableDirty = true;   // Table is filled on first use; NOTE(review): this lazy fill is not synchronized - confirm callers are serialized
+			if(sRGBtoLinearTableDirty)
+			{
+				for(int i = 0; i < 256; i++)
+				{
+					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);   // +0.5f rounds to nearest on truncation
+				}
+				sRGBtoLinearTableDirty = false;
+			}
+
+			// Perform sRGB conversion in place after decoding
+			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
+			for(int y = 0; y < internal.height; y++)   // NOTE(review): only 'height' rows from the locked origin are visited - confirm depth is 1 here
+			{
+				byte *srcRow = src + y * internal.pitchB;
+				for(int x = 0; x <  internal.width; x++)
+				{
+					byte *srcPix = srcRow + x * internal.bytes;
+					for(int i = 0; i < 3; i++)   // First three bytes of each texel only; any further bytes are left untouched
+					{
+						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
+					}
+				}
+			}
+			internal.unlockRect();
+		}
+	}
+
+	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)   // Decodes EAC data via ETC_Decoder, then rewrites the result in place as clamped floats
+	{
+		ASSERT(nbChannels == 1 || nbChannels == 2);   // Only one- and two-channel data is handled
+
+		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);   // Stays locked until the conversion below has finished
+		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
+		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
+		external.unlockRect();
+
+		// FIXME: We convert EAC data to float, until signed short internal formats are supported
+		//        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
+		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));   // Scale applied to each decoded integer before clamping
+		for(int y = 0; y < internal.height; y++)
+		{
+			byte* srcRow = src + y * internal.pitchB;
+			for(int x = internal.width - 1; x >= 0; x--)   // Texels are walked from last to first
+			{
+				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);   // Decoded values are read as int
+				float* dstPix = reinterpret_cast<float*>(srcPix);   // Same memory viewed as float for the in-place rewrite
+				for(int c = nbChannels - 1; c >= 0; c--)   // Channels are walked from last to first as well
+				{
+					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
+				}
+			}
+		}
+
+		internal.unlockRect();
+	}
+
+	void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)   // Stub: all parameters are unused
+	{   // Empty body, so neither buffer is locked, read or written
+	}
+
+	// Returns the number of bytes to allocate for a surface with the given
+	// dimensions, border, sample count and format.
+	size_t Surface::size(int width, int height, int depth, int border, int samples, Format format)
+	{
+		if(format == FORMAT_YV12_BT601 || format == FORMAT_YV12_BT709 || format == FORMAT_YV12_JFIF)
+		{
+			// One Y plane plus two C planes, with strides passed through align<16>.
+			const int paddedWidth = width + 2 * border;
+			const int paddedHeight = height + 2 * border;
+
+			const size_t strideY = align<16>(paddedWidth);
+			const size_t strideC = align<16>(strideY / 2);
+			const size_t bytesY = strideY * paddedHeight;
+			const size_t bytesC = strideC * paddedHeight / 2;
+
+			return bytesY + 2 * bytesC;
+		}
+
+		const int sampleCount = max(1, samples);
+		uint64_t total = (uint64_t)sliceB(width, height, border, format, true) * depth * sampleCount;
+
+		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
+		// and stencil operations also read 8 bytes per four 8-bit stencil values,
+		// so we have to allocate 4 extra bytes to avoid buffer overruns.
+		total += 4;
+
+		// We can only sample buffers smaller than 2 GiB.
+		// Force an out-of-memory if larger, or let the caller report an error.
+		if(total >= 0x80000000u)
+		{
+			return std::numeric_limits<size_t>::max();
+		}
+
+		return (size_t)total;
+	}
+
+	// Reports whether 'format' is one of the formats listed below as carrying
+	// stencil data. Everything else, including depth-only formats, yields false.
+	bool Surface::isStencil(Format format)
+	{
+		bool stencil = false;
+
+		switch(format)
+		{
+		case FORMAT_S8:
+		case FORMAT_D24S8:
+		case FORMAT_D24FS8:
+		case FORMAT_DF24S8:
+		case FORMAT_DF16S8:
+		case FORMAT_D32FS8:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32FS8_SHADOW:
+		case FORMAT_INTZ:
+			stencil = true;
+			break;
+		default:
+			// D16, D24X8, D32 and the D32F variants without S8 land here too.
+			break;
+		}
+
+		return stencil;
+	}
+
+	// Reports whether 'format' is one of the depth formats listed below.
+	bool Surface::isDepth(Format format)
+	{
+		switch(format)
+		{
+		case FORMAT_D16:
+		case FORMAT_D24X8:
+		case FORMAT_D24S8:
+		case FORMAT_D24FS8:
+		case FORMAT_DF16S8:
+		case FORMAT_DF24S8:
+		case FORMAT_D32:
+		case FORMAT_D32F:
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32FS8_SHADOW:
+		case FORMAT_INTZ:
+			return true;
+		default:   // FORMAT_S8 and all remaining formats
+			break;
+		}
+		return false;
+	}
+
+	// Reports whether 'format' is one of the formats listed below as using
+	// the quad layout.
+	bool Surface::hasQuadLayout(Format format)
+	{
+		switch(format)
+		{
+		case FORMAT_S8:
+		case FORMAT_D16:
+		case FORMAT_D24X8:
+		case FORMAT_D24S8:
+		case FORMAT_D24FS8:
+		case FORMAT_DF16S8:
+		case FORMAT_DF24S8:
+		case FORMAT_D32:
+		case FORMAT_D32F:
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+		case FORMAT_INTZ:
+		case FORMAT_A8G8R8B8Q:
+		case FORMAT_X8G8R8B8Q:
+			return true;
+		default:
+			// Notably FORMAT_D32F_LOCKABLE, FORMAT_D32FS8_TEXTURE,
+			// FORMAT_D32F_SHADOW and FORMAT_D32FS8_SHADOW are depth formats
+			// that do not use the quad layout, like every other format
+			// not listed above.
+			return false;
+		}
+	}
+
+	// Tells whether 'format' is one of the palette formats recognized
+	// here: FORMAT_P8 or FORMAT_A8P8. Every other format yields false.
+	bool Surface::isPalette(Format format)
+	{
+		if(format == FORMAT_P8 || format == FORMAT_A8P8)
+		{
+			return true;
+		}
+
+		return false;
+	}
+
+	bool Surface::isFloatFormat(Format format)   // Classifies 'format' as floating-point (true) or not (false); unlisted formats hit the ASSERT
+	{
+		switch(format)
+		{
+		case FORMAT_R5G6B5:
+		case FORMAT_R8G8B8:
+		case FORMAT_B8G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8:
+		case FORMAT_A2B10G10R10:
+		case FORMAT_A2B10G10R10UI:
+		case FORMAT_R8_SNORM:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_G16R16:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		case FORMAT_A8:
+		case FORMAT_R8I:
+		case FORMAT_R8:
+		case FORMAT_S8:
+		case FORMAT_L8:
+		case FORMAT_L16:
+		case FORMAT_A8L8:
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+			return false;   // Every format listed above is reported as not floating-point
+		case FORMAT_R16F:
+		case FORMAT_G16R16F:
+		case FORMAT_B16G16R16F:
+		case FORMAT_X16B16G16R16F:
+		case FORMAT_A16B16G16R16F:
+		case FORMAT_X16B16G16R16F_UNSIGNED:
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_B32G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		case FORMAT_L16F:
+		case FORMAT_A16L16F:
+		case FORMAT_L32F:
+		case FORMAT_A32L32F:
+			return true;   // Every format listed since the previous return is reported as floating-point
+		default:
+			ASSERT(false);   // Format missing from both lists
+		}
+
+		return false;   // Reached only from the default case, if execution continues past the ASSERT
+	}
+
+	bool Surface::isUnsignedComponent(Format format, int component)   // Tells whether component index 'component' of 'format' is reported as unsigned
+	{
+		switch(format)
+		{
+		case FORMAT_NULL:
+		case FORMAT_R5G6B5:
+		case FORMAT_R8G8B8:
+		case FORMAT_B8G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		case FORMAT_G8R8:
+		case FORMAT_A2B10G10R10:
+		case FORMAT_A2B10G10R10UI:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32UI:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_COMPLEMENTARY:
+		case FORMAT_D32FS8_COMPLEMENTARY:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		case FORMAT_A8:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_L16:
+		case FORMAT_A8L8:
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			return true;   // Unsigned regardless of 'component'
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A8B8G8R8_SNORM:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_Q16W16V16U16:
+		case FORMAT_A32B32G32R32F:
+			return false;   // Signed regardless of 'component'
+		case FORMAT_R32F:
+		case FORMAT_R8I:
+		case FORMAT_R16I:
+		case FORMAT_R32I:
+		case FORMAT_R8_SNORM:
+			return component >= 1;   // Index 0 is signed; indices from 1 up report unsigned
+		case FORMAT_V8U8:
+		case FORMAT_X8L8V8U8:
+		case FORMAT_V16U16:
+		case FORMAT_G32R32F:
+		case FORMAT_G8R8I:
+		case FORMAT_G16R16I:
+		case FORMAT_G32R32I:
+		case FORMAT_G8R8_SNORM:
+			return component >= 2;   // Indices 0 and 1 are signed; indices from 2 up report unsigned
+		case FORMAT_A16W16V16U16:
+		case FORMAT_B32G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X8B8G8R8_SNORM:
+			return component >= 3;   // Indices 0 to 2 are signed; indices from 3 up report unsigned
+		default:
+			ASSERT(false);   // Format missing from every list above
+		}
+
+		return false;   // Reached only from the default case, if execution continues past the ASSERT
+	}
+
+	// Reports whether 'format' belongs to the set of formats enumerated
+	// below as readable with sRGB conversion.
+	bool Surface::isSRGBreadable(Format format)
+	{
+		// Keep in sync with Capabilities::isSRGBreadable
+		switch(format)
+		{
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+		case FORMAT_R8G8B8:
+		case FORMAT_R5G6B5:
+		case FORMAT_A8R8G8B8:   case FORMAT_X8R8G8B8:
+		case FORMAT_A8B8G8R8:   case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_A8:   case FORMAT_SRGB8_X8:
+		case FORMAT_A1R5G5B5:   case FORMAT_X1R5G5B5:
+		case FORMAT_A4R4G4B4:
+		case FORMAT_DXT1:
+		case FORMAT_DXT3:
+		case FORMAT_DXT5:
+		case FORMAT_ATI1:
+		case FORMAT_ATI2:
+			return true;
+		default:
+			break;
+		}
+
+		return false;
+	}
+
+	// Reports whether 'format' belongs to the set of formats enumerated
+	// below as writable with sRGB conversion.
+	bool Surface::isSRGBwritable(Format format)
+	{
+		// Keep in sync with Capabilities::isSRGBwritable
+		switch(format)
+		{
+		case FORMAT_A8R8G8B8:   case FORMAT_X8R8G8B8:
+		case FORMAT_A8B8G8R8:   case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_A8:   case FORMAT_SRGB8_X8:
+		case FORMAT_R5G6B5:     case FORMAT_NULL:
+			return true;
+		default:
+			break;
+		}
+
+		return false;
+	}
+
+	// Tells whether 'format' is one of the two sRGB formats recognized
+	// here: FORMAT_SRGB8_X8 or FORMAT_SRGB8_A8. Anything else yields false.
+	bool Surface::isSRGBformat(Format format)
+	{
+		if(format == FORMAT_SRGB8_X8 || format == FORMAT_SRGB8_A8)
+		{
+			return true;
+		}
+
+		return false;
+	}
+
+	// Reports whether 'format' is one of the block-compressed formats listed below.
+	bool Surface::isCompressed(Format format)
+	{
+		switch(format)
+		{
+		case FORMAT_DXT1:
+		case FORMAT_DXT3:
+		case FORMAT_DXT5:
+		case FORMAT_ATI1:
+		case FORMAT_ATI2:
+		case FORMAT_ETC1:
+		// EAC: unsigned / signed pairs
+		case FORMAT_R11_EAC:    case FORMAT_SIGNED_R11_EAC:
+		case FORMAT_RG11_EAC:   case FORMAT_SIGNED_RG11_EAC:
+		// ETC2: non-sRGB / sRGB pairs
+		case FORMAT_RGB8_ETC2:                       case FORMAT_SRGB8_ETC2:
+		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:   case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+		case FORMAT_RGBA8_ETC2_EAC:                  case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+		case FORMAT_RGBA_ASTC_4x4_KHR:
+		case FORMAT_RGBA_ASTC_5x4_KHR:
+		case FORMAT_RGBA_ASTC_5x5_KHR:
+		case FORMAT_RGBA_ASTC_6x5_KHR:
+		case FORMAT_RGBA_ASTC_6x6_KHR:
+		case FORMAT_RGBA_ASTC_8x5_KHR:
+		case FORMAT_RGBA_ASTC_8x6_KHR:
+		case FORMAT_RGBA_ASTC_8x8_KHR:
+		case FORMAT_RGBA_ASTC_10x5_KHR:
+		case FORMAT_RGBA_ASTC_10x6_KHR:
+		case FORMAT_RGBA_ASTC_10x8_KHR:
+		case FORMAT_RGBA_ASTC_10x10_KHR:
+		case FORMAT_RGBA_ASTC_12x10_KHR:
+		case FORMAT_RGBA_ASTC_12x12_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+			return true;
+		default:
+			break;
+		}
+
+		return false;
+	}
+
+	// Returns true when 'format' is one of the signed integer ("I" suffix)
+	// formats listed below, and false otherwise.
+	bool Surface::isSignedNonNormalizedInteger(Format format)
+	{
+		bool signedInteger = false;
+
+		switch(format)
+		{
+		// 8 bits per component
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_G8R8I:
+		case FORMAT_R8I:
+		// 16 bits per component
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_G16R16I:
+		case FORMAT_R16I:
+		// 32 bits per component
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_G32R32I:
+		case FORMAT_R32I:
+			signedInteger = true;
+			break;
+		default:
+			break;
+		}
+
+		return signedInteger;
+	}
+
+	// Returns true when 'format' is one of the unsigned integer ("UI" suffix)
+	// formats listed below, and false otherwise.
+	bool Surface::isUnsignedNonNormalizedInteger(Format format)
+	{
+		bool unsignedInteger = false;
+
+		switch(format)
+		{
+		// 8 bits per component
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_G8R8UI:
+		case FORMAT_R8UI:
+		// 16 bits per component
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_G16R16UI:
+		case FORMAT_R16UI:
+		// 32 bits per component
+		case FORMAT_A32B32G32R32UI:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_G32R32UI:
+		case FORMAT_R32UI:
+			unsignedInteger = true;
+			break;
+		default:
+			break;
+		}
+
+		return unsignedInteger;
+	}
+
+	// Returns true when 'format' is either a signed or an unsigned
+	// non-normalized integer format.
+	bool Surface::isNonNormalizedInteger(Format format)
+	{
+		if(isSignedNonNormalizedInteger(format))
+		{
+			return true;
+		}
+
+		return isUnsignedNonNormalizedInteger(format);
+	}
+
+	// Classifies by exclusion: a format counts as normalized integer when it is
+	// not floating-point, not a non-normalized integer, not compressed, and
+	// neither a depth nor a stencil format.
+	bool Surface::isNormalizedInteger(Format format)
+	{
+		if(isFloatFormat(format)) return false;
+		if(isNonNormalizedInteger(format)) return false;
+		if(isCompressed(format)) return false;
+		if(isDepth(format)) return false;
+
+		return !isStencil(format);
+	}
+
+	// Returns the number of components reported for 'format' (1 to 4).
+	// Formats not listed here trigger an assert and fall back to 1.
+	int Surface::componentCount(Format format)
+	{
+		switch(format)
+		{
+		// Single component
+		case FORMAT_R8_SNORM:
+		case FORMAT_R8UI:
+		case FORMAT_R32F:
+		case FORMAT_D32F:
+		case FORMAT_D32FS8:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		case FORMAT_A8:
+		case FORMAT_R8I:
+		case FORMAT_R8:
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_L8:
+		case FORMAT_L16:
+			return 1;
+
+		// Two components
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_G8R8UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_G16R16:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_V8U8:
+		case FORMAT_V16U16:
+		case FORMAT_G32R32F:
+		case FORMAT_A8L8:
+			return 2;
+
+		// Three components
+		case FORMAT_R5G6B5:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_X8L8V8U8:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			return 3;
+
+		// Four components
+		case FORMAT_A8R8G8B8:
+		case FORMAT_SRGB8_A8:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_A8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_A2B10G10R10:
+		case FORMAT_A2B10G10R10UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		case FORMAT_A32B32G32R32F:
+			return 4;
+
+		default:
+			ASSERT(false);
+		}
+
+		// Unlisted format: same fallback value as before
+		return 1;
+	}
+
+	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)   // Allocates storage for a buffer with the given dimensions, border, sample count and format
+	{
+		return allocate(size(width, height, depth, border, samples, format));   // size() supplies the byte count; the pointer from allocate() is returned unchanged
+	}
+
+	// Fills 'bytes' bytes starting at 'buffer' with the 32-bit 'pattern'.
+	// Bytes ahead of the first 4-byte aligned address and left over at the end
+	// are written with the pattern truncated to 16 or 8 bits.
+	void Surface::memfill4(void *buffer, int pattern, int bytes)
+	{
+		char *cursor = (char*)buffer;
+
+		// Head: reach 2-byte alignment
+		while(((size_t)cursor & 0x1) && bytes >= 1)
+		{
+			*cursor = (char)pattern;
+			cursor += 1;
+			bytes -= 1;
+		}
+
+		// Head: reach 4-byte alignment
+		while(((size_t)cursor & 0x3) && bytes >= 2)
+		{
+			*(short*)cursor = (short)pattern;
+			cursor += 2;
+			bytes -= 2;
+		}
+
+		#if defined(__i386__) || defined(__x86_64__)
+			if(CPUID::supportsSSE())
+			{
+				// Reach 16-byte alignment, as required by the streaming stores below
+				while(((size_t)cursor & 0xF) && bytes >= 4)
+				{
+					*(int*)cursor = pattern;
+					cursor += 4;
+					bytes -= 4;
+				}
+
+				__m128 quad = _mm_set_ps1((float&)pattern);
+
+				// Write whole 64-byte blocks with four streaming stores each
+				float *stream = (float*)cursor;
+				int blocks = bytes / 64;
+				bytes -= blocks * 64;
+
+				while(blocks != 0)
+				{
+					_mm_stream_ps(stream + 0, quad);
+					_mm_stream_ps(stream + 4, quad);
+					_mm_stream_ps(stream + 8, quad);
+					_mm_stream_ps(stream + 12, quad);
+
+					stream += 16;
+					blocks--;
+				}
+
+				cursor = (char*)stream;
+			}
+		#endif
+
+		// Tail: whole 32-bit words
+		while(bytes >= 4)
+		{
+			*(int*)cursor = (int)pattern;
+			cursor += 4;
+			bytes -= 4;
+		}
+
+		// Tail: one remaining 16-bit half
+		while(bytes >= 2)
+		{
+			*(short*)cursor = (short)pattern;
+			cursor += 2;
+			bytes -= 2;
+		}
+
+		// Tail: one remaining byte
+		while(bytes >= 1)
+		{
+			*cursor = (char)pattern;
+			cursor += 1;
+			bytes -= 1;
+		}
+	}
+
+	void Surface::sync()   // Acquires and immediately releases the resource lock
+	{
+		resource->lock(EXCLUSIVE);   // presumably blocks until no other accessor holds the resource - confirm against Resource::lock
+		resource->unlock();          // nothing is done while the lock is held
+	}
+
+	// Tells whether 'rect' covers the whole internal buffer of a surface that
+	// has a depth of exactly one.
+	bool Surface::isEntire(const Rect& rect) const
+	{
+		const bool startsAtOrigin = (rect.x0 == 0) && (rect.y0 == 0);
+		const bool reachesExtent = (rect.x1 == internal.width) && (rect.y1 == internal.height);
+
+		return startsAtOrigin && reachesExtent && (internal.depth == 1);
+	}
+
+	// Returns the rectangle (0, 0) - (width, height) of the internal buffer.
+	Rect Surface::getRect() const
+	{
+		Rect bounds(0, 0, internal.width, internal.height);
+
+		return bounds;
+	}
+
+	// Clears the depth values of the internal buffer inside the rectangle
+	// (x0, y0, width, height) to 'depth', for every sample slice.
+	//
+	// The rectangle is clipped against the internal buffer's dimensions.
+	// Rectangles that are empty, have a negative extent, or lie completely
+	// outside the buffer are ignored. When the format uses quad layout and
+	// complementaryDepthBuffer is set, 1 - depth is stored instead.
+	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
+	{
+		// Empty or negative extents would otherwise reach the quad fast path,
+		// which writes its edge texels unconditionally.
+		if(width <= 0 || height <= 0)
+		{
+			return;
+		}
+
+		if(internal.format == FORMAT_NULL)
+		{
+			return;
+		}
+
+		// Not overlapping. A rectangle that merely touches an edge has zero area
+		// after clipping, so it is rejected here as well (>= and <=, not > and <).
+		if(x0 >= internal.width) return;
+		if(y0 >= internal.height) return;
+		if(x0 + width <= 0) return;
+		if(y0 + height <= 0) return;
+
+		// Clip against dimensions; width and height stay positive from here on
+		if(x0 < 0) {width += x0; x0 = 0;}
+		if(x0 + width > internal.width) width = internal.width - x0;
+		if(y0 < 0) {height += y0; y0 = 0;}
+		if(y0 + height > internal.height) height = internal.height - y0;
+
+		// A clear of the full buffer may discard the previous contents
+		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
+		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
+
+		int x1 = x0 + width;
+		int y1 = y0 + height;
+
+		if(!hasQuadLayout(internal.format))
+		{
+			// Linear layout: fill each row of the rectangle with the raw float bits
+			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
+
+			for(int z = 0; z < internal.samples; z++)
+			{
+				float *row = target;
+				for(int y = y0; y < y1; y++)
+				{
+					memfill4(row, (int&)depth, width * sizeof(float));
+					row += internal.pitchP;
+				}
+				target += internal.sliceP;
+			}
+
+			unlockInternal();
+		}
+		else   // Quad layout
+		{
+			if(complementaryDepthBuffer)
+			{
+				depth = 1 - depth;
+			}
+
+			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
+
+			// Within a pair of rows, texel (x, y) is at index
+			// (x & ~1) * 2 + (x & 1) + (y & 1) * 2, i.e. each 2x2 quad is contiguous.
+			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);        // Index of the first texel when x0 is odd
+			int oddX1 = (x1 & ~1) * 2;                   // Index of the last texel when x1 is odd
+			int evenX0 = ((x0 + 1) & ~1) * 2;            // Index of the first whole quad
+			int evenBytes = (oddX1 - evenX0) * sizeof(float);   // Size of the run of whole quads
+
+			for(int z = 0; z < internal.samples; z++)
+			{
+				for(int y = y0; y < y1; y++)
+				{
+					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
+
+					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
+					{
+						// Left edge: the odd column of a partially covered quad, both rows
+						if((x0 & 1) != 0)
+						{
+							target[oddX0 + 0] = depth;
+							target[oddX0 + 2] = depth;
+						}
+
+						// Whole quads cover both rows and are contiguous in memory
+						memfill4(&target[evenX0], (int&)depth, evenBytes);
+
+						// Right edge: the even column of a partially covered quad, both rows
+						if((x1 & 1) != 0)
+						{
+							target[oddX1 + 0] = depth;
+							target[oddX1 + 2] = depth;
+						}
+
+						y++;   // The second row of the pair has been written as well
+					}
+					else
+					{
+						// Single row (odd first row or last row): write texel by texel
+						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
+						{
+							target[i] = depth;
+						}
+					}
+				}
+
+				buffer += internal.sliceP;
+			}
+
+			unlockInternal();
+		}
+	}
+
+	// Clears the stencil values inside the rectangle (x0, y0, width, height)
+	// to 's', for every sample slice. Only the bits selected by 'mask' are
+	// modified.
+	//
+	// The rectangle is clipped against the internal buffer's dimensions.
+	// Rectangles that are empty, have a negative extent, or lie completely
+	// outside the buffer are ignored, as is a zero mask.
+	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
+	{
+		// Empty or negative extents would otherwise reach the quad fast path,
+		// which writes its edge texels unconditionally.
+		if(mask == 0 || width <= 0 || height <= 0)
+		{
+			return;
+		}
+
+		if(stencil.format == FORMAT_NULL)
+		{
+			return;
+		}
+
+		// Not overlapping. A rectangle that merely touches an edge has zero area
+		// after clipping, so it is rejected here as well (>= and <=, not > and <).
+		if(x0 >= internal.width) return;
+		if(y0 >= internal.height) return;
+		if(x0 + width <= 0) return;
+		if(y0 + height <= 0) return;
+
+		// Clip against dimensions; width and height stay positive from here on
+		if(x0 < 0) {width += x0; x0 = 0;}
+		if(x0 + width > internal.width) width = internal.width - x0;
+		if(y0 < 0) {height += y0; y0 = 0;}
+		if(y0 + height > internal.height) height = internal.height - y0;
+
+		int x1 = x0 + width;
+		int y1 = y0 + height;
+
+		// Within a pair of rows, texel (x, y) is at index
+		// (x & ~1) * 2 + (x & 1) + (y & 1) * 2, i.e. each 2x2 quad is contiguous.
+		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);   // Index of the first texel when x0 is odd
+		int oddX1 = (x1 & ~1) * 2;              // Index of the last texel when x1 is odd
+		int evenX0 = ((x0 + 1) & ~1) * 2;       // Index of the first whole quad
+		int evenBytes = oddX1 - evenX0;         // One byte per stencil value
+
+		unsigned char maskedS = s & mask;
+		unsigned char invMask = ~mask;
+		unsigned int fill = maskedS;
+		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);   // Replicate into all four bytes for memfill4
+
+		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
+
+		// Stencil buffers are assumed to use quad layout
+		for(int z = 0; z < stencil.samples; z++)
+		{
+			for(int y = y0; y < y1; y++)
+			{
+				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
+
+				// The fast path overwrites whole bytes, so it requires a full mask
+				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
+				{
+					// Left edge: the odd column of a partially covered quad, both rows
+					if((x0 & 1) != 0)
+					{
+						target[oddX0 + 0] = fill;
+						target[oddX0 + 2] = fill;
+					}
+
+					// Whole quads cover both rows and are contiguous in memory
+					memfill4(&target[evenX0], fill, evenBytes);
+
+					// Right edge: the even column of a partially covered quad, both rows
+					if((x1 & 1) != 0)
+					{
+						target[oddX1 + 0] = fill;
+						target[oddX1 + 2] = fill;
+					}
+
+					y++;   // The second row of the pair has been written as well
+				}
+				else
+				{
+					// Single row or partial mask: read-modify-write texel by texel
+					for(int x = x0; x < x1; x++)
+					{
+						int i = (x & ~1) * 2 + (x & 1);
+						target[i] = maskedS | (target[i] & invMask);
+					}
+				}
+			}
+
+			buffer += stencil.sliceP;
+		}
+
+		unlockStencil();
+	}
+
+	// Fills the rectangle (x0, y0, width, height) with 'color'.
+	//
+	// The internal buffer is written when it is marked dirty, otherwise the
+	// external buffer. The rectangle is not clipped here; callers must pass a
+	// rectangle that lies inside the buffer.
+	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
+	{
+		unsigned char *row;
+		Buffer *buffer;
+
+		if(internal.dirty)
+		{
+			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
+			buffer = &internal;
+		}
+		else
+		{
+			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
+			buffer = &external;
+		}
+
+		// The 32-bit pattern fill is only valid when the texel size divides four
+		// bytes. 3-byte texels would be overwritten with a misaligned 4-byte
+		// pattern, so they take the generic path instead.
+		if(buffer->bytes <= 4 && buffer->bytes != 3)
+		{
+			// Zero-initialized: write() only stores buffer->bytes bytes, and the
+			// replication below must not pick up indeterminate upper bytes.
+			unsigned int c = 0;
+			buffer->write(&c, color);
+
+			// Replicate 1- and 2-byte texels across the whole 32-bit pattern
+			if(buffer->bytes <= 1) c = (c << 8)  | c;
+			if(buffer->bytes <= 2) c = (c << 16) | c;
+
+			for(int y = 0; y < height; y++)
+			{
+				memfill4(row, (int)c, width * buffer->bytes);
+
+				row += buffer->pitchB;
+			}
+		}
+		else   // Generic
+		{
+			for(int y = 0; y < height; y++)
+			{
+				unsigned char *element = row;
+
+				for(int x = 0; x < width; x++)
+				{
+					buffer->write(element, color);
+
+					element += buffer->bytes;
+				}
+
+				row += buffer->pitchB;
+			}
+		}
+
+		if(buffer == &internal)
+		{
+			unlockInternal();
+		}
+		else
+		{
+			unlockExternal();
+		}
+	}
+
+	// Copies one texel from the internal buffer of 'source' at (srcX, srcY)
+	// into this surface's internal buffer at (x, y). With 'filter' set the
+	// source is sampled through Buffer::sample(); otherwise the coordinates are
+	// truncated to integers and read directly. Both surfaces must already be
+	// locked.
+	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
+	{
+		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
+
+		sw::Color<float> texel;
+
+		if(filter)   // Bilinear filtering
+		{
+			texel = source->internal.sample(srcX, srcY, 0);
+		}
+		else
+		{
+			texel = source->internal.read((int)srcX, (int)srcY, 0);
+		}
+
+		internal.write(x, y, texel);
+	}
+
+	// Volume variant: copies one texel from the internal buffer of 'source' at
+	// (srcX, srcY, srcZ) into this surface's internal buffer at (x, y, z).
+	// With 'filter' set the source is sampled through Buffer::sample();
+	// otherwise the coordinates are truncated to integers and read directly.
+	// Both surfaces must already be locked.
+	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
+	{
+		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
+
+		sw::Color<float> texel;
+
+		if(filter)   // Bilinear filtering
+		{
+			texel = source->internal.sample(srcX, srcY, srcZ);
+		}
+		else
+		{
+			texel = source->internal.read((int)srcX, (int)srcY, int(srcZ));
+		}
+
+		internal.write(x, y, z, texel);
+	}
+
+	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)   // Copies the 'srcEdge' row/column of texels of 'src' into the 'dstEdge' border of this surface
+	{
+		Surface *dst = this;
+
+		// Figure out whether the edges must be copied in reverse order relative to one another.
+		// The copy should be reversed whenever the same edges are contiguous or if we're
+		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
+		//
+		//      | +y |
+		// | -x | +z | +x | -z |
+		//      | -y |
+
+		bool reverse = (srcEdge == dstEdge) ||
+		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
+		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
+		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
+		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
+
+		int srcBytes = src->bytes(src->Surface::getInternalFormat());   // Bytes per texel of the source
+		int srcPitch = src->getInternalPitchB();
+		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());   // Bytes per texel of the destination
+		int dstPitch = dst->getInternalPitchB();
+
+		int srcW = src->getWidth();
+		int srcH = src->getHeight();
+		int dstW = dst->getWidth();
+		int dstH = dst->getHeight();
+
+		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);   // Square faces of identical size and texel size
+
+		// Src is expressed in the regular [0, width-1], [0, height-1] space
+		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;   // Step along the edge: one texel for rows, one pitch for columns
+		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));   // Byte offset of the edge's first texel
+
+		// Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
+		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);   // Negative step when copying in reverse
+		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);   // Skips the corner texel; a reversed copy starts at the far end of the border
+
+		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
+		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;   // Locked at (-1, -1), the origin of the border
+
+		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)   // One texel per iteration along the edge
+		{
+			memcpy(dstBuf, srcBuf, srcBytes);
+		}
+
+		if(dstEdge == LEFT || dstEdge == RIGHT)
+		{
+			// TOP and BOTTOM are already set, let's average out the corners
+			int x0 = (dstEdge == RIGHT) ? dstW : -1;      // Border column holding the corner
+			int y0 = -1;                                  // Top border row
+			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;   // Adjacent column inside the face
+			int y1 = 0;                                   // Adjacent row inside the face
+			dst->computeCubeCorner(x0, y0, x1, y1);
+			y0 = dstH;       // Bottom border row
+			y1 = dstH - 1;   // Last row inside the face
+			dst->computeCubeCorner(x0, y0, x1, y1);
+		}
+
+		src->unlockInternal();
+		dst->unlockInternal();
+	}
+
+	// Writes to the texel at (x0, y0) the average of the three texels at
+	// (x0, y1), (x1, y0) and (x1, y1). The internal buffer must be locked.
+	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
+	{
+		ASSERT(internal.lock != LOCK_UNLOCKED);
+
+		const float oneThird = 1.0f / 3.0f;
+
+		sw::Color<float> average = internal.read(x0, y1);
+		average += internal.read(x1, y0);
+		average += internal.read(x1, y1);
+		average *= oneThird;
+
+		internal.write(x0, y0, average);
+	}
+
+	// Returns isStencil() for the external buffer's format.
+	bool Surface::hasStencil() const
+	{
+		const bool stencilPresent = isStencil(external.format);
+
+		return stencilPresent;
+	}
+
+	// Returns isDepth() for the external buffer's format.
+	bool Surface::hasDepth() const
+	{
+		const bool depthPresent = isDepth(external.format);
+
+		return depthPresent;
+	}
+
+	// Returns isPalette() for the external buffer's format.
+	bool Surface::hasPalette() const
+	{
+		const bool palettePresent = isPalette(external.format);
+
+		return palettePresent;
+	}
+
+	// Returns the surface's renderTarget flag.
+	bool Surface::isRenderTarget() const
+	{
+		return this->renderTarget;
+	}
+
+	// Returns the surface's dirtyContents flag.
+	bool Surface::hasDirtyContents() const
+	{
+		return this->dirtyContents;
+	}
+
+	// Resets the dirtyContents flag.
+	void Surface::markContentsClean()
+	{
+		this->dirtyContents = false;
+	}
+
+	// Returns the Resource pointer held by this surface.
+	Resource *Surface::getResource()
+	{
+		return this->resource;
+	}
+
+	// Tells whether the external and internal buffers agree on format,
+	// dimensions, pitch, slice size, border and sample count.
+	bool Surface::identicalBuffers() const
+	{
+		if(external.format != internal.format) return false;
+		if(external.width  != internal.width)  return false;
+		if(external.height != internal.height) return false;
+		if(external.depth  != internal.depth)  return false;
+		if(external.pitchB != internal.pitchB) return false;
+		if(external.sliceB != internal.sliceB) return false;
+		if(external.border != internal.border) return false;
+
+		return external.samples == internal.samples;
+	}
+
+	Format Surface::selectInternalFormat(Format format) const   // Maps an external format to the format used for the internal buffer
+	{
+		switch(format)
+		{
+		case FORMAT_NULL:
+			return FORMAT_NULL;
+		case FORMAT_P8:
+		case FORMAT_A8P8:
+		case FORMAT_A4R4G4B4:
+		case FORMAT_A1R5G5B5:
+		case FORMAT_A8R3G3B2:
+			return FORMAT_A8R8G8B8;   // Palette and low-precision alpha formats are widened
+		case FORMAT_A8:
+			return FORMAT_A8;
+		case FORMAT_R8I:
+			return FORMAT_R8I;
+		case FORMAT_R8UI:
+			return FORMAT_R8UI;
+		case FORMAT_R8_SNORM:
+			return FORMAT_R8_SNORM;
+		case FORMAT_R8:
+			return FORMAT_R8;
+		case FORMAT_R16I:
+			return FORMAT_R16I;
+		case FORMAT_R16UI:
+			return FORMAT_R16UI;
+		case FORMAT_R32I:
+			return FORMAT_R32I;
+		case FORMAT_R32UI:
+			return FORMAT_R32UI;
+		case FORMAT_X16B16G16R16I:
+			return FORMAT_X16B16G16R16I;
+		case FORMAT_A16B16G16R16I:
+			return FORMAT_A16B16G16R16I;
+		case FORMAT_X16B16G16R16UI:
+			return FORMAT_X16B16G16R16UI;
+		case FORMAT_A16B16G16R16UI:
+			return FORMAT_A16B16G16R16UI;
+		case FORMAT_A2R10G10B10:
+		case FORMAT_A2B10G10R10:
+		case FORMAT_A16B16G16R16:
+			return FORMAT_A16B16G16R16;   // 10-bit formats are stored with 16 bits per component
+		case FORMAT_A2B10G10R10UI:
+			return FORMAT_A16B16G16R16UI;
+		case FORMAT_X32B32G32R32I:
+			return FORMAT_X32B32G32R32I;
+		case FORMAT_A32B32G32R32I:
+			return FORMAT_A32B32G32R32I;
+		case FORMAT_X32B32G32R32UI:
+			return FORMAT_X32B32G32R32UI;
+		case FORMAT_A32B32G32R32UI:
+			return FORMAT_A32B32G32R32UI;
+		case FORMAT_G8R8I:
+			return FORMAT_G8R8I;
+		case FORMAT_G8R8UI:
+			return FORMAT_G8R8UI;
+		case FORMAT_G8R8_SNORM:
+			return FORMAT_G8R8_SNORM;
+		case FORMAT_G8R8:
+			return FORMAT_G8R8;
+		case FORMAT_G16R16I:
+			return FORMAT_G16R16I;
+		case FORMAT_G16R16UI:
+			return FORMAT_G16R16UI;
+		case FORMAT_G16R16:
+			return FORMAT_G16R16;
+		case FORMAT_G32R32I:
+			return FORMAT_G32R32I;
+		case FORMAT_G32R32UI:
+			return FORMAT_G32R32UI;
+		case FORMAT_A8R8G8B8:
+			if(lockable || !quadLayoutEnabled)   // The quad-layout variant is only chosen for non-lockable surfaces
+			{
+				return FORMAT_A8R8G8B8;
+			}
+			else
+			{
+				return FORMAT_A8G8R8B8Q;
+			}
+		case FORMAT_A8B8G8R8I:
+			return FORMAT_A8B8G8R8I;
+		case FORMAT_A8B8G8R8UI:
+			return FORMAT_A8B8G8R8UI;
+		case FORMAT_A8B8G8R8_SNORM:
+			return FORMAT_A8B8G8R8_SNORM;
+		case FORMAT_R5G5B5A1:
+		case FORMAT_R4G4B4A4:
+		case FORMAT_A8B8G8R8:
+			return FORMAT_A8B8G8R8;
+		case FORMAT_R5G6B5:
+			return FORMAT_R5G6B5;
+		case FORMAT_R3G3B2:
+		case FORMAT_R8G8B8:
+		case FORMAT_X4R4G4B4:
+		case FORMAT_X1R5G5B5:
+		case FORMAT_X8R8G8B8:
+			if(lockable || !quadLayoutEnabled)   // The quad-layout variant is only chosen for non-lockable surfaces
+			{
+				return FORMAT_X8R8G8B8;
+			}
+			else
+			{
+				return FORMAT_X8G8R8B8Q;
+			}
+		case FORMAT_X8B8G8R8I:
+			return FORMAT_X8B8G8R8I;
+		case FORMAT_X8B8G8R8UI:
+			return FORMAT_X8B8G8R8UI;
+		case FORMAT_X8B8G8R8_SNORM:
+			return FORMAT_X8B8G8R8_SNORM;
+		case FORMAT_B8G8R8:
+		case FORMAT_X8B8G8R8:
+			return FORMAT_X8B8G8R8;
+		case FORMAT_SRGB8_X8:
+			return FORMAT_SRGB8_X8;
+		case FORMAT_SRGB8_A8:
+			return FORMAT_SRGB8_A8;
+		// Compressed formats
+		case FORMAT_DXT1:
+		case FORMAT_DXT3:
+		case FORMAT_DXT5:
+		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+		case FORMAT_RGBA8_ETC2_EAC:
+		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
+		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
+		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
+			return FORMAT_A8R8G8B8;   // Compressed formats with alpha map to 8-bit ARGB
+		case FORMAT_RGBA_ASTC_4x4_KHR:
+		case FORMAT_RGBA_ASTC_5x4_KHR:
+		case FORMAT_RGBA_ASTC_5x5_KHR:
+		case FORMAT_RGBA_ASTC_6x5_KHR:
+		case FORMAT_RGBA_ASTC_6x6_KHR:
+		case FORMAT_RGBA_ASTC_8x5_KHR:
+		case FORMAT_RGBA_ASTC_8x6_KHR:
+		case FORMAT_RGBA_ASTC_8x8_KHR:
+		case FORMAT_RGBA_ASTC_10x5_KHR:
+		case FORMAT_RGBA_ASTC_10x6_KHR:
+		case FORMAT_RGBA_ASTC_10x8_KHR:
+		case FORMAT_RGBA_ASTC_10x10_KHR:
+		case FORMAT_RGBA_ASTC_12x10_KHR:
+		case FORMAT_RGBA_ASTC_12x12_KHR:
+			// ASTC supports HDR, so a floating point format is required to represent it properly
+			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
+		case FORMAT_ATI1:
+			return FORMAT_R8;
+		case FORMAT_R11_EAC:
+		case FORMAT_SIGNED_R11_EAC:
+			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
+		case FORMAT_ATI2:
+			return FORMAT_G8R8;
+		case FORMAT_RG11_EAC:
+		case FORMAT_SIGNED_RG11_EAC:
+			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
+		case FORMAT_ETC1:
+		case FORMAT_RGB8_ETC2:
+		case FORMAT_SRGB8_ETC2:
+			return FORMAT_X8R8G8B8;   // Compressed formats without alpha map to 8-bit XRGB
+		// Bumpmap formats
+		case FORMAT_V8U8:			return FORMAT_V8U8;
+		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
+		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
+		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
+		case FORMAT_V16U16:			return FORMAT_V16U16;
+		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
+		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
+		// Floating-point formats (16-bit floats are widened to 32-bit)
+		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
+		case FORMAT_R16F:			return FORMAT_R32F;
+		case FORMAT_G16R16F:		return FORMAT_G32R32F;
+		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
+		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
+		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
+		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
+		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
+		case FORMAT_R32F:			return FORMAT_R32F;
+		case FORMAT_G32R32F:		return FORMAT_G32R32F;
+		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
+		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
+		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
+		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
+		// Luminance formats
+		case FORMAT_L8:				return FORMAT_L8;
+		case FORMAT_A4L4:			return FORMAT_A8L8;
+		case FORMAT_L16:			return FORMAT_L16;
+		case FORMAT_A8L8:			return FORMAT_A8L8;
+		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
+		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
+		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
+		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
+		// Depth/stencil formats
+		case FORMAT_D16:
+		case FORMAT_D32:
+		case FORMAT_D24X8:
+			if(hasParent)   // Texture
+			{
+				return FORMAT_D32F_SHADOW;
+			}
+			else if(complementaryDepthBuffer)
+			{
+				return FORMAT_D32F_COMPLEMENTARY;
+			}
+			else
+			{
+				return FORMAT_D32F;
+			}
+		case FORMAT_D24S8:
+		case FORMAT_D24FS8:
+			if(hasParent)   // Texture
+			{
+				return FORMAT_D32FS8_SHADOW;
+			}
+			else if(complementaryDepthBuffer)
+			{
+				return FORMAT_D32FS8_COMPLEMENTARY;
+			}
+			else
+			{
+				return FORMAT_D32FS8;
+			}
+		case FORMAT_D32F:           return FORMAT_D32F;
+		case FORMAT_D32FS8:         return FORMAT_D32FS8;
+		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
+		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
+		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
+		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
+		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
+		case FORMAT_S8:             return FORMAT_S8;
+		// YUV formats
+		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
+		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
+		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
+		default:
+			ASSERT(false);   // Format without a mapping
+		}
+
+		return FORMAT_NULL;   // Fallback for unmapped formats
+	}
+
+	// Stores the new palette pointer in the shared Surface::palette and bumps
+	// Surface::paletteID by one.
+	void Surface::setTexturePalette(unsigned int *palette)
+	{
+		Surface::palette = palette;
+		Surface::paletteID += 1;
+	}
+
+	void Surface::resolve()
+	{
+		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
+		{
+			return;
+		}
+
+		ASSERT(internal.depth == 1);  // Unimplemented
+
+		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
+
+		int width = internal.width;
+		int height = internal.height;
+		int pitch = internal.pitchB;
+		int slice = internal.sliceB;
+
+		unsigned char *source0 = (unsigned char*)source;
+		unsigned char *source1 = source0 + slice;
+		unsigned char *source2 = source1 + slice;
+		unsigned char *source3 = source2 + slice;
+		unsigned char *source4 = source3 + slice;
+		unsigned char *source5 = source4 + slice;
+		unsigned char *source6 = source5 + slice;
+		unsigned char *source7 = source6 + slice;
+		unsigned char *source8 = source7 + slice;
+		unsigned char *source9 = source8 + slice;
+		unsigned char *sourceA = source9 + slice;
+		unsigned char *sourceB = sourceA + slice;
+		unsigned char *sourceC = sourceB + slice;
+		unsigned char *sourceD = sourceC + slice;
+		unsigned char *sourceE = sourceD + slice;
+		unsigned char *sourceF = sourceE + slice;
+
+		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
+		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
+		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
+		{
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 4) == 0)
+				{
+					if(internal.samples == 2)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+
+								c0 = _mm_avg_epu8(c0, c1);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+						}
+					}
+					else if(internal.samples == 4)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c0 = _mm_avg_epu8(c0, c2);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+						}
+					}
+					else if(internal.samples == 8)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c4 = _mm_avg_epu8(c4, c5);
+								c6 = _mm_avg_epu8(c6, c7);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c0 = _mm_avg_epu8(c0, c4);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+						}
+					}
+					else if(internal.samples == 16)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+
+								c0 = _mm_avg_epu8(c0, c1);
+								c2 = _mm_avg_epu8(c2, c3);
+								c4 = _mm_avg_epu8(c4, c5);
+								c6 = _mm_avg_epu8(c6, c7);
+								c8 = _mm_avg_epu8(c8, c9);
+								cA = _mm_avg_epu8(cA, cB);
+								cC = _mm_avg_epu8(cC, cD);
+								cE = _mm_avg_epu8(cE, cF);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c8 = _mm_avg_epu8(c8, cA);
+								cC = _mm_avg_epu8(cC, cE);
+								c0 = _mm_avg_epu8(c0, c4);
+								c8 = _mm_avg_epu8(c8, cC);
+								c0 = _mm_avg_epu8(c0, c8);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
+						}
+					}
+					else ASSERT(false);
+				}
+				else
+			#endif
+			{
+				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
+
+				if(internal.samples == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.samples == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c0 = AVERAGE(c0, c2);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.samples == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c0 = AVERAGE(c0, c4);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.samples == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
+							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
+							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
+							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
+							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
+							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
+							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
+							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c8 = AVERAGE(c8, c9);
+							cA = AVERAGE(cA, cB);
+							cC = AVERAGE(cC, cD);
+							cE = AVERAGE(cE, cF);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c8 = AVERAGE(c8, cA);
+							cC = AVERAGE(cC, cE);
+							c0 = AVERAGE(c0, c4);
+							c8 = AVERAGE(c8, cC);
+							c0 = AVERAGE(c0, c8);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+
+				#undef AVERAGE
+			}
+		}
+		else if(internal.format == FORMAT_G16R16)
+		{
+
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 4) == 0)
+				{
+					if(internal.samples == 2)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+						}
+					}
+					else if(internal.samples == 4)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c0 = _mm_avg_epu16(c0, c2);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+						}
+					}
+					else if(internal.samples == 8)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c0 = _mm_avg_epu16(c0, c4);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+						}
+					}
+					else if(internal.samples == 16)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c8 = _mm_avg_epu16(c8, c9);
+								cA = _mm_avg_epu16(cA, cB);
+								cC = _mm_avg_epu16(cC, cD);
+								cE = _mm_avg_epu16(cE, cF);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c8 = _mm_avg_epu16(c8, cA);
+								cC = _mm_avg_epu16(cC, cE);
+								c0 = _mm_avg_epu16(c0, c4);
+								c8 = _mm_avg_epu16(c8, cC);
+								c0 = _mm_avg_epu16(c0, c8);
+
+								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
+						}
+					}
+					else ASSERT(false);
+				}
+				else
+			#endif
+			{
+				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
+
+				if(internal.samples == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.samples == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c0 = AVERAGE(c0, c2);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.samples == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c0 = AVERAGE(c0, c4);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.samples == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
+							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
+							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
+							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
+							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
+							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
+							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
+							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c8 = AVERAGE(c8, c9);
+							cA = AVERAGE(cA, cB);
+							cC = AVERAGE(cC, cD);
+							cE = AVERAGE(cE, cF);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c8 = AVERAGE(c8, cA);
+							cC = AVERAGE(cC, cE);
+							c0 = AVERAGE(c0, c4);
+							c8 = AVERAGE(c8, cC);
+							c0 = AVERAGE(c0, c8);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+
+				#undef AVERAGE
+			}
+		}
+		else if(internal.format == FORMAT_A16B16G16R16)
+		{
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 2) == 0)
+				{
+					if(internal.samples == 2)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+						}
+					}
+					else if(internal.samples == 4)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c0 = _mm_avg_epu16(c0, c2);
+
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+						}
+					}
+					else if(internal.samples == 8)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c0 = _mm_avg_epu16(c0, c4);
+
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+						}
+					}
+					else if(internal.samples == 16)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
+
+								c0 = _mm_avg_epu16(c0, c1);
+								c2 = _mm_avg_epu16(c2, c3);
+								c4 = _mm_avg_epu16(c4, c5);
+								c6 = _mm_avg_epu16(c6, c7);
+								c8 = _mm_avg_epu16(c8, c9);
+								cA = _mm_avg_epu16(cA, cB);
+								cC = _mm_avg_epu16(cC, cD);
+								cE = _mm_avg_epu16(cE, cF);
+								c0 = _mm_avg_epu16(c0, c2);
+								c4 = _mm_avg_epu16(c4, c6);
+								c8 = _mm_avg_epu16(c8, cA);
+								cC = _mm_avg_epu16(cC, cE);
+								c0 = _mm_avg_epu16(c0, c4);
+								c8 = _mm_avg_epu16(c8, cC);
+								c0 = _mm_avg_epu16(c0, c8);
+
+								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
+						}
+					}
+					else ASSERT(false);
+				}
+				else
+			#endif
+			{
+				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
+
+				if(internal.samples == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.samples == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c0 = AVERAGE(c0, c2);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.samples == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c0 = AVERAGE(c0, c4);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.samples == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
+							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
+							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
+							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
+							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
+							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
+							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
+							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
+							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
+							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
+							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
+							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
+							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
+							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
+							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
+							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c8 = AVERAGE(c8, c9);
+							cA = AVERAGE(cA, cB);
+							cC = AVERAGE(cC, cD);
+							cE = AVERAGE(cE, cF);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c8 = AVERAGE(c8, cA);
+							cC = AVERAGE(cC, cE);
+							c0 = AVERAGE(c0, c4);
+							c8 = AVERAGE(c8, cC);
+							c0 = AVERAGE(c0, c8);
+
+							*(unsigned int*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+
+				#undef AVERAGE
+			}
+		}
+		else if(internal.format == FORMAT_R32F)
+		{
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE() && (width % 4) == 0)
+				{
+					if(internal.samples == 2)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+						}
+					}
+					else if(internal.samples == 4)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+						}
+					}
+					else if(internal.samples == 8)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+						}
+					}
+					else if(internal.samples == 16)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 4)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+
+								_mm_store_ps((float*)(source0 + 4 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
+						}
+					}
+					else ASSERT(false);
+				}
+				else
+			#endif
+			{
+				if(internal.samples == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+
+							c0 = c0 + c1;
+							c0 *= 1.0f / 2.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.samples == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c0 = c0 + c2;
+							c0 *= 1.0f / 4.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.samples == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+							float c4 = *(float*)(source4 + 4 * x);
+							float c5 = *(float*)(source5 + 4 * x);
+							float c6 = *(float*)(source6 + 4 * x);
+							float c7 = *(float*)(source7 + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c4 = c4 + c5;
+							c6 = c6 + c7;
+							c0 = c0 + c2;
+							c4 = c4 + c6;
+							c0 = c0 + c4;
+							c0 *= 1.0f / 8.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.samples == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+							float c4 = *(float*)(source4 + 4 * x);
+							float c5 = *(float*)(source5 + 4 * x);
+							float c6 = *(float*)(source6 + 4 * x);
+							float c7 = *(float*)(source7 + 4 * x);
+							float c8 = *(float*)(source8 + 4 * x);
+							float c9 = *(float*)(source9 + 4 * x);
+							float cA = *(float*)(sourceA + 4 * x);
+							float cB = *(float*)(sourceB + 4 * x);
+							float cC = *(float*)(sourceC + 4 * x);
+							float cD = *(float*)(sourceD + 4 * x);
+							float cE = *(float*)(sourceE + 4 * x);
+							float cF = *(float*)(sourceF + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c4 = c4 + c5;
+							c6 = c6 + c7;
+							c8 = c8 + c9;
+							cA = cA + cB;
+							cC = cC + cD;
+							cE = cE + cF;
+							c0 = c0 + c2;
+							c4 = c4 + c6;
+							c8 = c8 + cA;
+							cC = cC + cE;
+							c0 = c0 + c4;
+							c8 = c8 + cC;
+							c0 = c0 + c8;
+							c0 *= 1.0f / 16.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+			}
+		}
+		else if(internal.format == FORMAT_G32R32F)
+		{
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE() && (width % 2) == 0)
+				{
+					if(internal.samples == 2)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+						}
+					}
+					else if(internal.samples == 4)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+						}
+					}
+					else if(internal.samples == 8)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+						}
+					}
+					else if(internal.samples == 16)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 2)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+
+								_mm_store_ps((float*)(source0 + 8 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
+						}
+					}
+					else ASSERT(false);
+				}
+				else
+			#endif
+			{
+				if(internal.samples == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+
+							c0 = c0 + c1;
+							c0 *= 1.0f / 2.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.samples == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c0 = c0 + c2;
+							c0 *= 1.0f / 4.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.samples == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+							float c4 = *(float*)(source4 + 4 * x);
+							float c5 = *(float*)(source5 + 4 * x);
+							float c6 = *(float*)(source6 + 4 * x);
+							float c7 = *(float*)(source7 + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c4 = c4 + c5;
+							c6 = c6 + c7;
+							c0 = c0 + c2;
+							c4 = c4 + c6;
+							c0 = c0 + c4;
+							c0 *= 1.0f / 8.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.samples == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 2 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+							float c4 = *(float*)(source4 + 4 * x);
+							float c5 = *(float*)(source5 + 4 * x);
+							float c6 = *(float*)(source6 + 4 * x);
+							float c7 = *(float*)(source7 + 4 * x);
+							float c8 = *(float*)(source8 + 4 * x);
+							float c9 = *(float*)(source9 + 4 * x);
+							float cA = *(float*)(sourceA + 4 * x);
+							float cB = *(float*)(sourceB + 4 * x);
+							float cC = *(float*)(sourceC + 4 * x);
+							float cD = *(float*)(sourceD + 4 * x);
+							float cE = *(float*)(sourceE + 4 * x);
+							float cF = *(float*)(sourceF + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c4 = c4 + c5;
+							c6 = c6 + c7;
+							c8 = c8 + c9;
+							cA = cA + cB;
+							cC = cC + cD;
+							cE = cE + cF;
+							c0 = c0 + c2;
+							c4 = c4 + c6;
+							c8 = c8 + cA;
+							cC = cC + cE;
+							c0 = c0 + c4;
+							c8 = c8 + cC;
+							c0 = c0 + c8;
+							c0 *= 1.0f / 16.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+			}
+		}
+		else if(internal.format == FORMAT_A32B32G32R32F ||
+		        internal.format == FORMAT_X32B32G32R32F ||
+		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
+		{
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE())
+				{
+					if(internal.samples == 2)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
+
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+						}
+					}
+					else if(internal.samples == 4)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c0 = _mm_add_ps(c0, c2);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
+
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+						}
+					}
+					else if(internal.samples == 8)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c0 = _mm_add_ps(c0, c4);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
+
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+						}
+					}
+					else if(internal.samples == 16)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x++)
+							{
+								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
+								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
+								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
+								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
+								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
+								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
+								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
+								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
+								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
+								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
+								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
+								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
+								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
+								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
+								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
+								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
+
+								c0 = _mm_add_ps(c0, c1);
+								c2 = _mm_add_ps(c2, c3);
+								c4 = _mm_add_ps(c4, c5);
+								c6 = _mm_add_ps(c6, c7);
+								c8 = _mm_add_ps(c8, c9);
+								cA = _mm_add_ps(cA, cB);
+								cC = _mm_add_ps(cC, cD);
+								cE = _mm_add_ps(cE, cF);
+								c0 = _mm_add_ps(c0, c2);
+								c4 = _mm_add_ps(c4, c6);
+								c8 = _mm_add_ps(c8, cA);
+								cC = _mm_add_ps(cC, cE);
+								c0 = _mm_add_ps(c0, c4);
+								c8 = _mm_add_ps(c8, cC);
+								c0 = _mm_add_ps(c0, c8);
+								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
+
+								_mm_store_ps((float*)(source0 + 16 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
+						}
+					}
+					else ASSERT(false);
+				}
+				else
+			#endif
+			{
+				if(internal.samples == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 4 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+
+							c0 = c0 + c1;
+							c0 *= 1.0f / 2.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.samples == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 4 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c0 = c0 + c2;
+							c0 *= 1.0f / 4.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.samples == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 4 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+							float c4 = *(float*)(source4 + 4 * x);
+							float c5 = *(float*)(source5 + 4 * x);
+							float c6 = *(float*)(source6 + 4 * x);
+							float c7 = *(float*)(source7 + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c4 = c4 + c5;
+							c6 = c6 + c7;
+							c0 = c0 + c2;
+							c4 = c4 + c6;
+							c0 = c0 + c4;
+							c0 *= 1.0f / 8.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.samples == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < 4 * width; x++)
+						{
+							float c0 = *(float*)(source0 + 4 * x);
+							float c1 = *(float*)(source1 + 4 * x);
+							float c2 = *(float*)(source2 + 4 * x);
+							float c3 = *(float*)(source3 + 4 * x);
+							float c4 = *(float*)(source4 + 4 * x);
+							float c5 = *(float*)(source5 + 4 * x);
+							float c6 = *(float*)(source6 + 4 * x);
+							float c7 = *(float*)(source7 + 4 * x);
+							float c8 = *(float*)(source8 + 4 * x);
+							float c9 = *(float*)(source9 + 4 * x);
+							float cA = *(float*)(sourceA + 4 * x);
+							float cB = *(float*)(sourceB + 4 * x);
+							float cC = *(float*)(sourceC + 4 * x);
+							float cD = *(float*)(sourceD + 4 * x);
+							float cE = *(float*)(sourceE + 4 * x);
+							float cF = *(float*)(sourceF + 4 * x);
+
+							c0 = c0 + c1;
+							c2 = c2 + c3;
+							c4 = c4 + c5;
+							c6 = c6 + c7;
+							c8 = c8 + c9;
+							cA = cA + cB;
+							cC = cC + cD;
+							cE = cE + cF;
+							c0 = c0 + c2;
+							c4 = c4 + c6;
+							c8 = c8 + cA;
+							cC = cC + cE;
+							c0 = c0 + c4;
+							c8 = c8 + cC;
+							c0 = c0 + c8;
+							c0 *= 1.0f / 16.0f;
+
+							*(float*)(source0 + 4 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+			}
+		}
+		else if(internal.format == FORMAT_R5G6B5)
+		{
+			#if defined(__i386__) || defined(__x86_64__)
+				if(CPUID::supportsSSE2() && (width % 8) == 0)
+				{
+					if(internal.samples == 2)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
+
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+						}
+					}
+					else if(internal.samples == 4)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c1 = _mm_avg_epu16(c1, c3);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
+
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+						}
+					}
+					else if(internal.samples == 8)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c0 = _mm_avg_epu8(c0, c4);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c5 = _mm_avg_epu16(c4__g_, c5__g_);
+								c7 = _mm_avg_epu16(c6__g_, c7__g_);
+								c1 = _mm_avg_epu16(c1, c3);
+								c5 = _mm_avg_epu16(c5, c7);
+								c1 = _mm_avg_epu16(c1, c5);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
+
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+						}
+					}
+					else if(internal.samples == 16)
+					{
+						for(int y = 0; y < height; y++)
+						{
+							for(int x = 0; x < width; x += 8)
+							{
+								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
+								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
+								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
+								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
+								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
+								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
+								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
+								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
+								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
+								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
+								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
+								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
+								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
+								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
+								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
+								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
+
+								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
+								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
+								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
+								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
+								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
+								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
+								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
+
+								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
+								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
+								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
+								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
+								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
+								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
+								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
+								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
+								c0 = _mm_avg_epu8(c0, c2);
+								c4 = _mm_avg_epu8(c4, c6);
+								c8 = _mm_avg_epu8(c8, cA);
+								cC = _mm_avg_epu8(cC, cE);
+								c0 = _mm_avg_epu8(c0, c4);
+								c8 = _mm_avg_epu8(c8, cC);
+								c0 = _mm_avg_epu8(c0, c8);
+								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
+								c1 = _mm_avg_epu16(c0__g_, c1__g_);
+								c3 = _mm_avg_epu16(c2__g_, c3__g_);
+								c5 = _mm_avg_epu16(c4__g_, c5__g_);
+								c7 = _mm_avg_epu16(c6__g_, c7__g_);
+								c9 = _mm_avg_epu16(c8__g_, c9__g_);
+								cB = _mm_avg_epu16(cA__g_, cB__g_);
+								cD = _mm_avg_epu16(cC__g_, cD__g_);
+								cF = _mm_avg_epu16(cE__g_, cF__g_);
+								c1 = _mm_avg_epu8(c1, c3);
+								c5 = _mm_avg_epu8(c5, c7);
+								c9 = _mm_avg_epu8(c9, cB);
+								cD = _mm_avg_epu8(cD, cF);
+								c1 = _mm_avg_epu8(c1, c5);
+								c9 = _mm_avg_epu8(c9, cD);
+								c1 = _mm_avg_epu8(c1, c9);
+								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
+								c0 = _mm_or_si128(c0, c1);
+
+								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
+							}
+
+							source0 += pitch;
+							source1 += pitch;
+							source2 += pitch;
+							source3 += pitch;
+							source4 += pitch;
+							source5 += pitch;
+							source6 += pitch;
+							source7 += pitch;
+							source8 += pitch;
+							source9 += pitch;
+							sourceA += pitch;
+							sourceB += pitch;
+							sourceC += pitch;
+							sourceD += pitch;
+							sourceE += pitch;
+							sourceF += pitch;
+						}
+					}
+					else ASSERT(false);
+				}
+				else
+			#endif
+			{
+				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
+
+				if(internal.samples == 2)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+					}
+				}
+				else if(internal.samples == 4)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c0 = AVERAGE(c0, c2);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+					}
+				}
+				else if(internal.samples == 8)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
+							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
+							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
+							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c0 = AVERAGE(c0, c4);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+					}
+				}
+				else if(internal.samples == 16)
+				{
+					for(int y = 0; y < height; y++)
+					{
+						for(int x = 0; x < width; x++)
+						{
+							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
+							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
+							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
+							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
+							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
+							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
+							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
+							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
+							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
+							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
+							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
+							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
+							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
+							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
+							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
+							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
+
+							c0 = AVERAGE(c0, c1);
+							c2 = AVERAGE(c2, c3);
+							c4 = AVERAGE(c4, c5);
+							c6 = AVERAGE(c6, c7);
+							c8 = AVERAGE(c8, c9);
+							cA = AVERAGE(cA, cB);
+							cC = AVERAGE(cC, cD);
+							cE = AVERAGE(cE, cF);
+							c0 = AVERAGE(c0, c2);
+							c4 = AVERAGE(c4, c6);
+							c8 = AVERAGE(c8, cA);
+							cC = AVERAGE(cC, cE);
+							c0 = AVERAGE(c0, c4);
+							c8 = AVERAGE(c8, cC);
+							c0 = AVERAGE(c0, c8);
+
+							*(unsigned short*)(source0 + 2 * x) = c0;
+						}
+
+						source0 += pitch;
+						source1 += pitch;
+						source2 += pitch;
+						source3 += pitch;
+						source4 += pitch;
+						source5 += pitch;
+						source6 += pitch;
+						source7 += pitch;
+						source8 += pitch;
+						source9 += pitch;
+						sourceA += pitch;
+						sourceB += pitch;
+						sourceC += pitch;
+						sourceD += pitch;
+						sourceE += pitch;
+						sourceF += pitch;
+					}
+				}
+				else ASSERT(false);
+
+				#undef AVERAGE
+			}
+		}
+		else
+		{
+		//	UNIMPLEMENTED();
+		}
+	}
+}
diff --git a/src/Device/Surface.hpp b/src/Device/Surface.hpp
new file mode 100644
index 0000000..10c5364
--- /dev/null
+++ b/src/Device/Surface.hpp
@@ -0,0 +1,665 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Surface_hpp
+#define sw_Surface_hpp
+
+#include "Color.hpp"
+#include "Main/Config.hpp"
+#include "Common/Resource.hpp"
+
+namespace sw
+{
+	class Resource;
+
+	template <typename T> struct RectT   // Axis-aligned rectangle with half-open extents: [x0, x1) by [y0, y1).
+	{
+		RectT() {}   // Does not initialize the coordinates (indeterminate for scalar T).
+		RectT(T x0i, T y0i, T x1i, T y1i) : x0(x0i), y0(y0i), x1(x1i), y1(y1i) {}
+
+		void clip(T minX, T minY, T maxX, T maxY)   // Passes every corner coordinate through clamp() with the given bounds.
+		{
+			x0 = clamp(x0, minX, maxX);
+			y0 = clamp(y0, minY, maxY);
+			x1 = clamp(x1, minX, maxX);
+			y1 = clamp(y1, minY, maxY);
+		}
+
+		T width() const  { return x1 - x0; }   // Extent along x; no check that x1 >= x0.
+		T height() const { return y1 - y0; }   // Extent along y; no check that y1 >= y0.
+
+		T x0;   // Inclusive
+		T y0;   // Inclusive
+		T x1;   // Exclusive
+		T y1;   // Exclusive
+	};
+
+	typedef RectT<int> Rect;
+	typedef RectT<float> RectF;
+
+	template<typename T> struct SliceRectT : public RectT<T>   // RectT extended with an integer slice index.
+	{
+		SliceRectT() : slice(0) {}   // Rectangle coordinates are left as RectT's default constructor leaves them.
+		SliceRectT(const RectT<T>& rect) : RectT<T>(rect), slice(0) {}   // Non-explicit: a plain RectT converts implicitly, with slice 0.
+		SliceRectT(const RectT<T>& rect, int s) : RectT<T>(rect), slice(s) {}
+		SliceRectT(T x0, T y0, T x1, T y1, int s) : RectT<T>(x0, y0, x1, y1), slice(s) {}
+		int slice;   // NOTE(review): presumably the depth slice / layer the rectangle refers to — confirm against callers.
+	};
+
+	typedef SliceRectT<int> SliceRect;
+	typedef SliceRectT<float> SliceRectF;
+
+	enum Format : unsigned char   // Surface data formats; the underlying type is one byte, so at most 256 enumerators fit.
+	{
+		FORMAT_NULL,   // First enumerator, value 0.
+
+		FORMAT_A8,
+		FORMAT_R8I,
+		FORMAT_R8UI,
+		FORMAT_R8_SNORM,
+		FORMAT_R8,
+		FORMAT_R16I,
+		FORMAT_R16UI,
+		FORMAT_R32I,
+		FORMAT_R32UI,
+		FORMAT_R3G3B2,
+		FORMAT_A8R3G3B2,
+		FORMAT_X4R4G4B4,
+		FORMAT_A4R4G4B4,
+		FORMAT_R4G4B4A4,
+		FORMAT_R5G6B5,
+		FORMAT_R8G8B8,
+		FORMAT_B8G8R8,
+		FORMAT_X8R8G8B8,
+		FORMAT_A8R8G8B8,
+		FORMAT_X8B8G8R8I,
+		FORMAT_X8B8G8R8UI,
+		FORMAT_X8B8G8R8_SNORM,
+		FORMAT_X8B8G8R8,
+		FORMAT_A8B8G8R8I,
+		FORMAT_A8B8G8R8UI,
+		FORMAT_A8B8G8R8_SNORM,
+		FORMAT_A8B8G8R8,
+		FORMAT_SRGB8_X8,
+		FORMAT_SRGB8_A8,
+		FORMAT_X1R5G5B5,
+		FORMAT_A1R5G5B5,
+		FORMAT_R5G5B5A1,
+		FORMAT_G8R8I,
+		FORMAT_G8R8UI,
+		FORMAT_G8R8_SNORM,
+		FORMAT_G8R8,
+		FORMAT_G16R16,
+		FORMAT_G16R16I,
+		FORMAT_G16R16UI,
+		FORMAT_G32R32I,
+		FORMAT_G32R32UI,
+		FORMAT_A2R10G10B10,
+		FORMAT_A2B10G10R10,
+		FORMAT_A2B10G10R10UI,
+		FORMAT_A16B16G16R16,
+		FORMAT_X16B16G16R16I,
+		FORMAT_X16B16G16R16UI,
+		FORMAT_A16B16G16R16I,
+		FORMAT_A16B16G16R16UI,
+		FORMAT_X32B32G32R32I,
+		FORMAT_X32B32G32R32UI,
+		FORMAT_A32B32G32R32I,
+		FORMAT_A32B32G32R32UI,
+		// Paletted formats
+		FORMAT_P8,
+		FORMAT_A8P8,
+		// Compressed formats
+		FORMAT_DXT1,
+		FORMAT_DXT3,
+		FORMAT_DXT5,
+		FORMAT_ATI1,
+		FORMAT_ATI2,
+		FORMAT_ETC1,
+		FORMAT_R11_EAC,
+		FORMAT_SIGNED_R11_EAC,
+		FORMAT_RG11_EAC,
+		FORMAT_SIGNED_RG11_EAC,
+		FORMAT_RGB8_ETC2,
+		FORMAT_SRGB8_ETC2,
+		FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2,
+		FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2,
+		FORMAT_RGBA8_ETC2_EAC,
+		FORMAT_SRGB8_ALPHA8_ETC2_EAC,
+		FORMAT_RGBA_ASTC_4x4_KHR,
+		FORMAT_RGBA_ASTC_5x4_KHR,
+		FORMAT_RGBA_ASTC_5x5_KHR,
+		FORMAT_RGBA_ASTC_6x5_KHR,
+		FORMAT_RGBA_ASTC_6x6_KHR,
+		FORMAT_RGBA_ASTC_8x5_KHR,
+		FORMAT_RGBA_ASTC_8x6_KHR,
+		FORMAT_RGBA_ASTC_8x8_KHR,
+		FORMAT_RGBA_ASTC_10x5_KHR,
+		FORMAT_RGBA_ASTC_10x6_KHR,
+		FORMAT_RGBA_ASTC_10x8_KHR,
+		FORMAT_RGBA_ASTC_10x10_KHR,
+		FORMAT_RGBA_ASTC_12x10_KHR,
+		FORMAT_RGBA_ASTC_12x12_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR,
+		FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR,
+		// Floating-point formats
+		FORMAT_A16F,
+		FORMAT_R16F,
+		FORMAT_G16R16F,
+		FORMAT_B16G16R16F,
+		FORMAT_X16B16G16R16F,
+		FORMAT_A16B16G16R16F,
+		FORMAT_X16B16G16R16F_UNSIGNED,
+		FORMAT_A32F,
+		FORMAT_R32F,
+		FORMAT_G32R32F,
+		FORMAT_B32G32R32F,
+		FORMAT_X32B32G32R32F,
+		FORMAT_A32B32G32R32F,
+		FORMAT_X32B32G32R32F_UNSIGNED,
+		// Bump map formats
+		FORMAT_V8U8,
+		FORMAT_L6V5U5,
+		FORMAT_Q8W8V8U8,
+		FORMAT_X8L8V8U8,
+		FORMAT_A2W10V10U10,
+		FORMAT_V16U16,
+		FORMAT_A16W16V16U16,
+		FORMAT_Q16W16V16U16,
+		// Luminance formats
+		FORMAT_L8,
+		FORMAT_A4L4,
+		FORMAT_L16,
+		FORMAT_A8L8,
+		FORMAT_L16F,
+		FORMAT_A16L16F,
+		FORMAT_L32F,
+		FORMAT_A32L32F,
+		// Depth/stencil formats
+		FORMAT_D16,
+		FORMAT_D32,
+		FORMAT_D24X8,
+		FORMAT_D24S8,
+		FORMAT_D24FS8,
+		FORMAT_D32F,                 // Quad layout
+		FORMAT_D32FS8,               // Quad layout
+		FORMAT_D32F_COMPLEMENTARY,   // Quad layout, 1 - z
+		FORMAT_D32FS8_COMPLEMENTARY, // Quad layout, 1 - z
+		FORMAT_D32F_LOCKABLE,        // Linear layout
+		FORMAT_D32FS8_TEXTURE,       // Linear layout, no PCF
+		FORMAT_D32F_SHADOW,          // Linear layout, PCF
+		FORMAT_D32FS8_SHADOW,        // Linear layout, PCF
+		FORMAT_DF24S8,
+		FORMAT_DF16S8,
+		FORMAT_INTZ,
+		FORMAT_S8,
+		// Quad layout framebuffer
+		FORMAT_X8G8R8B8Q,
+		FORMAT_A8G8R8B8Q,
+		// YUV formats
+		FORMAT_YV12_BT601,
+		FORMAT_YV12_BT709,
+		FORMAT_YV12_JFIF,    // Full-swing BT.601
+
+		FORMAT_LAST = FORMAT_YV12_JFIF   // Alias of the final enumerator; update it when appending new formats.
+	};
+
+	enum Lock   // Access mode passed to the Surface lock functions and stored in Buffer::lock.
+	{
+		LOCK_UNLOCKED,   // Value Surface::isUnlocked() compares each buffer's lock against.
+		LOCK_READONLY,
+		LOCK_WRITEONLY,
+		LOCK_READWRITE,
+		LOCK_DISCARD,
+		LOCK_UPDATE   // Write access which doesn't dirty the buffer, because it's being updated with the sibling's data.
+	};
+
+	class [[clang::lto_visibility_public]] Surface   // Abstract surface holding external, internal and stencil buffers.
+	{
+	private:
+		struct Buffer   // State of one buffer: storage pointer, dimensions, pitches, format and lock.
+		{
+			friend Surface;   // Everything below is private to Buffer and reachable only from Surface.
+
+		private:
+			void write(int x, int y, int z, const Color<float> &color);
+			void write(int x, int y, const Color<float> &color);
+			void write(void *element, const Color<float> &color);
+			Color<float> read(int x, int y, int z) const;
+			Color<float> read(int x, int y) const;
+			Color<float> read(void *element) const;
+			Color<float> sample(float x, float y, float z) const;
+			Color<float> sample(float x, float y, int layer) const;
+
+			void *lockRect(int x, int y, int z, Lock lock);
+			void unlockRect();
+
+			void *buffer;
+			int width;
+			int height;
+			int depth;
+			short border;
+			short samples;
+
+			int bytes;
+			int pitchB;   // NOTE(review): the B / P suffixes presumably mean bytes / pixels — confirm in Surface.cpp.
+			int pitchP;
+			int sliceB;
+			int sliceP;
+
+			Format format;
+			AtomicInt lock;   // Compared against LOCK_UNLOCKED by Surface::isUnlocked().
+
+			bool dirty;   // Sibling internal/external buffer doesn't match.
+		};
+
+	protected:
+		Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice);   // Protected: see the public create() overloads.
+		Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0);
+
+	public:
+		static Surface *create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice);   // Parameters mirror the first protected constructor.
+		static Surface *create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0);
+
+		virtual ~Surface() = 0;   // Pure virtual destructor: the class is abstract.
+
+		inline void *lock(int x, int y, int z, Lock lock, Accessor client, bool internal = false);   // Forwards to lockInternal() or lockExternal().
+		inline void unlock(bool internal = false);   // Forwards to unlockInternal() or unlockExternal().
+		inline int getWidth() const;    // Read from the external buffer.
+		inline int getHeight() const;   // Read from the external buffer.
+		inline int getDepth() const;    // Read from the external buffer.
+		inline int getBorder() const;   // Read from the internal buffer.
+		inline Format getFormat(bool internal = false) const;
+		inline int getPitchB(bool internal = false) const;
+		inline int getPitchP(bool internal = false) const;
+		inline int getSliceB(bool internal = false) const;
+		inline int getSliceP(bool internal = false) const;
+
+		void *lockExternal(int x, int y, int z, Lock lock, Accessor client);
+		void unlockExternal();
+		inline Format getExternalFormat() const;
+		inline int getExternalPitchB() const;
+		inline int getExternalPitchP() const;
+		inline int getExternalSliceB() const;
+		inline int getExternalSliceP() const;
+
+		virtual void *lockInternal(int x, int y, int z, Lock lock, Accessor client) = 0;   // Must be provided by the concrete surface type.
+		virtual void unlockInternal() = 0;   // Must be provided by the concrete surface type.
+		inline Format getInternalFormat() const;
+		inline int getInternalPitchB() const;
+		inline int getInternalPitchP() const;
+		inline int getInternalSliceB() const;
+		inline int getInternalSliceP() const;
+
+		void *lockStencil(int x, int y, int front, Accessor client);
+		void unlockStencil();
+		inline Format getStencilFormat() const;
+		inline int getStencilPitchB() const;
+		inline int getStencilSliceB() const;
+
+		void sync();                      // Wait for lock(s) to be released.
+		virtual bool requiresSync() const { return false; }
+		inline bool isUnlocked() const;   // Only reliable after sync().
+
+		inline int getSamples() const;            // internal.samples as stored.
+		inline int getMultiSampleCount() const;   // internal.samples capped at 4.
+		inline int getSuperSampleCount() const;   // internal.samples / 4 when above 4, otherwise 1.
+
+		bool isEntire(const Rect& rect) const;
+		Rect getRect() const;
+		void clearDepth(float depth, int x0, int y0, int width, int height);
+		void clearStencil(unsigned char stencil, unsigned char mask, int x0, int y0, int width, int height);
+		void fill(const Color<float> &color, int x0, int y0, int width, int height);
+
+		Color<float> readExternal(int x, int y, int z) const;
+		Color<float> readExternal(int x, int y) const;
+		Color<float> sampleExternal(float x, float y, float z) const;
+		Color<float> sampleExternal(float x, float y) const;
+		void writeExternal(int x, int y, int z, const Color<float> &color);
+		void writeExternal(int x, int y, const Color<float> &color);
+
+		void copyInternal(const Surface* src, int x, int y, float srcX, float srcY, bool filter);
+		void copyInternal(const Surface* src, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter);
+
+		enum Edge { TOP, BOTTOM, RIGHT, LEFT };
+		void copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge);
+		void computeCubeCorner(int x0, int y0, int x1, int y1);
+
+		bool hasStencil() const;
+		bool hasDepth() const;
+		bool hasPalette() const;
+		bool isRenderTarget() const;
+
+		bool hasDirtyContents() const;
+		void markContentsClean();
+		inline bool isExternalDirty() const;   // True only when a distinct external buffer exists and is flagged dirty.
+		Resource *getResource();
+
+		static int bytes(Format format);
+		static int pitchB(int width, int border, Format format, bool target);
+		static int pitchP(int width, int border, Format format, bool target);
+		static int sliceB(int width, int height, int border, Format format, bool target);
+		static int sliceP(int width, int height, int border, Format format, bool target);
+		static size_t size(int width, int height, int depth, int border, int samples, Format format);
+
+		static bool isStencil(Format format);
+		static bool isDepth(Format format);
+		static bool hasQuadLayout(Format format);
+		static bool isPalette(Format format);
+
+		static bool isFloatFormat(Format format);
+		static bool isUnsignedComponent(Format format, int component);
+		static bool isSRGBreadable(Format format);
+		static bool isSRGBwritable(Format format);
+		static bool isSRGBformat(Format format);
+		static bool isCompressed(Format format);
+		static bool isSignedNonNormalizedInteger(Format format);
+		static bool isUnsignedNonNormalizedInteger(Format format);
+		static bool isNonNormalizedInteger(Format format);
+		static bool isNormalizedInteger(Format format);
+		static int componentCount(Format format);
+
+		static void setTexturePalette(unsigned int *palette);
+
+	private:
+		sw::Resource *resource;
+
+		typedef unsigned char byte;
+		typedef unsigned short word;
+		typedef unsigned int dword;
+		typedef uint64_t qword;
+
+		struct DXT1   // Layout used by decodeDXT1(): two 16-bit values followed by a 32-bit table.
+		{
+			word c0;
+			word c1;
+			dword lut;
+		};
+
+		struct DXT3   // 64-bit 'a' field followed by the same three fields as DXT1.
+		{
+			qword a;
+
+			word c0;
+			word c1;
+			dword lut;
+		};
+
+		struct DXT5   // 64-bit union (a0/a1 overlaid on alut) followed by the same three fields as DXT1.
+		{
+			union
+			{
+				struct
+				{
+					byte a0;
+					byte a1;
+				};
+
+				qword alut;   // Skip first 16 bit
+			};
+
+			word c0;
+			word c1;
+			dword clut;
+		};
+
+		struct ATI2   // Two 64-bit unions, each overlaying two bytes on a qword table.
+		{
+			union
+			{
+				struct
+				{
+					byte y0;
+					byte y1;
+				};
+
+				qword ylut;   // Skip first 16 bit
+			};
+
+			union
+			{
+				struct
+				{
+					byte x0;
+					byte x1;
+				};
+
+				qword xlut;   // Skip first 16 bit
+			};
+		};
+
+		struct ATI1   // A single 64-bit union overlaying two bytes on a qword table.
+		{
+			union
+			{
+				struct
+				{
+					byte r0;
+					byte r1;
+				};
+
+				qword rlut;   // Skip first 16 bit
+			};
+		};
+
+		static void decodeR8G8B8(Buffer &destination, Buffer &source);
+		static void decodeX1R5G5B5(Buffer &destination, Buffer &source);
+		static void decodeA1R5G5B5(Buffer &destination, Buffer &source);
+		static void decodeX4R4G4B4(Buffer &destination, Buffer &source);
+		static void decodeA4R4G4B4(Buffer &destination, Buffer &source);
+		static void decodeP8(Buffer &destination, Buffer &source);
+
+		static void decodeDXT1(Buffer &internal, Buffer &external);
+		static void decodeDXT3(Buffer &internal, Buffer &external);
+		static void decodeDXT5(Buffer &internal, Buffer &external);
+		static void decodeATI1(Buffer &internal, Buffer &external);
+		static void decodeATI2(Buffer &internal, Buffer &external);
+		static void decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned);
+		static void decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB);
+		static void decodeASTC(Buffer &internal, Buffer &external, int xSize, int ySize, int zSize, bool isSRGB);
+
+		static void update(Buffer &destination, Buffer &source);
+		static void genericUpdate(Buffer &destination, Buffer &source);
+		static void *allocateBuffer(int width, int height, int depth, int border, int samples, Format format);
+		static void memfill4(void *buffer, int pattern, int bytes);
+
+		bool identicalBuffers() const;
+		Format selectInternalFormat(Format format) const;
+
+		void resolve();
+
+		Buffer external;
+		Buffer internal;
+		Buffer stencil;
+
+		const bool lockable;
+		const bool renderTarget;
+
+		bool dirtyContents;   // Sibling surfaces need updating (mipmaps / cube borders).
+		unsigned int paletteUsed;
+
+		static unsigned int *palette;   // FIXME: Not multi-device safe
+		static unsigned int paletteID;
+
+		bool hasParent;
+		bool ownExternal;
+	};
+}
+
+#undef min
+#undef max
+
+namespace sw   // Definitions of the Surface members declared 'inline' in the class above.
+{
+	void *Surface::lock(int x, int y, int z, Lock lock, Accessor client, bool internal)   // Locks the internal or the external buffer, chosen by 'internal'.
+	{
+		return internal ? lockInternal(x, y, z, lock, client) : lockExternal(x, y, z, lock, client);
+	}
+
+	void Surface::unlock(bool internal)   // Counterpart of lock(); 'internal' selects which buffer to unlock.
+	{
+		return internal ? unlockInternal() : unlockExternal();
+	}
+
+	int Surface::getWidth() const   // Width recorded for the external buffer.
+	{
+		return external.width;
+	}
+
+	int Surface::getHeight() const   // Height recorded for the external buffer.
+	{
+		return external.height;
+	}
+
+	int Surface::getDepth() const   // Depth recorded for the external buffer.
+	{
+		return external.depth;
+	}
+
+	int Surface::getBorder() const   // Border recorded for the internal buffer (not the external one).
+	{
+		return internal.border;
+	}
+
+	Format Surface::getFormat(bool internal) const   // Internal or external format, chosen by 'internal'.
+	{
+		return internal ? getInternalFormat() : getExternalFormat();
+	}
+
+	int Surface::getPitchB(bool internal) const   // pitchB of the internal or external buffer.
+	{
+		return internal ? getInternalPitchB() : getExternalPitchB();
+	}
+
+	int Surface::getPitchP(bool internal) const   // pitchP of the internal or external buffer.
+	{
+		return internal ? getInternalPitchP() : getExternalPitchP();
+	}
+
+	int Surface::getSliceB(bool internal) const   // sliceB of the internal or external buffer.
+	{
+		return internal ? getInternalSliceB() : getExternalSliceB();
+	}
+
+	int Surface::getSliceP(bool internal) const   // sliceP of the internal or external buffer.
+	{
+		return internal ? getInternalSliceP() : getExternalSliceP();
+	}
+
+	Format Surface::getExternalFormat() const   // Plain accessor for external.format.
+	{
+		return external.format;
+	}
+
+	int Surface::getExternalPitchB() const   // Plain accessor for external.pitchB.
+	{
+		return external.pitchB;
+	}
+
+	int Surface::getExternalPitchP() const   // Plain accessor for external.pitchP.
+	{
+		return external.pitchP;
+	}
+
+	int Surface::getExternalSliceB() const   // Plain accessor for external.sliceB.
+	{
+		return external.sliceB;
+	}
+
+	int Surface::getExternalSliceP() const   // Plain accessor for external.sliceP.
+	{
+		return external.sliceP;
+	}
+
+	Format Surface::getInternalFormat() const   // Plain accessor for internal.format.
+	{
+		return internal.format;
+	}
+
+	int Surface::getInternalPitchB() const   // Plain accessor for internal.pitchB.
+	{
+		return internal.pitchB;
+	}
+
+	int Surface::getInternalPitchP() const   // Plain accessor for internal.pitchP.
+	{
+		return internal.pitchP;
+	}
+
+	int Surface::getInternalSliceB() const   // Plain accessor for internal.sliceB.
+	{
+		return internal.sliceB;
+	}
+
+	int Surface::getInternalSliceP() const   // Plain accessor for internal.sliceP.
+	{
+		return internal.sliceP;
+	}
+
+	Format Surface::getStencilFormat() const   // Plain accessor for stencil.format.
+	{
+		return stencil.format;
+	}
+
+	int Surface::getStencilPitchB() const   // Plain accessor for stencil.pitchB.
+	{
+		return stencil.pitchB;
+	}
+
+	int Surface::getStencilSliceB() const   // Plain accessor for stencil.sliceB.
+	{
+		return stencil.sliceB;
+	}
+
+	int Surface::getSamples() const   // Raw sample count stored on the internal buffer.
+	{
+		return internal.samples;
+	}
+
+	int Surface::getMultiSampleCount() const   // Sample count capped at 4.
+	{
+		return sw::min((int)internal.samples, 4);   // Cast needed because 'samples' is a short.
+	}
+
+	int Surface::getSuperSampleCount() const   // Factor by which the sample count exceeds 4, or 1 when it does not.
+	{
+		return internal.samples > 4 ? internal.samples / 4 : 1;
+	}
+
+	bool Surface::isUnlocked() const   // True when none of the three buffers is currently locked.
+	{
+		return external.lock == LOCK_UNLOCKED &&
+		       internal.lock == LOCK_UNLOCKED &&
+		       stencil.lock == LOCK_UNLOCKED;
+	}
+
+	bool Surface::isExternalDirty() const   // Requires a separate external buffer that is marked dirty.
+	{
+		return external.buffer && external.buffer != internal.buffer && external.dirty;
+	}
+}
+
+#endif   // sw_Surface_hpp
diff --git a/src/Device/SwiftConfig.cpp b/src/Device/SwiftConfig.cpp
new file mode 100644
index 0000000..1c22394
--- /dev/null
+++ b/src/Device/SwiftConfig.cpp
@@ -0,0 +1,822 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SwiftConfig.hpp"
+
+#include "Config.hpp"
+#include "Common/Configurator.hpp"
+#include "Common/Debug.hpp"
+#include "Common/Version.h"
+
+#include <sstream>
+#include <stdio.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <algorithm>
+
+namespace sw
+{
+	extern Profiler profiler;
+
+	std::string itoa(int number)   // Returns 'number' formatted by stream insertion.
+	{
+		std::ostringstream formatter;
+		formatter << number;
+		return formatter.str();
+	}
+
+	std::string ftoa(double number)   // Returns 'number' formatted by stream insertion with default precision.
+	{
+		std::ostringstream formatter;
+		formatter << number;
+		return formatter.str();
+	}
+
+	SwiftConfig::SwiftConfig(bool disableServerOverride) : listenSocket(0)   // Reads the configuration and optionally starts the server.
+	{
+		readConfiguration(disableServerOverride);   // Runs first; config.disableServer is consulted further down.
+
+		if(!disableServerOverride)
+		{
+			writeConfiguration();   // Skipped when the caller passed disableServerOverride.
+		}
+
+		receiveBuffer = 0;   // Null marks "no server running"; destroyServer() tests this pointer.
+
+		if(!config.disableServer)
+		{
+			createServer();
+		}
+	}
+
+	SwiftConfig::~SwiftConfig()
+	{
+		destroyServer();   // Does nothing if receiveBuffer is null, i.e. no server is currently running.
+	}
+
+	void SwiftConfig::createServer()   // Allocates the receive buffer, opens the listening socket and starts the server thread.
+	{
+		bufferLength = 16 * 1024;
+		receiveBuffer = new char[bufferLength];   // Non-null receiveBuffer doubles as the "server exists" flag in destroyServer().
+
+		Socket::startup();
+		listenSocket = new Socket("localhost", "8080");   // Host and port are hard-coded.
+		listenSocket->listen();
+
+		terminate = false;   // Cleared before the thread is created; serverLoop() polls this flag.
+		serverThread = new Thread(serverRoutine, this);   // serverRoutine() casts its parameter back to SwiftConfig*.
+	}
+
+	void SwiftConfig::destroyServer()   // Stops the server thread and releases what createServer() allocated.
+	{
+		if(receiveBuffer)   // Set by createServer() and reset below, so a second call is a no-op.
+		{
+			terminate = true;   // serverLoop() checks this flag in both of its loops.
+			serverThread->join();
+			delete serverThread;
+
+			delete listenSocket;
+			listenSocket = 0;
+
+			Socket::cleanup();   // Pairs with Socket::startup() in createServer().
+
+			delete[] receiveBuffer;
+			receiveBuffer = 0;
+		}
+	}
+
+	bool SwiftConfig::hasNewConfiguration(bool reset)
+	{
+		// Snapshot the flag first so the caller sees its value from before any reset.
+		const bool pending = newConfig;
+		if(reset)
+		{
+			newConfig = false;
+		}
+
+		return pending;
+	}
+
+	void SwiftConfig::getConfiguration(Configuration &configuration)   // Copies the current settings into 'configuration'.
+	{
+		criticalSection.lock();   // Same lock respond() holds while applying POSTed settings.
+		configuration = config;
+		criticalSection.unlock();
+	}
+
+	void SwiftConfig::serverRoutine(void *parameters)
+	{
+		// Thread entry point: 'parameters' is the SwiftConfig passed to Thread in createServer().
+		SwiftConfig *self = static_cast<SwiftConfig*>(parameters);
+		self->serverLoop();
+	}
+
+	void SwiftConfig::serverLoop()   // Body of the server thread: accepts clients and answers requests until 'terminate' is set.
+	{
+		readConfiguration();
+
+		while(!terminate)
+		{
+			if(listenSocket->select(100000))   // Returns periodically so 'terminate' gets re-checked.
+			{
+				Socket *clientSocket = listenSocket->accept();
+				int bytesReceived = 1;   // Non-zero so the inner loop is entered.
+
+				while(bytesReceived > 0 && !terminate)   // Ends once receive() reports no more data or shutdown is requested.
+				{
+					if(clientSocket->select(10))
+					{
+						bytesReceived = clientSocket->receive(receiveBuffer, bufferLength - 1);   // Keep one byte free for the terminator written below.
+
+						if(bytesReceived > 0)
+						{
+							receiveBuffer[bytesReceived] = 0;   // respond() parses the request as a C string.
+
+							respond(clientSocket, receiveBuffer);
+						}
+					}
+				}
+
+				delete clientSocket;
+			}
+		}
+	}
+
+	bool match(const char **url, const char *string)   // Consumes 'string' from the front of *url when it is a prefix.
+	{
+		const size_t prefixLength = strlen(string);
+
+		if(strncmp(*url, string, prefixLength) != 0)
+		{
+			return false;   // Not a prefix: *url is left untouched.
+		}
+
+		*url += prefixLength;   // Step past the matched prefix.
+
+		return true;
+	}
+
+	void SwiftConfig::respond(Socket *clientSocket, const char *request)   // Routes one HTTP request; 'request' must be null-terminated.
+	{
+		if(match(&request, "GET /"))
+		{
+			if(match(&request, "swiftshader") || match(&request, "swiftconfig"))
+			{
+				if(match(&request, " ") || match(&request, "/ "))
+				{
+					return send(clientSocket, OK, page());   // Serve the configuration page.
+				}
+			}
+		}
+		else if(match(&request, "POST /"))
+		{
+			if(match(&request, "swiftshader") || match(&request, "swiftconfig"))
+			{
+				if(match(&request, " ") || match(&request, "/ "))
+				{
+					criticalSection.lock();   // Held while the settings change; getConfiguration() takes the same lock.
+
+					const char *postData = strstr(request, "\r\n\r\n");   // Blank line separating headers from body.
+					postData = postData ? postData + 4 : 0;
+
+					if(postData && strlen(postData) > 0)
+					{
+						parsePost(postData);
+					}
+					else   // POST data in next packet
+					{
+						int bytesReceived = clientSocket->receive(receiveBuffer, bufferLength - 1);   // Keep one byte free for the terminator written below.
+
+						if(bytesReceived > 0)
+						{
+							receiveBuffer[bytesReceived] = 0;
+							parsePost(receiveBuffer);
+						}
+					}
+
+					writeConfiguration();
+					newConfig = true;   // Reported to callers by hasNewConfiguration().
+
+					if(config.disableServer)
+					{
+						destroyServer();   // NOTE(review): respond() is reached from serverLoop() on serverThread, which destroyServer() joins — confirm this cannot self-join.
+					}
+
+					criticalSection.unlock();
+
+					return send(clientSocket, OK, page());
+				}
+				else if(match(&request, "/profile "))
+				{
+					return send(clientSocket, OK, profile());   // The page's script polls this path with POST.
+				}
+			}
+		}
+
+		return send(clientSocket, NotFound);   // Anything that did not match a route above.
+	}
+
+	std::string SwiftConfig::page()
+	{
+		std::string html;
+
+		const std::string selected = "selected='selected'";
+		const std::string checked = "checked='checked'";
+		const std::string empty = "";
+
+		html += "<!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01//EN' 'http://www.w3.org/TR/html4/strict.dtd'>\n";
+		html += "<html>\n";
+		html += "<head>\n";
+		html += "<meta http-equiv='content-type' content='text/html; charset=UTF-8'>\n";
+		html += "<title>SwiftShader Configuration Panel</title>\n";
+		html += "</head>\n";
+		html += "<body>\n";
+		html += "<script type='text/javascript'>\n";
+		html += "request();\n";
+		html += "function request()\n";
+		html += "{\n";
+		html += "var xhr = new XMLHttpRequest();\n";
+		html += "xhr.open('POST', '/swiftshader/profile', true);\n";
+		html += "xhr.onreadystatechange = function()\n";
+		html += "{\n";
+		html += "if(xhr.readyState == 4 && xhr.status == 200)\n";
+		html += "{\n";
+		html += "document.getElementById('profile').innerHTML = xhr.responseText;\n";
+		html += "setTimeout('request()', 1000);\n";
+		html += "}\n";
+		html += "}\n";
+		html += "xhr.send();\n";
+		html += "}\n";
+		html += "</script>\n";
+		html += "<form method='POST' action=''>\n";
+		html += "<h1>SwiftShader Configuration Panel</h1>\n";
+		html += "<div id='profile'>" + profile() + "</div>\n";
+		html += "<hr><p>\n";
+		html += "<input type='submit' value='Apply changes' title='Click to apply all settings.'>\n";
+	//	html += "<input type='reset' value='Reset changes' title='Click to reset your changes to the previous value.'>\n";
+		html += "</p><hr>\n";
+		html += "<h2><em>Device capabilities</em></h2>\n";
+		html += "<table>\n";
+		html += "<tr><td>Build revision:</td><td>" REVISION_STRING "</td></tr>\n";
+		html += "<tr><td>Pixel shader model:</td><td><select name='pixelShaderVersion' title='The highest version of pixel shader supported by SwiftShader. Lower versions might be faster if supported by the application. Only effective after restarting the application.'>\n";
+		html += "<option value='0'"  + (config.pixelShaderVersion ==  0 ? selected : empty) + ">0.0</option>\n";
+		html += "<option value='11'" + (config.pixelShaderVersion == 11 ? selected : empty) + ">1.1</option>\n";
+		html += "<option value='12'" + (config.pixelShaderVersion == 12 ? selected : empty) + ">1.2</option>\n";
+		html += "<option value='13'" + (config.pixelShaderVersion == 13 ? selected : empty) + ">1.3</option>\n";
+		html += "<option value='14'" + (config.pixelShaderVersion == 14 ? selected : empty) + ">1.4</option>\n";
+		html += "<option value='20'" + (config.pixelShaderVersion == 20 ? selected : empty) + ">2.0</option>\n";
+		html += "<option value='21'" + (config.pixelShaderVersion == 21 ? selected : empty) + ">2.x</option>\n";
+		html += "<option value='30'" + (config.pixelShaderVersion == 30 ? selected : empty) + ">3.0 (default)</option>\n";
+		html += "</select></td></tr>\n";
+		html += "<tr><td>Vertex shader model:</td><td><select name='vertexShaderVersion' title='The highest version of vertex shader supported by SwiftShader. Lower versions might be faster if supported by the application. Only effective after restarting the application.'>\n";
+		html += "<option value='0'"  + (config.vertexShaderVersion ==  0 ? selected : empty) + ">0.0</option>\n";
+		html += "<option value='11'" + (config.vertexShaderVersion == 11 ? selected : empty) + ">1.1</option>\n";
+		html += "<option value='20'" + (config.vertexShaderVersion == 20 ? selected : empty) + ">2.0</option>\n";
+		html += "<option value='21'" + (config.vertexShaderVersion == 21 ? selected : empty) + ">2.x</option>\n";
+		html += "<option value='30'" + (config.vertexShaderVersion == 30 ? selected : empty) + ">3.0 (default)</option>\n";
+		html += "</select></td></tr>\n";
+		html += "<tr><td>Texture memory:</td><td><select name='textureMemory' title='The maximum amount of memory used for textures and other resources.'>\n";
+		html += "<option value='128'"  + (config.textureMemory == 128  ? selected : empty) + ">128 MB</option>\n";
+		html += "<option value='256'"  + (config.textureMemory == 256  ? selected : empty) + ">256 MB (default)</option>\n";
+		html += "<option value='512'"  + (config.textureMemory == 512  ? selected : empty) + ">512 MB</option>\n";
+		html += "<option value='1024'" + (config.textureMemory == 1024 ? selected : empty) + ">1024 MB</option>\n";
+		html += "<option value='2048'" + (config.textureMemory == 2048 ? selected : empty) + ">2048 MB</option>\n";
+		html += "</select></td></tr>\n";
+		html += "<tr><td>Device identifier:</td><td><select name='identifier' title='The information used by some applications to determine device capabilities.'>\n";
+		html += "<option value='0'" + (config.identifier == 0 ? selected : empty) + ">Google SwiftShader (default)</option>\n";
+		html += "<option value='1'" + (config.identifier == 1 ? selected : empty) + ">NVIDIA GeForce 7900 GS</option>\n";
+		html += "<option value='2'" + (config.identifier == 2 ? selected : empty) + ">ATI Mobility Radeon X1600</option>\n";
+		html += "<option value='3'" + (config.identifier == 3 ? selected : empty) + ">Intel GMA X3100</option>\n";
+		html += "<option value='4'" + (config.identifier == 4 ? selected : empty) + ">System device</option>\n";
+		html += "</select></td></tr>\n";
+		html += "</table>\n";
+		html += "<h2><em>Cache sizes</em></h2>\n";
+		html += "<table>\n";
+		html += "<tr><td>Vertex routine cache size:</td><td><select name='vertexRoutineCacheSize' title='The number of dynamically generated vertex processing routines being cached for reuse. Lower numbers save memory but require more routines to be regenerated.'>\n";
+		html += "<option value='64'"   + (config.vertexRoutineCacheSize == 64   ? selected : empty) + ">64</option>\n";
+		html += "<option value='128'"  + (config.vertexRoutineCacheSize == 128  ? selected : empty) + ">128</option>\n";
+		html += "<option value='256'"  + (config.vertexRoutineCacheSize == 256  ? selected : empty) + ">256</option>\n";
+		html += "<option value='512'"  + (config.vertexRoutineCacheSize == 512  ? selected : empty) + ">512</option>\n";
+		html += "<option value='1024'" + (config.vertexRoutineCacheSize == 1024 ? selected : empty) + ">1024 (default)</option>\n";
+		html += "<option value='2048'" + (config.vertexRoutineCacheSize == 2048 ? selected : empty) + ">2048</option>\n";
+		html += "<option value='4096'" + (config.vertexRoutineCacheSize == 4096 ? selected : empty) + ">4096</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "<tr><td>Pixel routine cache size:</td><td><select name='pixelRoutineCacheSize' title='The number of dynamically generated pixel processing routines being cached for reuse. Lower numbers save memory but require more routines to be regenerated.'>\n";
+		html += "<option value='64'"   + (config.pixelRoutineCacheSize == 64   ? selected : empty) + ">64</option>\n";
+		html += "<option value='128'"  + (config.pixelRoutineCacheSize == 128  ? selected : empty) + ">128</option>\n";
+		html += "<option value='256'"  + (config.pixelRoutineCacheSize == 256  ? selected : empty) + ">256</option>\n";
+		html += "<option value='512'"  + (config.pixelRoutineCacheSize == 512  ? selected : empty) + ">512</option>\n";
+		html += "<option value='1024'" + (config.pixelRoutineCacheSize == 1024 ? selected : empty) + ">1024 (default)</option>\n";
+		html += "<option value='2048'" + (config.pixelRoutineCacheSize == 2048 ? selected : empty) + ">2048</option>\n";
+		html += "<option value='4096'" + (config.pixelRoutineCacheSize == 4096 ? selected : empty) + ">4096</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "<tr><td>Setup routine cache size:</td><td><select name='setupRoutineCacheSize' title='The number of dynamically generated primitive setup routines being cached for reuse. Lower numbers save memory but require more routines to be regenerated.'>\n";
+		html += "<option value='64'"   + (config.setupRoutineCacheSize == 64   ? selected : empty) + ">64</option>\n";
+		html += "<option value='128'"  + (config.setupRoutineCacheSize == 128  ? selected : empty) + ">128</option>\n";
+		html += "<option value='256'"  + (config.setupRoutineCacheSize == 256  ? selected : empty) + ">256</option>\n";
+		html += "<option value='512'"  + (config.setupRoutineCacheSize == 512  ? selected : empty) + ">512</option>\n";
+		html += "<option value='1024'" + (config.setupRoutineCacheSize == 1024 ? selected : empty) + ">1024 (default)</option>\n";
+		html += "<option value='2048'" + (config.setupRoutineCacheSize == 2048 ? selected : empty) + ">2048</option>\n";
+		html += "<option value='4096'" + (config.setupRoutineCacheSize == 4096 ? selected : empty) + ">4096</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "<tr><td>Vertex cache size:</td><td><select name='vertexCacheSize' title='The number of processed vertices being cached for reuse. Lower numbers save memory but require more vertices to be reprocessed.'>\n";
+		html += "<option value='64'"   + (config.vertexCacheSize == 64   ? selected : empty) + ">64 (default)</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "</table>\n";
+		html += "<h2><em>Quality</em></h2>\n";
+		html += "<table>\n";
+		html += "<tr><td>Maximum texture sampling quality:</td><td><select name='textureSampleQuality' title='The maximum texture filtering quality. Lower settings can be faster but cause visual artifacts.'>\n";
+		html += "<option value='0'" + (config.textureSampleQuality == 0 ? selected : empty) + ">Point</option>\n";
+		html += "<option value='1'" + (config.textureSampleQuality == 1 ? selected : empty) + ">Linear</option>\n";
+		html += "<option value='2'" + (config.textureSampleQuality == 2 ? selected : empty) + ">Anisotropic (default)</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "<tr><td>Maximum mipmapping quality:</td><td><select name='mipmapQuality' title='The maximum mipmap filtering quality. Higher settings can be more visually appealing but are slower.'>\n";
+		html += "<option value='0'" + (config.mipmapQuality == 0 ? selected : empty) + ">Point</option>\n";
+		html += "<option value='1'" + (config.mipmapQuality == 1 ? selected : empty) + ">Linear (default)</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "<tr><td>Perspective correction:</td><td><select name='perspectiveCorrection' title='Enables or disables perspective correction. Disabling it is faster but can causes distortion. Recommended for 2D applications only.'>\n";
+		html += "<option value='0'" + (config.perspectiveCorrection == 0 ? selected : empty) + ">Off</option>\n";
+		html += "<option value='1'" + (config.perspectiveCorrection == 1 ? selected : empty) + ">On (default)</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "<tr><td>Transcendental function precision:</td><td><select name='transcendentalPrecision' title='The precision at which log/exp/pow/rcp/rsq/nrm shader instructions are computed. Lower settings can be faster but cause visual artifacts.'>\n";
+		html += "<option value='0'" + (config.transcendentalPrecision == 0 ? selected : empty) + ">Approximate</option>\n";
+		html += "<option value='1'" + (config.transcendentalPrecision == 1 ? selected : empty) + ">Partial</option>\n";
+		html += "<option value='2'" + (config.transcendentalPrecision == 2 ? selected : empty) + ">Accurate (default)</option>\n";
+		html += "<option value='3'" + (config.transcendentalPrecision == 3 ? selected : empty) + ">WHQL</option>\n";
+		html += "<option value='4'" + (config.transcendentalPrecision == 4 ? selected : empty) + ">IEEE</option>\n";
+		html += "</select></td>\n";
+		html += "</tr>\n";
+		html += "<tr><td>Transparency anti-aliasing:</td><td><select name='transparencyAntialiasing' title='The technique used to anti-alias alpha-tested transparent textures.'>\n";
+		html += "<option value='0'" + (config.transparencyAntialiasing == 0 ? selected : empty) + ">None (default)</option>\n";
+		html += "<option value='1'" + (config.transparencyAntialiasing == 1 ? selected : empty) + ">Alpha-to-Coverage</option>\n";
+		html += "</select></td>\n";
+		html += "</table>\n";
+		html += "<h2><em>Processor settings</em></h2>\n";
+		html += "<table>\n";
+		html += "<tr><td>Number of threads:</td><td><select name='threadCount' title='The number of rendering threads to be used.'>\n";
+		html += "<option value='-1'" + (config.threadCount == -1 ? selected : empty) + ">Core count</option>\n";
+		html += "<option value='0'"  + (config.threadCount == 0  ? selected : empty) + ">Process affinity (default)</option>\n";
+		html += "<option value='1'"  + (config.threadCount == 1  ? selected : empty) + ">1</option>\n";
+		html += "<option value='2'"  + (config.threadCount == 2  ? selected : empty) + ">2</option>\n";
+		html += "<option value='3'"  + (config.threadCount == 3  ? selected : empty) + ">3</option>\n";
+		html += "<option value='4'"  + (config.threadCount == 4  ? selected : empty) + ">4</option>\n";
+		html += "<option value='5'"  + (config.threadCount == 5  ? selected : empty) + ">5</option>\n";
+		html += "<option value='6'"  + (config.threadCount == 6  ? selected : empty) + ">6</option>\n";
+		html += "<option value='7'"  + (config.threadCount == 7  ? selected : empty) + ">7</option>\n";
+		html += "<option value='8'"  + (config.threadCount == 8  ? selected : empty) + ">8</option>\n";
+		html += "<option value='9'"  + (config.threadCount == 9  ? selected : empty) + ">9</option>\n";
+		html += "<option value='10'" + (config.threadCount == 10 ? selected : empty) + ">10</option>\n";
+		html += "<option value='11'" + (config.threadCount == 11 ? selected : empty) + ">11</option>\n";
+		html += "<option value='12'" + (config.threadCount == 12 ? selected : empty) + ">12</option>\n";
+		html += "<option value='13'" + (config.threadCount == 13 ? selected : empty) + ">13</option>\n";
+		html += "<option value='14'" + (config.threadCount == 14 ? selected : empty) + ">14</option>\n";
+		html += "<option value='15'" + (config.threadCount == 15 ? selected : empty) + ">15</option>\n";
+		html += "<option value='16'" + (config.threadCount == 16 ? selected : empty) + ">16</option>\n";
+		html += "</select></td></tr>\n";
+		html += "<tr><td>Enable SSE:</td><td><input name = 'enableSSE' type='checkbox'" + (config.enableSSE ? checked : empty) + " disabled='disabled' title='If checked enables the use of SSE instruction set extentions if supported by the CPU.'></td></tr>";
+		html += "<tr><td>Enable SSE2:</td><td><input name = 'enableSSE2' type='checkbox'" + (config.enableSSE2 ? checked : empty) + " title='If checked enables the use of SSE2 instruction set extentions if supported by the CPU.'></td></tr>";
+		html += "<tr><td>Enable SSE3:</td><td><input name = 'enableSSE3' type='checkbox'" + (config.enableSSE3 ? checked : empty) + " title='If checked enables the use of SSE3 instruction set extentions if supported by the CPU.'></td></tr>";
+		html += "<tr><td>Enable SSSE3:</td><td><input name = 'enableSSSE3' type='checkbox'" + (config.enableSSSE3 ? checked : empty) + " title='If checked enables the use of SSSE3 instruction set extentions if supported by the CPU.'></td></tr>";
+		html += "<tr><td>Enable SSE4.1:</td><td><input name = 'enableSSE4_1' type='checkbox'" + (config.enableSSE4_1 ? checked : empty) + " title='If checked enables the use of SSE4.1 instruction set extentions if supported by the CPU.'></td></tr>";
+		html += "</table>\n";
+		html += "<h2><em>Compiler optimizations</em></h2>\n";
+		html += "<table>\n";
+
+		for(int pass = 0; pass < 10; pass++)
+		{
+			html += "<tr><td>Optimization pass " + itoa(pass + 1) + ":</td><td><select name='optimization" + itoa(pass + 1) + "' title='An optimization pass for the shader compiler.'>\n";
+			html += "<option value='0'"  + (config.optimization[pass] == 0  ? selected : empty) + ">Disabled" + (pass > 0 ? " (default)" : "") + "</option>\n";
+			html += "<option value='1'"  + (config.optimization[pass] == 1  ? selected : empty) + ">Instruction Combining" + (pass == 0 ? " (default)" : "") + "</option>\n";
+			html += "<option value='2'"  + (config.optimization[pass] == 2  ? selected : empty) + ">Control Flow Simplification</option>\n";
+			html += "<option value='3'"  + (config.optimization[pass] == 3  ? selected : empty) + ">Loop Invariant Code Motion</option>\n";
+			html += "<option value='4'"  + (config.optimization[pass] == 4  ? selected : empty) + ">Aggressive Dead Code Elimination</option>\n";
+			html += "<option value='5'"  + (config.optimization[pass] == 5  ? selected : empty) + ">Global Value Numbering</option>\n";
+			html += "<option value='6'"  + (config.optimization[pass] == 6  ? selected : empty) + ">Commutative Expressions Reassociation</option>\n";
+			html += "<option value='7'"  + (config.optimization[pass] == 7  ? selected : empty) + ">Dead Store Elimination</option>\n";
+			html += "<option value='8'"  + (config.optimization[pass] == 8  ? selected : empty) + ">Sparse Conditional Copy Propagation</option>\n";
+			html += "<option value='9'"  + (config.optimization[pass] == 9  ? selected : empty) + ">Scalar Replacement of Aggregates</option>\n";
+			html += "</select></td></tr>\n";
+		}
+
+		html += "</table>\n";
+		html += "<h2><em>Testing & Experimental</em></h2>\n";
+		html += "<table>\n";
+		html += "<tr><td>Disable SwiftConfig server:</td><td><input name = 'disableServer' type='checkbox'" + (config.disableServer == true ? checked : empty) + " title='If checked disables the web browser based control panel.'></td></tr>";
+		html += "<tr><td>Force windowed mode:</td><td><input name = 'forceWindowed' type='checkbox'" + (config.forceWindowed == true ? checked : empty) + " title='If checked prevents the application from switching to full-screen mode.'></td></tr>";
+		html += "<tr><td>Complementary depth buffer:</td><td><input name = 'complementaryDepthBuffer' type='checkbox'" + (config.complementaryDepthBuffer == true ? checked : empty) + " title='If checked causes 1 - z to be stored in the depth buffer.'></td></tr>";
+		html += "<tr><td>Post alpha blend sRGB conversion:</td><td><input name = 'postBlendSRGB' type='checkbox'" + (config.postBlendSRGB == true ? checked : empty) + " title='If checked alpha blending is performed in linear color space.'></td></tr>";
+		html += "<tr><td>Exact color rounding:</td><td><input name = 'exactColorRounding' type='checkbox'" + (config.exactColorRounding == true ? checked : empty) + " title='If checked color rounding is done at high accuracy.'></td></tr>";
+		html += "<tr><td>Disable alpha display formats:</td><td><input name = 'disableAlphaMode' type='checkbox'" + (config.disableAlphaMode == true ? checked : empty) + " title='If checked the device does not advertise the A8R8G8B8 display mode.'></td></tr>";
+		html += "<tr><td>Disable 10-bit display formats:</td><td><input name = 'disable10BitMode' type='checkbox'" + (config.disable10BitMode == true ? checked : empty) + " title='If checked the device does not advertise the A2R10G10B10 display mode.'></td></tr>";
+		html += "<tr><td>Frame-buffer API:</td><td><select name='frameBufferAPI' title='The API used for displaying the rendered result on screen (requires restart).'>\n";
+		html += "<option value='0'" + (config.frameBufferAPI == 0 ? selected : empty) + ">DirectDraw (default)</option>\n";
+		html += "<option value='1'" + (config.frameBufferAPI == 1 ? selected : empty) + ">GDI</option>\n";
+		html += "</select></td>\n";
+		html += "<tr><td>DLL precaching:</td><td><input name = 'precache' type='checkbox'" + (config.precache == true ? checked : empty) + " title='If checked dynamically generated routines will be stored in a DLL for faster loading on application restart.'></td></tr>";
+		html += "<tr><td>Shadow mapping extensions:</td><td><select name='shadowMapping' title='Features that may accelerate or improve the quality of shadow mapping.'>\n";
+		html += "<option value='0'" + (config.shadowMapping == 0 ? selected : empty) + ">None</option>\n";
+		html += "<option value='1'" + (config.shadowMapping == 1 ? selected : empty) + ">Fetch4</option>\n";
+		html += "<option value='2'" + (config.shadowMapping == 2 ? selected : empty) + ">DST</option>\n";
+		html += "<option value='3'" + (config.shadowMapping == 3 ? selected : empty) + ">Fetch4 & DST (default)</option>\n";
+		html += "</select></td>\n";
+		html += "<tr><td>Force clearing registers that have no default value:</td><td><input name = 'forceClearRegisters' type='checkbox'" + (config.forceClearRegisters == true ? checked : empty) + " title='Initializes shader register values to 0 even if they have no default.'></td></tr>";
+		html += "</table>\n";
+	#ifndef NDEBUG
+		html += "<h2><em>Debugging</em></h2>\n";
+		html += "<table>\n";
+		html += "<tr><td>Minimum primitives:</td><td><input type='text' size='10' maxlength='10' name='minPrimitives' value='" + itoa(config.minPrimitives) + "'></td></tr>\n";
+		html += "<tr><td>Maximum primitives:</td><td><input type='text' size='10' maxlength='10' name='maxPrimitives' value='" + itoa(config.maxPrimitives) + "'></td></tr>\n";
+		html += "</table>\n";
+	#endif
+		html += "<hr><p>\n";
+		html += "<span style='font-size:10pt'>Hover the mouse pointer over a control to get additional information.</span><br>\n";
+		html += "<span style='font-size:10pt'>Some settings can be applied interactively, some need a restart of the application.</span><br>\n";
+		html += "<span style='font-size:10pt'>Removing the SwiftShader.ini file results in resetting the options to their default.</span></p>\n";
+		html += "</form>\n";
+		html += "</body>\n";
+		html += "</html>\n";
+
+		profiler.reset();
+
+		return html;
+	}
+
+	// Builds the HTML fragment holding the live statistics: FPS and frame count
+	// and, in PERF_PROFILE builds, the operation counters and a bar chart of the
+	// per-timer share of the PERF_PIXEL cycles. The cycle counters are reset to
+	// zero once they have been reported.
+	std::string SwiftConfig::profile()
+	{
+		std::string html;
+
+		html += "<p>FPS: " + ftoa(profiler.FPS) + "</p>\n";
+		html += "<p>Frame: " + itoa(profiler.framesTotal) + "</p>\n";
+
+		#if PERF_PROFILE
+			// The counters are zeroed below after every report, so PERF_PIXEL can
+			// still be zero on the next call. Report an empty chart in that case
+			// instead of dividing by zero.
+			const bool hasPixelCycles = profiler.cycles[PERF_PIXEL] > 0;
+
+			// Shares are expressed in tenths of a percent of the PERF_PIXEL cycles,
+			// which doubles as the bar width in pixels (the chart is 1000px wide).
+			int texTime = hasPixelCycles ? (int)(1000 * profiler.cycles[PERF_TEX] / profiler.cycles[PERF_PIXEL] + 0.5) : 0;
+			int shaderTime = hasPixelCycles ? (int)(1000 * profiler.cycles[PERF_SHADER] / profiler.cycles[PERF_PIXEL] + 0.5) : 0;
+			int pipeTime = hasPixelCycles ? (int)(1000 * profiler.cycles[PERF_PIPE] / profiler.cycles[PERF_PIXEL] + 0.5) : 0;
+			int ropTime = hasPixelCycles ? (int)(1000 * profiler.cycles[PERF_ROP] / profiler.cycles[PERF_PIXEL] + 0.5) : 0;
+			int interpTime = hasPixelCycles ? (int)(1000 * profiler.cycles[PERF_INTERP] / profiler.cycles[PERF_PIXEL] + 0.5) : 0;
+			int rastTime = hasPixelCycles ? 1000 - pipeTime : 0;
+
+			// Subtract the nested timers so each bar only shows its own share
+			// (assumes PIPE encloses SHADER/ROP/INTERP and SHADER encloses TEX;
+			// TODO confirm against the timer instrumentation).
+			pipeTime -= shaderTime + ropTime + interpTime;
+			shaderTime -= texTime;
+
+			double texTimeF = (double)texTime / 10;
+			double shaderTimeF = (double)shaderTime / 10;
+			double pipeTimeF = (double)pipeTime / 10;
+			double ropTimeF = (double)ropTime / 10;
+			double interpTimeF = (double)interpTime / 10;
+			double rastTimeF = (double)rastTime / 10;
+
+			// Per-frame averages in millions; max() avoids dividing by a zero frame count.
+			double averageRopOperations = profiler.ropOperationsTotal / std::max(profiler.framesTotal, 1) / 1.0e6f;
+			double averageCompressedTex = profiler.compressedTexTotal / std::max(profiler.framesTotal, 1) / 1.0e6f;
+			double averageTexOperations = profiler.texOperationsTotal / std::max(profiler.framesTotal, 1) / 1.0e6f;
+
+			html += "<p>Raster operations (million): " + ftoa(profiler.ropOperationsFrame / 1.0e6f) + " (current), " + ftoa(averageRopOperations) + " (average)</p>\n";
+			html += "<p>Texture operations (million): " + ftoa(profiler.texOperationsFrame / 1.0e6f) + " (current), " + ftoa(averageTexOperations) + " (average)</p>\n";
+			html += "<p>Compressed texture operations (million): " + ftoa(profiler.compressedTexFrame / 1.0e6f) + " (current), " + ftoa(averageCompressedTex) + " (average)</p>\n";
+			html += "<div id='profile' style='position:relative; width:1010px; height:50px; background-color:silver;'>";
+			html += "<div style='position:relative; width:1000px; height:40px; background-color:white; left:5px; top:5px;'>";
+			html += "<div style='position:relative; float:left; width:" + itoa(rastTime)   + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#FFFF7F; overflow:hidden;'>" + ftoa(rastTimeF)   + "% rast</div>\n";
+			html += "<div style='position:relative; float:left; width:" + itoa(pipeTime)   + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#FF7F7F; overflow:hidden;'>" + ftoa(pipeTimeF)   + "% pipe</div>\n";
+			html += "<div style='position:relative; float:left; width:" + itoa(interpTime) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#7FFFFF; overflow:hidden;'>" + ftoa(interpTimeF) + "% interp</div>\n";
+			html += "<div style='position:relative; float:left; width:" + itoa(shaderTime) + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#7FFF7F; overflow:hidden;'>" + ftoa(shaderTimeF) + "% shader</div>\n";
+			html += "<div style='position:relative; float:left; width:" + itoa(texTime)    + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#FF7FFF; overflow:hidden;'>" + ftoa(texTimeF)    + "% tex</div>\n";
+			html += "<div style='position:relative; float:left; width:" + itoa(ropTime)    + "px; height:40px; border-style:none; text-align:center; line-height:40px; background-color:#7F7FFF; overflow:hidden;'>" + ftoa(ropTimeF)    + "% rop</div>\n";
+			html += "</div></div>\n";
+
+			// Start a fresh measurement interval for the next report.
+			for(int i = 0; i < PERF_TIMERS; i++)
+			{
+				profiler.cycles[i] = 0;
+			}
+		#endif
+
+		return html;
+	}
+
+	// Sends an HTTP/1.1 response on 'clientSocket': the status line selected by
+	// 'code', a text/html header block carrying the Content-Length of 'body',
+	// and then 'body' itself, all in a single Socket::send() call.
+	void SwiftConfig::send(Socket *clientSocket, Status code, std::string body)
+	{
+		std::string status;
+		char header[1024];
+
+		switch(code)
+		{
+		case OK:       status += "HTTP/1.1 200 OK\r\n";        break;
+		case NotFound: status += "HTTP/1.1 404 Not Found\r\n"; break;
+		}
+
+		// body.size() is an unsigned size_t, which requires %zu (not %zd).
+		// snprintf keeps the write within the header buffer.
+		snprintf(header, sizeof(header), "Content-Type: text/html; charset=UTF-8\r\n"
+										 "Content-Length: %zu\r\n"
+										 "Host: localhost\r\n"
+										 "\r\n", body.size());
+
+		std::string message = status + header + body;
+		clientSocket->send(message.c_str(), (int)message.length());
+	}
+
+	// Applies the settings contained in a form POST body to 'config'.
+	//
+	// 'post' is a null-terminated string of '&'-separated name=value pairs. The
+	// pair at the current position is matched against the known setting names,
+	// after which the cursor advances past the next '&'. A pair that matches
+	// nothing trips ASSERT(false).
+	void SwiftConfig::parsePost(const char *post)
+	{
+		// Only enabled checkboxes appear in the POST, so start from "unchecked".
+		// enableSSE stays on: its checkbox is rendered disabled by page().
+		config.enableSSE = true;
+		config.enableSSE2 = false;
+		config.enableSSE3 = false;
+		config.enableSSSE3 = false;
+		config.enableSSE4_1 = false;
+		config.disableServer = false;
+		config.forceWindowed = false;
+		config.complementaryDepthBuffer = false;
+		config.postBlendSRGB = false;
+		config.exactColorRounding = false;
+		config.disableAlphaMode = false;
+		config.disable10BitMode = false;
+		config.precache = false;
+		config.forceClearRegisters = false;
+
+		const int optimizationPasses = (int)(sizeof(config.optimization) / sizeof(config.optimization[0]));
+
+		while(*post != 0)
+		{
+			int integer;
+			int index;
+
+			// sscanf() returns the number of converted fields, or EOF (which is
+			// non-zero) when the input ends early. Compare against the exact
+			// count so 'integer' is never used without having been assigned.
+			if(sscanf(post, "pixelShaderVersion=%d", &integer) == 1)
+			{
+				config.pixelShaderVersion = integer;
+			}
+			else if(sscanf(post, "vertexShaderVersion=%d", &integer) == 1)
+			{
+				config.vertexShaderVersion = integer;
+			}
+			else if(sscanf(post, "textureMemory=%d", &integer) == 1)
+			{
+				config.textureMemory = integer;
+			}
+			else if(sscanf(post, "identifier=%d", &integer) == 1)
+			{
+				config.identifier = integer;
+			}
+			else if(sscanf(post, "vertexRoutineCacheSize=%d", &integer) == 1)
+			{
+				config.vertexRoutineCacheSize = integer;
+			}
+			else if(sscanf(post, "pixelRoutineCacheSize=%d", &integer) == 1)
+			{
+				config.pixelRoutineCacheSize = integer;
+			}
+			else if(sscanf(post, "setupRoutineCacheSize=%d", &integer) == 1)
+			{
+				config.setupRoutineCacheSize = integer;
+			}
+			else if(sscanf(post, "vertexCacheSize=%d", &integer) == 1)
+			{
+				config.vertexCacheSize = integer;
+			}
+			else if(sscanf(post, "textureSampleQuality=%d", &integer) == 1)
+			{
+				config.textureSampleQuality = integer;
+			}
+			else if(sscanf(post, "mipmapQuality=%d", &integer) == 1)
+			{
+				config.mipmapQuality = integer;
+			}
+			else if(sscanf(post, "perspectiveCorrection=%d", &integer) == 1)
+			{
+				config.perspectiveCorrection = integer != 0;
+			}
+			else if(sscanf(post, "transcendentalPrecision=%d", &integer) == 1)
+			{
+				config.transcendentalPrecision = integer;
+			}
+			else if(sscanf(post, "transparencyAntialiasing=%d", &integer) == 1)
+			{
+				config.transparencyAntialiasing = integer;
+			}
+			else if(sscanf(post, "threadCount=%d", &integer) == 1)
+			{
+				config.threadCount = integer;
+			}
+			else if(sscanf(post, "frameBufferAPI=%d", &integer) == 1)
+			{
+				config.frameBufferAPI = integer;
+			}
+			else if(sscanf(post, "shadowMapping=%d", &integer) == 1)
+			{
+				config.shadowMapping = integer;
+			}
+			// NOTE: strstr() scans the whole remaining body, not just the current
+			// pair, so a checkbox may already be picked up before the cursor
+			// reaches it.
+			else if(strstr(post, "enableSSE=on"))
+			{
+				config.enableSSE = true;
+			}
+			// Each SSE level is only enabled when the level below it is enabled.
+			else if(strstr(post, "enableSSE2=on"))
+			{
+				if(config.enableSSE)
+				{
+					config.enableSSE2 = true;
+				}
+			}
+			else if(strstr(post, "enableSSE3=on"))
+			{
+				if(config.enableSSE2)
+				{
+					config.enableSSE3 = true;
+				}
+			}
+			else if(strstr(post, "enableSSSE3=on"))
+			{
+				if(config.enableSSE3)
+				{
+					config.enableSSSE3 = true;
+				}
+			}
+			else if(strstr(post, "enableSSE4_1=on"))
+			{
+				if(config.enableSSSE3)
+				{
+					config.enableSSE4_1 = true;
+				}
+			}
+			else if(sscanf(post, "optimization%d=%d", &index, &integer) == 2)
+			{
+				// The pass number is client-supplied (1-based); ignore anything
+				// that would index outside the optimization table.
+				if(index >= 1 && index <= optimizationPasses)
+				{
+					config.optimization[index - 1] = (Optimization)integer;
+				}
+			}
+			else if(strstr(post, "disableServer=on"))
+			{
+				config.disableServer = true;
+			}
+			else if(strstr(post, "forceWindowed=on"))
+			{
+				config.forceWindowed = true;
+			}
+			else if(strstr(post, "complementaryDepthBuffer=on"))
+			{
+				config.complementaryDepthBuffer = true;
+			}
+			else if(strstr(post, "postBlendSRGB=on"))
+			{
+				config.postBlendSRGB = true;
+			}
+			else if(strstr(post, "exactColorRounding=on"))
+			{
+				config.exactColorRounding = true;
+			}
+			else if(strstr(post, "disableAlphaMode=on"))
+			{
+				config.disableAlphaMode = true;
+			}
+			else if(strstr(post, "disable10BitMode=on"))
+			{
+				config.disable10BitMode = true;
+			}
+			else if(strstr(post, "precache=on"))
+			{
+				config.precache = true;
+			}
+			else if(strstr(post, "forceClearRegisters=on"))
+			{
+				config.forceClearRegisters = true;
+			}
+		#ifndef NDEBUG
+			else if(sscanf(post, "minPrimitives=%d", &integer) == 1)
+			{
+				config.minPrimitives = integer;
+			}
+			else if(sscanf(post, "maxPrimitives=%d", &integer) == 1)
+			{
+				config.maxPrimitives = integer;
+			}
+		#endif
+			else
+			{
+				ASSERT(false);
+			}
+
+			// Advance to the character following the next '&', or to the
+			// terminating null if this was the last pair.
+			do
+			{
+				post++;
+			}
+			while(post[-1] != '&' && *post != 0);
+		}
+	}
+
+	// Populates 'config' from SwiftShader.ini; the last argument of each getter
+	// is presumably the value used when the key is absent (confirm against
+	// Configurator). Also sets 'newConfig' when the file exists and its
+	// modification time differs by more than one second from the
+	// LastModified/Time value stored in it. 'disableServerOverride' forces
+	// config.disableServer on regardless of the file contents.
+	void SwiftConfig::readConfiguration(bool disableServerOverride)
+	{
+		Configurator ini("SwiftShader.ini");
+
+		// [Capabilities]
+		config.pixelShaderVersion = ini.getInteger("Capabilities", "PixelShaderVersion", 30);
+		config.vertexShaderVersion = ini.getInteger("Capabilities", "VertexShaderVersion", 30);
+		config.textureMemory = ini.getInteger("Capabilities", "TextureMemory", 256);
+		config.identifier = ini.getInteger("Capabilities", "Identifier", 0);
+
+		// [Caches]
+		config.vertexRoutineCacheSize = ini.getInteger("Caches", "VertexRoutineCacheSize", 1024);
+		config.pixelRoutineCacheSize = ini.getInteger("Caches", "PixelRoutineCacheSize", 1024);
+		config.setupRoutineCacheSize = ini.getInteger("Caches", "SetupRoutineCacheSize", 1024);
+		config.vertexCacheSize = ini.getInteger("Caches", "VertexCacheSize", 64);
+
+		// [Quality]
+		config.textureSampleQuality = ini.getInteger("Quality", "TextureSampleQuality", 2);
+		config.mipmapQuality = ini.getInteger("Quality", "MipmapQuality", 1);
+		config.perspectiveCorrection = ini.getBoolean("Quality", "PerspectiveCorrection", true);
+		config.transcendentalPrecision = ini.getInteger("Quality", "TranscendentalPrecision", 2);
+		config.transparencyAntialiasing = ini.getInteger("Quality", "TransparencyAntialiasing", 0);
+
+		// [Processor]
+		config.threadCount = ini.getInteger("Processor", "ThreadCount", DEFAULT_THREAD_COUNT);
+		config.enableSSE = ini.getBoolean("Processor", "EnableSSE", true);
+		config.enableSSE2 = ini.getBoolean("Processor", "EnableSSE2", true);
+		config.enableSSE3 = ini.getBoolean("Processor", "EnableSSE3", true);
+		config.enableSSSE3 = ini.getBoolean("Processor", "EnableSSSE3", true);
+		config.enableSSE4_1 = ini.getBoolean("Processor", "EnableSSE4_1", true);
+
+		// [Optimization]: keys are numbered from 1; only the first pass is on by default.
+		for(int i = 0; i < 10; i++)
+		{
+			const Optimization fallback = (i == 0) ? InstructionCombining : Disabled;
+
+			config.optimization[i] = (Optimization)ini.getInteger("Optimization", "OptimizationPass" + itoa(i + 1), fallback);
+		}
+
+		// [Testing]
+		config.disableServer = ini.getBoolean("Testing", "DisableServer", false);
+		config.forceWindowed = ini.getBoolean("Testing", "ForceWindowed", false);
+		config.complementaryDepthBuffer = ini.getBoolean("Testing", "ComplementaryDepthBuffer", false);
+		config.postBlendSRGB = ini.getBoolean("Testing", "PostBlendSRGB", false);
+		config.exactColorRounding = ini.getBoolean("Testing", "ExactColorRounding", true);
+		config.disableAlphaMode = ini.getBoolean("Testing", "DisableAlphaMode", false);
+		config.disable10BitMode = ini.getBoolean("Testing", "Disable10BitMode", false);
+		config.frameBufferAPI = ini.getInteger("Testing", "FrameBufferAPI", 0);
+		config.precache = ini.getBoolean("Testing", "Precache", false);
+		config.shadowMapping = ini.getInteger("Testing", "ShadowMapping", 3);
+		config.forceClearRegisters = ini.getBoolean("Testing", "ForceClearRegisters", false);
+
+	#ifndef NDEBUG
+		// The debugging limits are not stored in the file; they always start at these values.
+		config.minPrimitives = 1;
+		config.maxPrimitives = 1 << 21;
+	#endif
+
+		// Compare the time stamp recorded inside the file with the file's actual
+		// modification time, allowing one second of slack.
+		const int recordedTime = ini.getInteger("LastModified", "Time", 0);
+		struct stat fileInfo;
+		const bool iniExists = (stat("SwiftShader.ini", &fileInfo) == 0);
+
+		newConfig = iniExists && abs((int)fileInfo.st_mtime - recordedTime) > 1;
+
+		config.disableServer = config.disableServer || disableServerOverride;
+	}
+
+	// Stores the current 'config' values in SwiftShader.ini, together with a
+	// LastModified/Time stamp taken from time(0). EnableSSE and the debug-only
+	// primitive limits are not written.
+	void SwiftConfig::writeConfiguration()
+	{
+		Configurator ini("SwiftShader.ini");
+
+		// [Capabilities]
+		ini.addValue("Capabilities", "PixelShaderVersion", itoa(config.pixelShaderVersion));
+		ini.addValue("Capabilities", "VertexShaderVersion", itoa(config.vertexShaderVersion));
+		ini.addValue("Capabilities", "TextureMemory", itoa(config.textureMemory));
+		ini.addValue("Capabilities", "Identifier", itoa(config.identifier));
+
+		// [Caches]
+		ini.addValue("Caches", "VertexRoutineCacheSize", itoa(config.vertexRoutineCacheSize));
+		ini.addValue("Caches", "PixelRoutineCacheSize", itoa(config.pixelRoutineCacheSize));
+		ini.addValue("Caches", "SetupRoutineCacheSize", itoa(config.setupRoutineCacheSize));
+		ini.addValue("Caches", "VertexCacheSize", itoa(config.vertexCacheSize));
+
+		// [Quality]
+		ini.addValue("Quality", "TextureSampleQuality", itoa(config.textureSampleQuality));
+		ini.addValue("Quality", "MipmapQuality", itoa(config.mipmapQuality));
+		ini.addValue("Quality", "PerspectiveCorrection", itoa(config.perspectiveCorrection));
+		ini.addValue("Quality", "TranscendentalPrecision", itoa(config.transcendentalPrecision));
+		ini.addValue("Quality", "TransparencyAntialiasing", itoa(config.transparencyAntialiasing));
+
+		// [Processor]
+		ini.addValue("Processor", "ThreadCount", itoa(config.threadCount));
+	//	ini.addValue("Processor", "EnableSSE", itoa(config.enableSSE));
+		ini.addValue("Processor", "EnableSSE2", itoa(config.enableSSE2));
+		ini.addValue("Processor", "EnableSSE3", itoa(config.enableSSE3));
+		ini.addValue("Processor", "EnableSSSE3", itoa(config.enableSSSE3));
+		ini.addValue("Processor", "EnableSSE4_1", itoa(config.enableSSE4_1));
+
+		// [Optimization]: keys are numbered from 1.
+		for(int i = 0; i < 10; i++)
+		{
+			ini.addValue("Optimization", "OptimizationPass" + itoa(i + 1), itoa(config.optimization[i]));
+		}
+
+		// [Testing]
+		ini.addValue("Testing", "DisableServer", itoa(config.disableServer));
+		ini.addValue("Testing", "ForceWindowed", itoa(config.forceWindowed));
+		ini.addValue("Testing", "ComplementaryDepthBuffer", itoa(config.complementaryDepthBuffer));
+		ini.addValue("Testing", "PostBlendSRGB", itoa(config.postBlendSRGB));
+		ini.addValue("Testing", "ExactColorRounding", itoa(config.exactColorRounding));
+		ini.addValue("Testing", "DisableAlphaMode", itoa(config.disableAlphaMode));
+		ini.addValue("Testing", "Disable10BitMode", itoa(config.disable10BitMode));
+		ini.addValue("Testing", "FrameBufferAPI", itoa(config.frameBufferAPI));
+		ini.addValue("Testing", "Precache", itoa(config.precache));
+		ini.addValue("Testing", "ShadowMapping", itoa(config.shadowMapping));
+		ini.addValue("Testing", "ForceClearRegisters", itoa(config.forceClearRegisters));
+
+		// Time stamp that readConfiguration() compares with the file's modification time.
+		const int now = (int)time(0);
+		ini.addValue("LastModified", "Time", itoa(now));
+
+		// Text handed to writeFile() alongside the values.
+		const char *banner = "SwiftShader Configuration File\n"
+		                     ";\n"
+		                     "; To get an overview of the valid settings and their meaning,\n"
+		                     "; run the application in windowed mode and open the\n"
+		                     "; SwiftConfig application or go to http://localhost:8080/swiftconfig.";
+
+		ini.writeFile(banner);
+	}
+}
diff --git a/src/Device/SwiftConfig.hpp b/src/Device/SwiftConfig.hpp
new file mode 100644
index 0000000..233b438
--- /dev/null
+++ b/src/Device/SwiftConfig.hpp
@@ -0,0 +1,115 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SwiftConfig_hpp
+#define sw_SwiftConfig_hpp
+
+#include "Reactor/Nucleus.hpp"
+
+#include "Common/Thread.hpp"
+#include "Common/MutexLock.hpp"
+#include "Common/Socket.hpp"
+
+#include <string>
+
+namespace sw
+{
+	// Holds the renderer configuration settings, loads and stores them in
+	// SwiftShader.ini, and serves an HTML control panel over a socket through
+	// which the settings can be viewed and changed.
+	class SwiftConfig
+	{
+	public:
+		// The full set of user-adjustable settings. The values are plain
+		// integers/booleans as they appear in the .ini file and the HTML form.
+		struct Configuration
+		{
+			// Capabilities
+			int pixelShaderVersion;
+			int vertexShaderVersion;
+			int textureMemory;
+			int identifier;
+			// Caches
+			int vertexRoutineCacheSize;
+			int pixelRoutineCacheSize;
+			int setupRoutineCacheSize;
+			int vertexCacheSize;
+			// Quality
+			int textureSampleQuality;
+			int mipmapQuality;
+			bool perspectiveCorrection;
+			int transcendentalPrecision;
+			// Processor
+			int threadCount;
+			bool enableSSE;
+			bool enableSSE2;
+			bool enableSSE3;
+			bool enableSSSE3;
+			bool enableSSE4_1;
+			// Compiler optimization passes, in execution order
+			Optimization optimization[10];
+			// Testing & experimental
+			bool disableServer;
+			// NOTE(review): no load/store/parse code for keepSystemCursor was
+			// found next to the other settings - confirm it is still used.
+			bool keepSystemCursor;
+			bool forceWindowed;
+			bool complementaryDepthBuffer;
+			bool postBlendSRGB;
+			bool exactColorRounding;
+			bool disableAlphaMode;
+			bool disable10BitMode;
+			int transparencyAntialiasing;
+			int frameBufferAPI;
+			bool precache;
+			int shadowMapping;
+			bool forceClearRegisters;
+		#ifndef NDEBUG
+			// Debug builds only; not persisted in the .ini file.
+			unsigned int minPrimitives;
+			unsigned int maxPrimitives;
+		#endif
+		};
+
+		// 'disableServerOverride' is forwarded to readConfiguration(), where it
+		// forces Configuration::disableServer to true.
+		SwiftConfig(bool disableServerOverride);
+
+		~SwiftConfig();
+
+		// Reports whether a new configuration is available (see 'newConfig').
+		// Presumably clears the flag when 'reset' is true - TODO confirm.
+		bool hasNewConfiguration(bool reset = true);
+		// Copies the current settings into 'configuration'.
+		void getConfiguration(Configuration &configuration);
+
+	private:
+		// HTTP status codes that send() can emit.
+		enum Status
+		{
+			OK = 200,
+			NotFound = 404
+		};
+
+		void createServer();
+		void destroyServer();
+
+		// Static thread entry point for the server thread.
+		static void serverRoutine(void *parameters);
+
+		void serverLoop();
+		void respond(Socket *clientSocket, const char *request);
+		// Generates the HTML of the configuration page.
+		std::string page();
+		// Generates the HTML fragment with the profiling statistics.
+		std::string profile();
+		// Sends an HTTP response with the given status and HTML body.
+		void send(Socket *clientSocket, Status code, std::string body = "");
+		// Applies the name=value pairs of a form POST body to 'config'.
+		void parsePost(const char *post);
+
+		// Load/store 'config' from/to SwiftShader.ini.
+		void readConfiguration(bool disableServerOverride = false);
+		void writeConfiguration();
+
+		Configuration config;
+
+		Thread *serverThread;
+		// NOTE(review): presumably the stop flag polled by the server thread;
+		// volatile alone does not synchronize between threads - confirm usage.
+		volatile bool terminate;
+		MutexLock criticalSection;   // Protects reading and writing the configuration settings
+
+		// Set by readConfiguration() when SwiftShader.ini exists and its
+		// modification time does not match the time stamp stored inside it.
+		bool newConfig;
+
+		Socket *listenSocket;
+
+		// Buffer for incoming requests and its size (presumably in bytes).
+		int bufferLength;
+		char *receiveBuffer;
+	};
+}
+
+#endif   // sw_SwiftConfig_hpp
diff --git a/src/Device/TextureStage.cpp b/src/Device/TextureStage.cpp
new file mode 100644
index 0000000..0327478
--- /dev/null
+++ b/src/Device/TextureStage.cpp
@@ -0,0 +1,412 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "TextureStage.hpp"
+
+#include "Sampler.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+	TextureStage::State::State()   // Starts with every field cleared.
+	{
+		memset(this, 0, sizeof(State));   // Zero-fills the whole object: all bit-fields plus any padding bits.
+	}
+
+	TextureStage::TextureStage() : sampler(0), previousStage(0)   // Only the two pointers are nulled; the stage settings are assigned later by init().
+	{
+	}
+
+	TextureStage::~TextureStage()   // No cleanup is performed.
+	{
+	}
+
+	void TextureStage::init(int stage, const Sampler *sampler, const TextureStage *previousStage)
+	{
+		const bool firstStage = (stage == 0);   // Only stage 0 starts out enabled.
+		this->stage = stage;
+		texCoordIndex = stage;   // Each stage initially reads the coordinate set with its own index.
+		this->sampler = sampler;
+		this->previousStage = previousStage;
+
+		stageOperation = firstStage ? STAGE_MODULATE : STAGE_DISABLE;   // Color channel defaults.
+		firstArgument = SOURCE_TEXTURE;
+		secondArgument = SOURCE_CURRENT;
+		thirdArgument = SOURCE_CURRENT;
+		firstModifier = MODIFIER_COLOR;
+		secondModifier = MODIFIER_COLOR;
+		thirdModifier = MODIFIER_COLOR;
+		stageOperationAlpha = firstStage ? STAGE_SELECTARG1 : STAGE_DISABLE;   // Alpha channel defaults.
+		firstArgumentAlpha = SOURCE_DIFFUSE;
+		secondArgumentAlpha = SOURCE_CURRENT;
+		thirdArgumentAlpha = SOURCE_CURRENT;
+		firstModifierAlpha = MODIFIER_COLOR;
+		secondModifierAlpha = MODIFIER_COLOR;
+		thirdModifierAlpha = MODIFIER_COLOR;
+		destinationArgument = DESTINATION_CURRENT;
+	}
+
+	TextureStage::State TextureStage::textureStageState() const   // Packs the current settings into a State value.
+	{
+		State state;   // The State constructor zero-fills every field.
+
+		if(!isStageDisabled())   // A disabled stage is reported as the all-zero State.
+		{
+			state.stageOperation = stageOperation;
+			state.firstArgument = firstArgument;
+			state.secondArgument = secondArgument;
+			state.thirdArgument = thirdArgument;
+			state.stageOperationAlpha = stageOperationAlpha;
+			state.firstArgumentAlpha = firstArgumentAlpha;
+			state.secondArgumentAlpha = secondArgumentAlpha;
+			state.thirdArgumentAlpha = thirdArgumentAlpha;
+			state.firstModifier = firstModifier;
+			state.secondModifier = secondModifier;
+			state.thirdModifier = thirdModifier;
+			state.firstModifierAlpha = firstModifierAlpha;
+			state.secondModifierAlpha = secondModifierAlpha;
+			state.thirdModifierAlpha = thirdModifierAlpha;
+			state.destinationArgument = destinationArgument;
+			state.texCoordIndex = texCoordIndex;
+
+			state.cantUnderflow = sampler->hasUnsignedTexture() || !usesTexture();   // Set when the sampler reports an unsigned texture or no texture is read.
+			state.usesTexture = usesTexture();
+		}
+
+		return state;
+	}
+
+	void TextureStage::setConstantColor(const Color<float> &constantColor)   // Stores the color as shorts scaled by 4095, one row per channel.
+	{
+		// FIXME: Compact into generic function   // FIXME: Clamp
+		short r = iround(4095 * constantColor.r);   // No clamping: the result is narrowed to short as-is.
+		short g = iround(4095 * constantColor.g);
+		short b = iround(4095 * constantColor.b);
+		short a = iround(4095 * constantColor.a);
+
+		uniforms.constantColor4[0][0] = uniforms.constantColor4[0][1] = uniforms.constantColor4[0][2] = uniforms.constantColor4[0][3] = r;   // Row 0: red in all four lanes.
+		uniforms.constantColor4[1][0] = uniforms.constantColor4[1][1] = uniforms.constantColor4[1][2] = uniforms.constantColor4[1][3] = g;   // Row 1: green.
+		uniforms.constantColor4[2][0] = uniforms.constantColor4[2][1] = uniforms.constantColor4[2][2] = uniforms.constantColor4[2][3] = b;   // Row 2: blue.
+		uniforms.constantColor4[3][0] = uniforms.constantColor4[3][1] = uniforms.constantColor4[3][2] = uniforms.constantColor4[3][3] = a;   // Row 3: alpha.
+	}
+
+	void TextureStage::setBumpmapMatrix(int element, float value)
+	{
+		const int row = element / 2;   // 'element' addresses the 2x2 matrix in row-major order.
+		const int col = element % 2;
+		const auto fixedPoint = iround(4095 * value);   // Same value, scaled by 4095 for the integer copy.
+
+		for(int lane = 0; lane < 4; lane++)   // Replicate into all four lanes of both copies.
+		{
+			uniforms.bumpmapMatrix4F[row][col][lane] = value;
+			uniforms.bumpmapMatrix4W[row][col][lane] = fixedPoint;
+		}
+	}
+
+	void TextureStage::setLuminanceScale(float value)   // Stores the scale, multiplied by 4095 and rounded, in all four lanes.
+	{
+		short scale = iround(4095 * value);   // Narrowed to short without clamping.
+
+		uniforms.luminanceScale4[0] = uniforms.luminanceScale4[1] = uniforms.luminanceScale4[2] = uniforms.luminanceScale4[3] = scale;
+	}
+
+	void TextureStage::setLuminanceOffset(float value)   // Stores the offset, multiplied by 4095 and rounded, in all four lanes.
+	{
+		short offset = iround(4095 * value);   // Narrowed to short without clamping.
+
+		uniforms.luminanceOffset4[0] = uniforms.luminanceOffset4[1] = uniforms.luminanceOffset4[2] = uniforms.luminanceOffset4[3] = offset;
+	}
+
+	void TextureStage::setTexCoordIndex(unsigned int texCoordIndex)   // Selects which texture coordinate set this stage uses.
+	{
+		ASSERT(texCoordIndex < 8);   // Only indices 0 through 7 are accepted.
+
+		this->texCoordIndex = texCoordIndex;   // Stored in an int member.
+	}
+
+	void TextureStage::setStageOperation(StageOperation stageOperation)   // Sets the operation for the color channels.
+	{
+		this->stageOperation = stageOperation;
+	}
+
+	void TextureStage::setFirstArgument(SourceArgument firstArgument)   // Sets the first input of the color operation.
+	{
+		this->firstArgument = firstArgument;
+	}
+
+	void TextureStage::setSecondArgument(SourceArgument secondArgument)   // Sets the second input of the color operation.
+	{
+		this->secondArgument = secondArgument;
+	}
+
+	void TextureStage::setThirdArgument(SourceArgument thirdArgument)   // Sets the third input of the color operation.
+	{
+		this->thirdArgument = thirdArgument;
+	}
+
+	void TextureStage::setStageOperationAlpha(StageOperation stageOperationAlpha)   // Sets the operation for the alpha channel.
+	{
+		this->stageOperationAlpha = stageOperationAlpha;
+	}
+
+	void TextureStage::setFirstArgumentAlpha(SourceArgument firstArgumentAlpha)   // Sets the first input of the alpha operation.
+	{
+		this->firstArgumentAlpha = firstArgumentAlpha;
+	}
+
+	void TextureStage::setSecondArgumentAlpha(SourceArgument secondArgumentAlpha)   // Sets the second input of the alpha operation.
+	{
+		this->secondArgumentAlpha = secondArgumentAlpha;
+	}
+
+	void TextureStage::setThirdArgumentAlpha(SourceArgument thirdArgumentAlpha)   // Sets the third input of the alpha operation.
+	{
+		this->thirdArgumentAlpha = thirdArgumentAlpha;
+	}
+
+	void TextureStage::setFirstModifier(ArgumentModifier firstModifier)   // Sets the modifier of the first color argument.
+	{
+		this->firstModifier = firstModifier;
+	}
+
+	void TextureStage::setSecondModifier(ArgumentModifier secondModifier)   // Sets the modifier of the second color argument.
+	{
+		this->secondModifier = secondModifier;
+	}
+
+	void TextureStage::setThirdModifier(ArgumentModifier thirdModifier)   // Sets the modifier of the third color argument.
+	{
+		this->thirdModifier = thirdModifier;
+	}
+
+	void TextureStage::setFirstModifierAlpha(ArgumentModifier firstModifierAlpha)   // Sets the modifier of the first alpha argument.
+	{
+		this->firstModifierAlpha = firstModifierAlpha;
+	}
+
+	void TextureStage::setSecondModifierAlpha(ArgumentModifier secondModifierAlpha)   // Sets the modifier of the second alpha argument.
+	{
+		this->secondModifierAlpha = secondModifierAlpha;
+	}
+
+	void TextureStage::setThirdModifierAlpha(ArgumentModifier thirdModifierAlpha)   // Sets the modifier of the third alpha argument.
+	{
+		this->thirdModifierAlpha = thirdModifierAlpha;
+	}
+
+	void TextureStage::setDestinationArgument(DestinationArgument destinationArgument)   // Chooses between DESTINATION_CURRENT and DESTINATION_TEMP for the result.
+	{
+		this->destinationArgument = destinationArgument;
+	}
+
+	bool TextureStage::usesColor(SourceArgument source) const   // True if the color operation names 'source' in an argument slot it consumes.
+	{
+		// One argument
+		if(stageOperation == STAGE_SELECTARG1 || stageOperation == STAGE_PREMODULATE)
+		{
+			return firstArgument == source;
+		}
+		else if(stageOperation == STAGE_SELECTARG2)
+		{
+			return secondArgument == source;
+		}
+		else if(stageOperation == STAGE_SELECTARG3)
+		{
+			return thirdArgument == source;
+		}
+		else   // Every other operation, STAGE_DISABLE included, is handled here.
+		{
+			// Two arguments or more
+			if(firstArgument == source || secondArgument == source)
+			{
+				return true;
+			}
+
+			// Three arguments
+			if(stageOperation == STAGE_MULTIPLYADD || stageOperation == STAGE_LERP)
+			{
+				return thirdArgument == source;
+			}
+		}
+	
+		return false;
+	}
+
+	bool TextureStage::usesAlpha(SourceArgument source) const   // True if the alpha of 'source' is read by this stage; always false when the alpha operation is disabled.
+	{
+		if(stageOperationAlpha == STAGE_DISABLE)
+		{
+			return false;
+		}
+
+		if(source == SOURCE_TEXTURE)   // Part 1: color blend operations tied to the alpha of one specific source.
+		{
+			if(stageOperation == STAGE_BLENDTEXTUREALPHA ||	stageOperation == STAGE_BLENDTEXTUREALPHAPM)
+			{
+				return true;
+			}
+		}
+		else if(source == SOURCE_CURRENT)
+		{
+			if(stageOperation == STAGE_BLENDCURRENTALPHA)
+			{
+				return true;
+			}
+		}
+		else if(source == SOURCE_DIFFUSE)
+		{
+			if(stageOperation == STAGE_BLENDDIFFUSEALPHA)
+			{
+				return true;
+			}
+		}
+		else if(source == SOURCE_TFACTOR)
+		{
+			if(stageOperation == STAGE_BLENDFACTORALPHA)
+			{
+				return true;
+			}
+		}
+
+		// One argument
+		if(stageOperation == STAGE_SELECTARG1 || stageOperation == STAGE_PREMODULATE)   // Part 2: color arguments whose modifier is ALPHA or INVALPHA.
+		{
+			if(firstArgument == source && (firstModifier == MODIFIER_ALPHA || firstModifier == MODIFIER_INVALPHA))
+			{
+				return true;
+			}
+		}
+		else if(stageOperation == STAGE_SELECTARG2)
+		{
+			if(secondArgument == source && (secondModifier == MODIFIER_ALPHA || secondModifier == MODIFIER_INVALPHA))
+			{
+				return true;
+			}
+		}
+		else if(stageOperation == STAGE_SELECTARG3)
+		{
+			if(thirdArgument == source && (thirdModifier == MODIFIER_ALPHA || thirdModifier == MODIFIER_INVALPHA))
+			{
+				return true;
+			}
+		}
+		else
+		{
+			// Two arguments or more
+			if(firstArgument == source || secondArgument == source)
+			{
+				if(firstArgument == source && (firstModifier == MODIFIER_ALPHA || firstModifier == MODIFIER_INVALPHA))
+				{
+					return true;
+				}
+
+				if(secondArgument == source && (secondModifier == MODIFIER_ALPHA || secondModifier == MODIFIER_INVALPHA))
+				{
+					return true;
+				}
+			}
+
+			// Three arguments
+			if(stageOperation == STAGE_MULTIPLYADD || stageOperation == STAGE_LERP)
+			{
+				if(thirdArgument == source && (thirdModifier == MODIFIER_ALPHA || thirdModifier == MODIFIER_INVALPHA))
+				{
+					return true;
+				}
+			}
+		}
+
+		// One argument
+		if(stageOperationAlpha == STAGE_SELECTARG1 || stageOperationAlpha == STAGE_PREMODULATE)   // Part 3: arguments consumed by the alpha operation itself.
+		{
+			return firstArgumentAlpha == source;
+		}
+		else if(stageOperationAlpha == STAGE_SELECTARG2)
+		{
+			return secondArgumentAlpha == source;
+		}
+		else if(stageOperationAlpha == STAGE_SELECTARG3)
+		{
+			return thirdArgumentAlpha == source;
+		}
+		else
+		{
+			// Two arguments or more
+			if(firstArgumentAlpha == source || secondArgumentAlpha == source)
+			{
+				return true;
+			}
+
+			// Three arguments
+			if(stageOperationAlpha == STAGE_MULTIPLYADD || stageOperationAlpha == STAGE_LERP)
+			{
+				return thirdArgumentAlpha == source;
+			}
+		}
+		
+		return false;
+	}
+
+	bool TextureStage::uses(SourceArgument source) const   // True if either the color or the alpha path reads 'source'.
+	{
+		return usesColor(source) || usesAlpha(source);
+	}
+
+	bool TextureStage::usesCurrent() const   // SOURCE_CURRENT is read as an argument, or either operation is STAGE_BLENDCURRENTALPHA.
+	{
+		return uses(SOURCE_CURRENT) || (stageOperation == STAGE_BLENDCURRENTALPHA || stageOperationAlpha == STAGE_BLENDCURRENTALPHA);
+	}
+
+	bool TextureStage::usesDiffuse() const   // SOURCE_DIFFUSE is read as an argument, or either operation is STAGE_BLENDDIFFUSEALPHA.
+	{
+		return uses(SOURCE_DIFFUSE) || (stageOperation == STAGE_BLENDDIFFUSEALPHA || stageOperationAlpha == STAGE_BLENDDIFFUSEALPHA);
+	}
+
+	bool TextureStage::usesSpecular() const   // SOURCE_SPECULAR is read by the color or alpha path.
+	{
+		return uses(SOURCE_SPECULAR);
+	}
+
+	bool TextureStage::usesTexture() const   // True if this stage needs its texture for any of the reasons listed below.
+	{
+		return uses(SOURCE_TEXTURE) ||   // Named as an argument.
+		       stageOperation == STAGE_BLENDTEXTUREALPHA ||   // Blend operations keyed on the texture.
+		       stageOperationAlpha == STAGE_BLENDTEXTUREALPHA ||
+		       stageOperation == STAGE_BLENDTEXTUREALPHAPM ||
+		       stageOperationAlpha == STAGE_BLENDTEXTUREALPHAPM ||
+		       (previousStage && previousStage->stageOperation == STAGE_PREMODULATE) ||   // The previous stage's PREMODULATE also counts as a use here.
+		       (previousStage && previousStage->stageOperationAlpha == STAGE_PREMODULATE);
+	}
+
+	bool TextureStage::isStageDisabled() const   // A stage is disabled if it, or any stage reachable through previousStage, is disabled.
+	{
+		bool disabled = (stageOperation == STAGE_DISABLE) || (!sampler->hasTexture() && usesTexture());   // Also disabled when a needed texture is missing; 'sampler' is dereferenced without a null check.
+
+		if(!previousStage || disabled)
+		{
+			return disabled;
+		}
+		else
+		{
+			return previousStage->isStageDisabled();   // Recurses along the previousStage chain.
+		}
+	}
+
+	bool TextureStage::writesCurrent() const   // Enabled stage targeting DESTINATION_CURRENT, excluding the two bump environment map operations.
+	{
+		return !isStageDisabled() && destinationArgument == DESTINATION_CURRENT && stageOperation != STAGE_BUMPENVMAP && stageOperation != STAGE_BUMPENVMAPLUMINANCE;
+	}
+}
diff --git a/src/Device/TextureStage.hpp b/src/Device/TextureStage.hpp
new file mode 100644
index 0000000..2c9ecbd
--- /dev/null
+++ b/src/Device/TextureStage.hpp
@@ -0,0 +1,198 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_TextureStage_hpp
+#define sw_TextureStage_hpp
+
+#include "Common/Types.hpp"
+#include "Common/Math.hpp"
+#include "Renderer/Color.hpp"
+
+namespace sw
+{
+	class Sampler;
+	class PixelRoutine;
+	class Context;
+
+	class TextureStage   // One texture blending stage: its settings, the packed State derived from them, and its constant data.
+	{
+		friend class Context;   // FIXME
+
+	public:
+		enum StageOperation   // Operations selectable for the color path and for the alpha path.
+		{
+			STAGE_DISABLE,
+			STAGE_SELECTARG1,
+			STAGE_SELECTARG2,
+			STAGE_SELECTARG3,
+			STAGE_MODULATE,
+			STAGE_MODULATE2X,
+			STAGE_MODULATE4X,
+			STAGE_ADD,
+			STAGE_ADDSIGNED,
+			STAGE_ADDSIGNED2X,
+			STAGE_SUBTRACT,
+			STAGE_ADDSMOOTH,
+			STAGE_MULTIPLYADD,
+			STAGE_LERP,
+			STAGE_DOT3,
+			STAGE_BLENDCURRENTALPHA,
+			STAGE_BLENDDIFFUSEALPHA,
+			STAGE_BLENDFACTORALPHA,
+			STAGE_BLENDTEXTUREALPHA,
+			STAGE_BLENDTEXTUREALPHAPM,
+			STAGE_PREMODULATE,
+			STAGE_MODULATEALPHA_ADDCOLOR,
+			STAGE_MODULATECOLOR_ADDALPHA,
+			STAGE_MODULATEINVALPHA_ADDCOLOR,
+			STAGE_MODULATEINVCOLOR_ADDALPHA,
+			STAGE_BUMPENVMAP,
+			STAGE_BUMPENVMAPLUMINANCE,
+
+			STAGE_LAST = STAGE_BUMPENVMAPLUMINANCE   // Highest value; sizes the State bit-fields.
+		};
+
+		enum SourceArgument   // Inputs that an operation argument can name.
+		{
+			SOURCE_TEXTURE,
+			SOURCE_CONSTANT,
+			SOURCE_CURRENT,
+			SOURCE_DIFFUSE,
+			SOURCE_SPECULAR,
+			SOURCE_TEMP,
+			SOURCE_TFACTOR,
+
+			SOURCE_LAST = SOURCE_TFACTOR   // Highest value; sizes the State bit-fields.
+		};
+
+		enum DestinationArgument   // Targets a stage result can be directed to.
+		{
+			DESTINATION_CURRENT,
+			DESTINATION_TEMP,
+
+			DESTINATION_LAST = DESTINATION_TEMP   // Highest value; sizes the State bit-field.
+		};
+
+		enum ArgumentModifier   // Per-argument modifier selection.
+		{
+			MODIFIER_COLOR,
+			MODIFIER_INVCOLOR,
+			MODIFIER_ALPHA,
+			MODIFIER_INVALPHA,
+
+			MODIFIER_LAST = MODIFIER_INVALPHA   // Highest value; sizes the State bit-fields.
+		};
+
+		struct State   // Bit-packed copy of the settings, as produced by textureStageState().
+		{
+			State();   // Zero-fills the object.
+
+			unsigned int stageOperation			: BITS(STAGE_LAST);
+			unsigned int firstArgument			: BITS(SOURCE_LAST);
+			unsigned int secondArgument			: BITS(SOURCE_LAST);
+			unsigned int thirdArgument			: BITS(SOURCE_LAST);
+			unsigned int stageOperationAlpha	: BITS(STAGE_LAST);
+			unsigned int firstArgumentAlpha		: BITS(SOURCE_LAST);
+			unsigned int secondArgumentAlpha	: BITS(SOURCE_LAST);
+			unsigned int thirdArgumentAlpha		: BITS(SOURCE_LAST);
+			unsigned int firstModifier			: BITS(MODIFIER_LAST);
+			unsigned int secondModifier			: BITS(MODIFIER_LAST);
+			unsigned int thirdModifier			: BITS(MODIFIER_LAST);
+			unsigned int firstModifierAlpha		: BITS(MODIFIER_LAST);
+			unsigned int secondModifierAlpha	: BITS(MODIFIER_LAST);
+			unsigned int thirdModifierAlpha		: BITS(MODIFIER_LAST);
+			unsigned int destinationArgument	: BITS(DESTINATION_LAST);
+			unsigned int texCoordIndex			: BITS(7);   // setTexCoordIndex() asserts the index is below 8.
+
+			unsigned int cantUnderflow			: 1;   // Sampler reports an unsigned texture, or no texture is used.
+			unsigned int usesTexture			: 1;
+		};
+
+		struct Uniforms   // Constant data filled in by setConstantColor(), setBumpmapMatrix() and the luminance setters.
+		{
+			word4 constantColor4[4];   // One row per channel (r, g, b, a), the value replicated in every lane.
+			float4 bumpmapMatrix4F[2][2];   // 2x2 matrix, each element replicated in every lane.
+			word4 bumpmapMatrix4W[2][2];   // Same matrix scaled by 4095 and rounded.
+			word4 luminanceScale4;
+			word4 luminanceOffset4;
+		};
+
+		TextureStage();
+
+		~TextureStage();
+
+		void init(int stage, const Sampler *sampler, const TextureStage *previousStage);   // Assigns the default settings and links the stage to its sampler and predecessor.
+
+		State textureStageState() const;   // All-zero State when the stage is disabled.
+
+		void setConstantColor(const Color<float> &constantColor);
+		void setBumpmapMatrix(int element, float value);   // 'element' is split as element / 2 (row) and element % 2 (column).
+		void setLuminanceScale(float value);
+		void setLuminanceOffset(float value);
+
+		void setTexCoordIndex(unsigned int texCoordIndex);
+		void setStageOperation(StageOperation stageOperation);
+		void setFirstArgument(SourceArgument firstArgument);
+		void setSecondArgument(SourceArgument secondArgument);
+		void setThirdArgument(SourceArgument thirdArgument);
+		void setStageOperationAlpha(StageOperation stageOperationAlpha);
+		void setFirstArgumentAlpha(SourceArgument firstArgumentAlpha);
+		void setSecondArgumentAlpha(SourceArgument secondArgumentAlpha);
+		void setThirdArgumentAlpha(SourceArgument thirdArgumentAlpha);
+		void setFirstModifier(ArgumentModifier firstModifier);
+		void setSecondModifier(ArgumentModifier secondModifier);
+		void setThirdModifier(ArgumentModifier thirdModifier);
+		void setFirstModifierAlpha(ArgumentModifier firstModifierAlpha);
+		void setSecondModifierAlpha(ArgumentModifier secondModifierAlpha);
+		void setThirdModifierAlpha(ArgumentModifier thirdModifierAlpha);
+		void setDestinationArgument(DestinationArgument destinationArgument);
+
+		Uniforms uniforms;   // FIXME: Private
+
+	private:
+		bool usesColor(SourceArgument source) const;   // Color operation reads 'source'.
+		bool usesAlpha(SourceArgument source) const;   // Alpha of 'source' is read.
+		bool uses(SourceArgument source) const;   // usesColor() || usesAlpha().
+		bool usesCurrent() const;
+		bool usesDiffuse() const;
+		bool usesSpecular() const;
+		bool usesTexture() const;
+		bool isStageDisabled() const;   // Also true when an earlier stage in the chain is disabled.
+		bool writesCurrent() const;
+
+		int stage;   // Index passed to init().
+
+		StageOperation stageOperation;
+		SourceArgument firstArgument;
+		SourceArgument secondArgument;
+		SourceArgument thirdArgument;
+		StageOperation stageOperationAlpha;
+		SourceArgument firstArgumentAlpha;
+		SourceArgument secondArgumentAlpha;
+		SourceArgument thirdArgumentAlpha;
+		ArgumentModifier firstModifier;
+		ArgumentModifier secondModifier;
+		ArgumentModifier thirdModifier;
+		ArgumentModifier firstModifierAlpha;
+		ArgumentModifier secondModifierAlpha;
+		ArgumentModifier thirdModifierAlpha;
+		DestinationArgument destinationArgument;
+
+		int texCoordIndex;
+		const Sampler *sampler;   // Not owned by this class as far as this header shows; null until init().
+		const TextureStage *previousStage;   // Null until init(); may remain null.
+	};
+}
+
+#endif  // sw_TextureStage_hpp
diff --git a/src/Device/Triangle.hpp b/src/Device/Triangle.hpp
new file mode 100644
index 0000000..8a91fab
--- /dev/null
+++ b/src/Device/Triangle.hpp
@@ -0,0 +1,30 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Triangle_hpp
+#define sw_Triangle_hpp
+
+#include "Vertex.hpp"
+
+namespace sw
+{
+	struct Triangle   // Three vertices stored by value.
+	{
+		Vertex V0;
+		Vertex V1;
+		Vertex V2;
+	};
+}
+
+#endif   // sw_Triangle_hpp
diff --git a/src/Device/Vector.cpp b/src/Device/Vector.cpp
new file mode 100644
index 0000000..4a02534
--- /dev/null
+++ b/src/Device/Vector.cpp
@@ -0,0 +1,175 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Vector.hpp"
+
+#include "Matrix.hpp"
+#include "Common/Math.hpp"
+
+namespace sw
+{
+	Vector Vector::operator+() const   // Unary plus: returns a copy.
+	{
+		return *this;
+	}
+
+	Vector Vector::operator-() const   // Unary minus: negates every component.
+	{
+		return Vector(-x, -y, -z);
+	}
+
+	Vector &Vector::operator+=(const Vector &v)   // Component-wise addition in place.
+	{
+		x += v.x;
+		y += v.y;
+		z += v.z;
+
+		return *this;   // Allows chaining.
+	}
+
+	Vector &Vector::operator-=(const Vector &v)   // Component-wise subtraction in place.
+	{
+		x -= v.x;
+		y -= v.y;
+		z -= v.z;
+
+		return *this;   // Allows chaining.
+	}
+
+	Vector &Vector::operator*=(float s)   // Scales every component by s in place.
+	{
+		x *= s;
+		y *= s;
+		z *= s;
+
+		return *this;   // Allows chaining.
+	}
+
+	Vector &Vector::operator/=(float s)   // Divides in place by multiplying with the reciprocal; s == 0 is not checked.
+	{
+		float r = 1.0f / s;   // One division, then three multiplications via operator*=.
+
+		return *this *= r;
+	}
+
+	bool operator==(const Vector &U, const Vector &v)   // Exact component-wise equality.
+	{
+		const bool sameX = (U.x == v.x);
+		const bool sameY = (U.y == v.y);
+		const bool sameZ = (U.z == v.z);
+		return sameX && sameY && sameZ;
+	}
+
+	bool operator!=(const Vector &U, const Vector &v)   // True if any component differs.
+	{
+		const bool differsX = (U.x != v.x);
+		const bool differsY = (U.y != v.y);
+		const bool differsZ = (U.z != v.z);
+		return differsX || differsY || differsZ;
+	}
+
+	bool operator>(const Vector &u, const Vector &v)
+	{
+		// Orders vectors by length. The former (u^2) > (v^2) did not square anything:
+		// operator^ is the angle operator and 2 converted implicitly to Vector(2).
+		// Squared norms give the same ordering as norms without needing a square root.
+		return Vector::N2(u) > Vector::N2(v);
+	}
+
+	bool operator<(const Vector &u, const Vector &v)
+	{
+		// Orders vectors by length. The former (u^2) < (v^2) did not square anything:
+		// operator^ is the angle operator and 2 converted implicitly to Vector(2).
+		// Squared norms give the same ordering as norms without needing a square root.
+		return Vector::N2(u) < Vector::N2(v);
+	}
+
+	Vector operator+(const Vector &u, const Vector &v)   // Component-wise sum.
+	{
+		return Vector(u.x + v.x, u.y + v.y, u.z + v.z);
+	}
+
+	Vector operator-(const Vector &u, const Vector &v)   // Component-wise difference.
+	{
+		return Vector(u.x - v.x, u.y - v.y, u.z - v.z);
+	}
+
+	float operator*(const Vector &u, const Vector &v)   // Dot product.
+	{
+		return u.x * v.x + u.y * v.y + u.z * v.z;
+	}
+
+	Vector operator*(float s, const Vector &v)   // Scalar times vector.
+	{
+		return Vector(s * v.x, s * v.y, s * v.z);
+	}
+
+	Vector operator*(const Vector &v, float s)   // Vector times scalar.
+	{
+		return Vector(v.x * s, v.y * s, v.z * s);
+	}
+
+	Vector operator/(const Vector &v, float s)   // Divides by a scalar via its reciprocal; s == 0 is not checked.
+	{
+		float r = 1.0f / s;
+
+		return Vector(v.x * r, v.y * r, v.z * r);
+	}
+
+	float operator^(const Vector &u, const Vector &v)   // Angle between u and v: acos of the dot product divided by both norms.
+	{
+		return acos(u / Vector::N(u) * v / Vector::N(v));   // Evaluated left to right as ((u / |u|) . v) / |v|; zero-length input is not guarded.
+	}
+
+	Vector operator%(const Vector &u, const Vector &v)   // Cross product u x v.
+	{
+		return Vector(u.y * v.z - u.z * v.y, u.z * v.x - u.x * v.z, u.x * v.y - u.y * v.x);
+	}
+
+	Vector operator*(const Matrix &M, const Vector &v)   // Matrix times column vector, using only the upper-left 3x3 of M.
+	{
+		return Vector(M(1, 1) * v.x + M(1, 2) * v.y + M(1, 3) * v.z,   // M is indexed from 1 here.
+		              M(2, 1) * v.x + M(2, 2) * v.y + M(2, 3) * v.z,
+		              M(3, 1) * v.x + M(3, 2) * v.y + M(3, 3) * v.z);
+	}
+
+	Vector operator*(const Vector &v, const Matrix &M)   // Row vector times matrix; unlike M * v, the fourth row M(4, i) is added to each component.
+	{
+		return Vector(v.x * M(1, 1) + v.y * M(2, 1) + v.z * M(3, 1) + M(4, 1),
+		              v.x * M(1, 2) + v.y * M(2, 2) + v.z * M(3, 2) + M(4, 2),
+		              v.x * M(1, 3) + v.y * M(2, 3) + v.z * M(3, 3) + M(4, 3));
+	}
+
+	Vector &operator*=(Vector &v, const Matrix &M)   // In-place form of v * M (row vector times matrix).
+	{
+		return v = v * M;
+	}
+
+	float Vector::N(const Vector &v)   // Euclidean norm (length) of v.
+	{
+		return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
+	}
+
+	float Vector::N2(const Vector &v)   // Squared norm of v; no square root is taken.
+	{
+		return v.x*v.x + v.y*v.y + v.z*v.z;
+	}
+
+	Vector lerp(const Vector &u, const Vector &v, float t)   // Linear interpolation: returns u at t = 0 and v at t = 1.
+	{
+		return Vector(u.x + t * (v.x - u.x),
+		              u.y + t * (v.y - u.y),
+		              u.z + t * (v.z - u.z));   // Fixed: was (v.z - u.x), which mixed the x component into z.
+	}
+}
diff --git a/src/Device/Vector.hpp b/src/Device/Vector.hpp
new file mode 100644
index 0000000..e7f261d
--- /dev/null
+++ b/src/Device/Vector.hpp
@@ -0,0 +1,153 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Vector_hpp
+#define Vector_hpp
+
+namespace sw
+{
+	struct Point;
+	struct Matrix;
+	struct Plane;
+
+	struct Vector   // Three-component float vector with arithmetic operators.
+	{
+		Vector();   // Leaves the components uninitialized.
+		Vector(const int i);   // Sets all three components to i; not explicit, so an int converts implicitly.
+		Vector(const Vector &v);
+		Vector(const Point &p);   // Copies the point's x, y and z.
+		Vector(float v_x, float v_y, float v_z);
+
+		Vector &operator=(const Vector &v);
+
+		union   // The same three floats, reachable as v[0..2] or as x, y, z.
+		{
+			float v[3];
+
+			struct
+			{
+				float x;
+				float y;
+				float z;
+			};
+		};
+
+		float &operator[](int i);   // Both accessors return v[i] without a bounds check.
+		float &operator()(int i);
+
+		const float &operator[](int i) const;
+		const float &operator()(int i) const;
+
+		Vector operator+() const;
+		Vector operator-() const;
+
+		Vector &operator+=(const Vector &v);
+		Vector &operator-=(const Vector &v);
+		Vector &operator*=(float s);
+		Vector &operator/=(float s);
+
+		friend bool operator==(const Vector &u, const Vector &v);
+		friend bool operator!=(const Vector &u, const Vector &v);
+
+		friend Vector operator+(const Vector &u, const Vector &v);
+		friend Vector operator-(const Vector &u, const Vector &v);
+		friend float operator*(const Vector &u, const Vector &v);   // Dot product
+		friend Vector operator*(float s, const Vector &v);
+		friend Vector operator*(const Vector &v, float s);
+		friend Vector operator/(const Vector &v, float s);
+		friend float operator^(const Vector &u, const Vector &v);   // Angle between vectors
+		friend Vector operator%(const Vector &u, const Vector &v);   // Cross product
+
+		friend Vector operator*(const Matrix &M, const Vector& v);
+		friend Vector operator*(const Vector &v, const Matrix &M);
+		friend Vector &operator*=(Vector &v, const Matrix &M);
+
+		static float N(const Vector &v);   // Norm
+		static float N2(const Vector &v);   // Squared norm
+
+		static Vector mirror(const Vector &v, const Plane &p);
+		static Vector reflect(const Vector &v, const Plane &p);
+		static Vector lerp(const Vector &u, const Vector &v, float t);   // NOTE(review): Vector.cpp defines a namespace-scope sw::lerp, not Vector::lerp - confirm where this member is defined.
+	};
+}
+
+#include "Point.hpp"
+
+namespace sw
+{
+	inline Vector::Vector()   // Intentionally empty: x, y and z are left uninitialized.
+	{
+	}
+
+	inline Vector::Vector(const int i)   // Broadcasts one integer, converted to float, to all three components.
+	{
+		const float s = (float)i;
+
+		x = s;
+		y = s;
+		z = s;
+	}
+
+	inline Vector::Vector(const Vector &v)   // Copies the three components.
+	{
+		x = v.x;
+		y = v.y;
+		z = v.z;
+	}
+
+	inline Vector::Vector(const Point &P)   // Takes x, y and z from a Point.
+	{
+		x = P.x;
+		y = P.y;
+		z = P.z;
+	}
+
+	inline Vector::Vector(float v_x, float v_y, float v_z)   // Builds a vector from explicit components.
+	{
+		x = v_x;
+		y = v_y;
+		z = v_z;
+	}
+
+	inline Vector &Vector::operator=(const Vector &v)   // Component-wise copy assignment.
+	{
+		x = v.x;
+		y = v.y;
+		z = v.z;
+
+		return *this;   // Allows chaining.
+	}
+
+	inline float &Vector::operator()(int i)   // Mutable access to v[i]; no bounds check.
+	{
+		return v[i];
+	}
+
+	inline float &Vector::operator[](int i)   // Mutable access to v[i]; no bounds check.
+	{
+		return v[i];
+	}
+
+	inline const float &Vector::operator()(int i) const   // Read-only access to v[i]; no bounds check.
+	{
+		return v[i];
+	}
+
+	inline const float &Vector::operator[](int i) const   // Read-only access to v[i]; no bounds check.
+	{
+		return v[i];
+	}
+}
+
+#endif   // Vector_hpp
diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
new file mode 100644
index 0000000..9ae8d14
--- /dev/null
+++ b/src/Device/Vertex.hpp
@@ -0,0 +1,98 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Vertex_hpp
+#define Vertex_hpp
+
+#include "Color.hpp"
+#include "Common/Types.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+	enum Out   // Indices of vertex outputs; several enumerators deliberately share a value.
+	{
+		// Default vertex output semantics
+		Pos = 0,
+		C0 = 1,   // Diffuse
+		C1 = 2,   // Specular
+		T0 = 3,
+		T1 = 4,
+		T2 = 5,
+		T3 = 6,
+		T4 = 7,
+		T5 = 8,
+		T6 = 9,
+		T7 = 10,
+		Fog = 11,    // x component
+		Pts = Fog,   // y component
+
+		// Variable semantics
+		V0 = 0,   // Same value as Pos.
+		Vn_1 = MAX_VERTEX_OUTPUTS - 1,   // Index of the last generic output.
+
+		Unused,   // One past Vn_1, i.e. MAX_VERTEX_OUTPUTS.
+		VERTEX_OUTPUT_LAST = Unused,
+	};
+
+	struct UVWQ   // Four floats: u, v, w, q.
+	{
+		float u;
+		float v;
+		float w;
+		float q;
+
+		float &operator[](int i)   // Indexes the members as if they were an array starting at u; no bounds check.
+		{
+			return (&u)[i];
+		}
+	};
+
+	ALIGN(16, struct Vertex   // Vertex record; the union lets the same storage be read by fixed name or by generic index.
+	{
+		union
+		{
+			struct   // Fixed semantics
+			{
+				// Position
+				float x;
+				float y;
+				float z;
+				float w;
+
+				float4 C[2];   // Diffuse and specular color
+
+				UVWQ T[8];           // Texture coordinates
+
+				float f;             // Fog
+				float pSize;         // Point size
+			};
+
+			float4 v[MAX_VERTEX_OUTPUTS];   // Generic components using semantic declaration
+		};
+
+		// Projected coordinates
+		int X;   // X and Y are integers, Z and W are floats.
+		int Y;
+		float Z;
+		float W;
+
+		int clipFlags;
+		int padding[3];   // With clipFlags this makes four ints; the static_assert after the struct checks the total size is a multiple of 16.
+	});
+
+	static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
+}
+
+#endif   // Vertex_hpp
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
new file mode 100644
index 0000000..976ea2b
--- /dev/null
+++ b/src/Device/VertexProcessor.cpp
@@ -0,0 +1,1118 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexProcessor.hpp"
+
+#include "Shader/VertexPipeline.hpp"
+#include "Shader/VertexProgram.hpp"
+#include "Shader/VertexShader.hpp"
+#include "Shader/PixelShader.hpp"
+#include "Shader/Constants.hpp"
+#include "Common/Math.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+	bool precacheVertex = false;   // When true, setRoutineCacheSize() passes the name "sw-vertex" to the routine cache instead of 0
+
+	void VertexCache::clear()
+	{
+		// Overwrite every tag with 0x80000000.
+		// NOTE(review): presumably a value no real vertex index uses, marking the entry empty - confirm against the cache lookup.
+		for(unsigned int &entry : tag)
+		{
+			entry = 0x80000000;
+		}
+	}
+
+	unsigned int VertexProcessor::States::computeHash()
+	{
+		// XOR together the raw 32-bit words of this States object.
+		// Any bytes beyond the last whole word (sizeof(States) % 4) are not included.
+		const unsigned int *words = (const unsigned int*)this;
+		const unsigned int wordCount = sizeof(States) / 4;
+
+		unsigned int result = 0;
+
+		for(unsigned int w = 0; w < wordCount; w++)
+		{
+			result ^= words[w];
+		}
+
+		return result;
+	}
+
+	VertexProcessor::State::State()
+	{
+		// Zero the whole object, padding included: computeHash() and operator== read the raw bytes.
+		memset(this, 0, sizeof(*this));
+	}
+
+	bool VertexProcessor::State::operator==(const State &state) const
+	{
+		const States *lhs = static_cast<const States*>(this);
+		const States *rhs = static_cast<const States*>(&state);
+
+		// Differing hashes reject immediately; equal hashes still need the byte-wise comparison of the States part.
+		return (hash == state.hash) && (memcmp(lhs, rhs, sizeof(States)) == 0);
+	}
+
+	VertexProcessor::TransformFeedbackInfo::TransformFeedbackInfo()
+	{
+		// No buffer bound
+		buffer = nullptr;
+		offset = 0;
+
+		// Zeroed register/row/column/stride description
+		reg = row = col = stride = 0;
+	}
+
+	VertexProcessor::UniformBufferInfo::UniformBufferInfo()
+	{
+		// Starts out unbound: zero offset, no buffer.
+		offset = 0;
+		buffer = nullptr;
+	}
+
+	VertexProcessor::VertexProcessor(Context *context) : context(context)
+	{
+		// Per-slot model state: matrix set from 1, combined transform set from 0, slot marked dirty.
+		for(int slot = 0; slot < 12; slot++)
+		{
+			M[slot] = 1;
+			PBVM[slot] = 0;
+			updateModelMatrix[slot] = true;
+		}
+
+		// View and base matrices are set from 1; projection and the cached products from 0.
+		V = 1;
+		B = 1;
+		P = 0;
+		PB = 0;
+		PBV = 0;
+
+		// Lighting defaults: lighting on, specular off, all eight lights disabled.
+		setLightingEnable(true);
+		setSpecularEnable(false);
+
+		for(int light = 0; light < 8; light++)
+		{
+			setLightEnable(light, false);
+			setLightPosition(light, 0);
+		}
+
+		// Mark every cached quantity dirty so the first update recomputes it.
+		updateMatrix = true;
+		updateViewMatrix = true;
+		updateBaseMatrix = true;
+		updateProjectionMatrix = true;
+		updateLighting = true;
+
+		// Must be null before setRoutineCacheSize(), which deletes the previous cache.
+		routineCache = nullptr;
+		setRoutineCacheSize(1024);
+	}
+
+	VertexProcessor::~VertexProcessor()
+	{
+		// Free the routine cache and clear the pointer.
+		delete routineCache;
+		routineCache = nullptr;
+	}
+
+	void VertexProcessor::setInputStream(int index, const Stream &stream)   // Copies 'stream' into input slot 'index'; index is not range-checked
+	{
+		context->input[index] = stream;
+	}
+
+	void VertexProcessor::resetInputStreams(bool preTransformed)
+	{
+		// Put every vertex input stream back to its defaults().
+		for(int stream = 0; stream < MAX_VERTEX_INPUTS; ++stream)
+		{
+			context->input[stream].defaults();
+		}
+
+		// Record whether the incoming vertices are already transformed.
+		context->preTransformed = preTransformed;
+	}
+
+	void VertexProcessor::setFloatConstant(unsigned int index, const float value[4])
+	{
+		// Registers at or beyond VERTEX_UNIFORM_VECTORS are rejected.
+		if(index >= VERTEX_UNIFORM_VECTORS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Copy the four components into float constant register c[index].
+		for(int component = 0; component < 4; component++)
+		{
+			c[index][component] = value[component];
+		}
+	}
+
+	void VertexProcessor::setIntegerConstant(unsigned int index, const int integer[4])
+	{
+		// Only integer constant registers 0..15 are accepted.
+		if(index >= 16)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Copy the four components into integer constant register i[index].
+		for(int component = 0; component < 4; component++)
+		{
+			i[index][component] = integer[component];
+		}
+	}
+
+	void VertexProcessor::setBooleanConstant(unsigned int index, int boolean)
+	{
+		// Only boolean constant registers 0..15 are accepted.
+		if(index >= 16)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Any non-zero value is stored as true.
+		b[index] = (boolean != 0);
+	}
+
+	void VertexProcessor::setUniformBuffer(int index, sw::Resource* buffer, int offset)
+	{
+		// Record the binding; lockUniformBuffers() reads it back. 'index' is not range-checked.
+		auto &binding = uniformBufferInfo[index];
+
+		binding.buffer = buffer;
+		binding.offset = offset;
+	}
+
+	void VertexProcessor::lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[])
+	{
+		// For each binding slot: u[slot] receives the locked buffer address plus the
+		// binding offset (nullptr when unbound), uniformBuffers[slot] the resource itself.
+		for(int slot = 0; slot < MAX_UNIFORM_BUFFER_BINDINGS; ++slot)
+		{
+			sw::Resource *resource = uniformBufferInfo[slot].buffer;
+
+			if(resource)
+			{
+				u[slot] = static_cast<byte*>(resource->lock(PUBLIC, PRIVATE)) + uniformBufferInfo[slot].offset;
+			}
+			else
+			{
+				u[slot] = nullptr;
+			}
+
+			uniformBuffers[slot] = resource;
+		}
+	}
+
+	void VertexProcessor::setTransformFeedbackBuffer(int index, sw::Resource* buffer, int offset, unsigned int reg, unsigned int row, unsigned int col, unsigned int stride)
+	{
+		// Record the binding; lockTransformFeedbackBuffers() reads it back. 'index' is not range-checked.
+		auto &info = transformFeedbackInfo[index];
+
+		info.buffer = buffer;
+		info.offset = offset;
+		info.reg = reg;
+		info.row = row;
+		info.col = col;
+		info.stride = stride;
+	}
+
+	void VertexProcessor::lockTransformFeedbackBuffers(byte** t, unsigned int* v, unsigned int* r, unsigned int* c, unsigned int* s, sw::Resource* transformFeedbackBuffers[])
+	{
+		// For each slot: t receives the locked buffer address plus offset (nullptr when unbound),
+		// and v/r/c/s receive the recorded register, row, column and stride.
+		for(int slot = 0; slot < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; ++slot)
+		{
+			const auto &info = transformFeedbackInfo[slot];
+
+			if(info.buffer)
+			{
+				t[slot] = static_cast<byte*>(info.buffer->lock(PUBLIC, PRIVATE)) + info.offset;
+			}
+			else
+			{
+				t[slot] = nullptr;
+			}
+
+			transformFeedbackBuffers[slot] = info.buffer;
+			v[slot] = info.reg;
+			r[slot] = info.row;
+			c[slot] = info.col;
+			s[slot] = info.stride;
+		}
+	}
+
+	void VertexProcessor::setModelMatrix(const Matrix &M, int i)
+	{
+		// Stores model matrix slot i (0..11) and marks the derived transforms and lighting dirty.
+		// i is signed, so the lower bound is checked as well: a negative index would
+		// otherwise pass the range test and write outside the arrays.
+		if(i >= 0 && i < 12)
+		{
+			this->M[i] = M;
+
+			updateMatrix = true;
+			updateModelMatrix[i] = true;
+			updateLighting = true;
+		}
+		else ASSERT(false);
+	}
+
+	void VertexProcessor::setViewMatrix(const Matrix &V)   // Stores the view matrix and flags it for recombination
+	{
+		this->V = V;   // Parameter shadows the member
+
+		updateMatrix = true;   // Makes updateTransform() do work on its next call
+		updateViewMatrix = true;
+	}
+
+	void VertexProcessor::setBaseMatrix(const Matrix &B)   // Stores the base matrix and flags it for recombination
+	{
+		this->B = B;   // Parameter shadows the member
+
+		updateMatrix = true;   // Makes updateTransform() do work on its next call
+		updateBaseMatrix = true;
+	}
+
+	void VertexProcessor::setProjectionMatrix(const Matrix &P)
+	{
+		// W-based fog is used unless P[3][0..3] is exactly (0, 0, 0, 1).
+		const bool unitW = (P[3][0] == 0.0f) && (P[3][1] == 0.0f) && (P[3][2] == 0.0f) && (P[3][3] == 1.0f);
+
+		this->P = P;
+		context->wBasedFog = !unitW;
+
+		// Flag the projection for recombination in updateTransform().
+		updateMatrix = true;
+		updateProjectionMatrix = true;
+	}
+
+	void VertexProcessor::setLightingEnable(bool lightingEnable)   // Forwards to the context and marks lighting dirty
+	{
+		context->setLightingEnable(lightingEnable);
+
+		updateLighting = true;   // update() recomputes light view positions while this is set
+	}
+
+	void VertexProcessor::setLightEnable(unsigned int light, bool lightEnable)
+	{
+		// Lights 0..7 are forwarded to the context; anything else asserts.
+		if(light >= 8)
+		{
+			ASSERT(false);
+		}
+		else
+		{
+			context->setLightEnable(light, lightEnable);
+		}
+
+		// Lighting is marked dirty in either case.
+		updateLighting = true;
+	}
+
+	void VertexProcessor::setSpecularEnable(bool specularEnable)   // Forwards to the context and marks lighting dirty
+	{
+		context->setSpecularEnable(specularEnable);
+
+		updateLighting = true;   // update() recomputes light view positions while this is set
+	}
+
+	void VertexProcessor::setLightPosition(unsigned int light, const Point &lightPosition)
+	{
+		// Lights 0..7 are forwarded to the context; anything else asserts.
+		if(light >= 8)
+		{
+			ASSERT(false);
+		}
+		else
+		{
+			context->setLightPosition(light, lightPosition);
+		}
+
+		// Lighting is marked dirty in either case.
+		updateLighting = true;
+	}
+
+	void VertexProcessor::setLightDiffuse(unsigned int light, const Color<float> &lightDiffuse)
+	{
+		if(light >= 8)
+		{
+			ASSERT(false);   // Only light indices 0..7 are accepted
+			return;
+		}
+
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.lightDiffuse[light];
+
+		rgba[0] = lightDiffuse.r;
+		rgba[1] = lightDiffuse.g;
+		rgba[2] = lightDiffuse.b;
+		rgba[3] = lightDiffuse.a;
+	}
+
+	void VertexProcessor::setLightSpecular(unsigned int light, const Color<float> &lightSpecular)
+	{
+		if(light >= 8)
+		{
+			ASSERT(false);   // Only light indices 0..7 are accepted
+			return;
+		}
+
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.lightSpecular[light];
+
+		rgba[0] = lightSpecular.r;
+		rgba[1] = lightSpecular.g;
+		rgba[2] = lightSpecular.b;
+		rgba[3] = lightSpecular.a;
+	}
+
+	void VertexProcessor::setLightAmbient(unsigned int light, const Color<float> &lightAmbient)
+	{
+		if(light >= 8)
+		{
+			ASSERT(false);   // Only light indices 0..7 are accepted
+			return;
+		}
+
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.lightAmbient[light];
+
+		rgba[0] = lightAmbient.r;
+		rgba[1] = lightAmbient.g;
+		rgba[2] = lightAmbient.b;
+		rgba[3] = lightAmbient.a;
+	}
+
+	void VertexProcessor::setLightAttenuation(unsigned int light, float constant, float linear, float quadratic)
+	{
+		if(light >= 8)
+		{
+			ASSERT(false);   // Only light indices 0..7 are accepted
+			return;
+		}
+
+		// Each attenuation coefficient is stored through replicate().
+		ff.attenuationConstant[light] = replicate(constant);
+		ff.attenuationLinear[light] = replicate(linear);
+		ff.attenuationQuadratic[light] = replicate(quadratic);
+	}
+
+	void VertexProcessor::setLightRange(unsigned int light, float lightRange)
+	{
+		if(light >= 8)
+		{
+			ASSERT(false);   // Only light indices 0..7 are accepted
+			return;
+		}
+
+		ff.lightRange[light] = lightRange;
+	}
+
+	void VertexProcessor::setFogEnable(bool fogEnable)   // Stores the fog enable flag in the context
+	{
+		context->fogEnable = fogEnable;
+	}
+
+	void VertexProcessor::setVertexFogMode(FogMode fogMode)   // Stores the vertex fog mode in the context
+	{
+		context->vertexFogMode = fogMode;
+	}
+
+	void VertexProcessor::setInstanceID(int instanceID)   // Stores the instance ID in the context
+	{
+		context->instanceID = instanceID;
+	}
+
+	void VertexProcessor::setColorVertexEnable(bool colorVertexEnable)   // Forwards the flag to the context
+	{
+		context->setColorVertexEnable(colorVertexEnable);
+	}
+
+	void VertexProcessor::setDiffuseMaterialSource(MaterialSource diffuseMaterialSource)   // Forwards the diffuse material source to the context
+	{
+		context->setDiffuseMaterialSource(diffuseMaterialSource);
+	}
+
+	void VertexProcessor::setSpecularMaterialSource(MaterialSource specularMaterialSource)   // Forwards the specular material source to the context
+	{
+		context->setSpecularMaterialSource(specularMaterialSource);
+	}
+
+	void VertexProcessor::setAmbientMaterialSource(MaterialSource ambientMaterialSource)   // Forwards the ambient material source to the context
+	{
+		context->setAmbientMaterialSource(ambientMaterialSource);
+	}
+
+	void VertexProcessor::setEmissiveMaterialSource(MaterialSource emissiveMaterialSource)   // Forwards the emissive material source to the context
+	{
+		context->setEmissiveMaterialSource(emissiveMaterialSource);
+	}
+
+	void VertexProcessor::setGlobalAmbient(const Color<float> &globalAmbient)
+	{
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.globalAmbient;
+
+		rgba[0] = globalAmbient.r;
+		rgba[1] = globalAmbient.g;
+		rgba[2] = globalAmbient.b;
+		rgba[3] = globalAmbient.a;
+	}
+
+	void VertexProcessor::setMaterialEmission(const Color<float> &emission)
+	{
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.materialEmission;
+
+		rgba[0] = emission.r;
+		rgba[1] = emission.g;
+		rgba[2] = emission.b;
+		rgba[3] = emission.a;
+	}
+
+	void VertexProcessor::setMaterialAmbient(const Color<float> &materialAmbient)
+	{
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.materialAmbient;
+
+		rgba[0] = materialAmbient.r;
+		rgba[1] = materialAmbient.g;
+		rgba[2] = materialAmbient.b;
+		rgba[3] = materialAmbient.a;
+	}
+
+	void VertexProcessor::setMaterialDiffuse(const Color<float> &diffuseColor)
+	{
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.materialDiffuse;
+
+		rgba[0] = diffuseColor.r;
+		rgba[1] = diffuseColor.g;
+		rgba[2] = diffuseColor.b;
+		rgba[3] = diffuseColor.a;
+	}
+
+	void VertexProcessor::setMaterialSpecular(const Color<float> &specularColor)
+	{
+		// Store the color as (r, g, b, a) in the fixed-function constants.
+		auto &rgba = ff.materialSpecular;
+
+		rgba[0] = specularColor.r;
+		rgba[1] = specularColor.g;
+		rgba[2] = specularColor.b;
+		rgba[3] = specularColor.a;
+	}
+
+	void VertexProcessor::setMaterialShininess(float specularPower)   // Stores the specular power in the fixed-function constants
+	{
+		ff.materialShininess = specularPower;
+	}
+
+	void VertexProcessor::setLightViewPosition(unsigned int light, const Point &P)
+	{
+		if(light >= 8)
+		{
+			ASSERT(false);   // Only light indices 0..7 are accepted
+			return;
+		}
+
+		// Stored as a four-component position with the last component fixed at 1.
+		auto &position = ff.lightPosition[light];
+
+		position[0] = P.x;
+		position[1] = P.y;
+		position[2] = P.z;
+		position[3] = 1;
+	}
+
+	void VertexProcessor::setRangeFogEnable(bool enable)   // Stores the range fog flag in the context
+	{
+		context->rangeFogEnable = enable;
+	}
+
+	void VertexProcessor::setIndexedVertexBlendEnable(bool indexedVertexBlendEnable)   // Stores the flag; updateTransform() uses all 12 model matrices when it is set
+	{
+		context->indexedVertexBlendEnable = indexedVertexBlendEnable;
+	}
+
+	void VertexProcessor::setVertexBlendMatrixCount(unsigned int vertexBlendMatrixCount)
+	{
+		// Counts of 0..4 are accepted; anything larger asserts and is ignored.
+		if(vertexBlendMatrixCount > 4)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->vertexBlendMatrixCount = vertexBlendMatrixCount;
+	}
+
+	void VertexProcessor::setTextureWrap(unsigned int stage, int mask)
+	{
+		if(stage >= TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+		}
+		else
+		{
+			context->textureWrap[stage] = mask;
+		}
+
+		// Recompute the summary flag: set when any stage has a non-zero wrap mask.
+		bool anyStageWraps = false;
+
+		for(int unit = 0; unit < TEXTURE_IMAGE_UNITS; unit++)
+		{
+			anyStageWraps |= (context->textureWrap[unit] != 0x00);
+		}
+
+		context->textureWrapActive = anyStageWraps;
+	}
+
+	void VertexProcessor::setTexGen(unsigned int stage, TexGen texGen)
+	{
+		// Only stages 0..7 are accepted.
+		if(stage >= 8)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		context->texGen[stage] = texGen;
+	}
+
+	void VertexProcessor::setLocalViewer(bool localViewer)   // Stores the local viewer flag in the context
+	{
+		context->localViewer = localViewer;
+	}
+
+	void VertexProcessor::setNormalizeNormals(bool normalizeNormals)   // Stores the normalize-normals flag in the context
+	{
+		context->normalizeNormals = normalizeNormals;
+	}
+
+	void VertexProcessor::setTextureMatrix(int stage, const Matrix &T)
+	{
+		// Element-wise copy of T (not transposed) into the stage's texture transform.
+		// 'stage' is not range-checked here.
+		for(int row = 0; row < 4; ++row)
+		{
+			for(int col = 0; col < 4; ++col)
+			{
+				ff.textureTransform[stage][row][col] = T[row][col];
+			}
+		}
+	}
+
+	void VertexProcessor::setTextureTransform(int stage, int count, bool project)   // Stores per-stage transform settings; 'stage' is not range-checked
+	{
+		context->textureTransformCount[stage] = count;
+		context->textureTransformProject[stage] = project;
+	}
+
+	void VertexProcessor::setTextureFilter(unsigned int sampler, FilterType textureFilter)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setTextureFilter(textureFilter);
+	}
+
+	void VertexProcessor::setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setMipmapFilter(mipmapFilter);
+	}
+
+	void VertexProcessor::setGatherEnable(unsigned int sampler, bool enable)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setGatherEnable(enable);
+	}
+
+	void VertexProcessor::setAddressingModeU(unsigned int sampler, AddressingMode addressMode)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setAddressingModeU(addressMode);
+	}
+
+	void VertexProcessor::setAddressingModeV(unsigned int sampler, AddressingMode addressMode)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setAddressingModeV(addressMode);
+	}
+
+	void VertexProcessor::setAddressingModeW(unsigned int sampler, AddressingMode addressMode)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setAddressingModeW(addressMode);
+	}
+
+	void VertexProcessor::setReadSRGB(unsigned int sampler, bool sRGB)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setReadSRGB(sRGB);
+	}
+
+	void VertexProcessor::setMipmapLOD(unsigned int sampler, float bias)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setMipmapLOD(bias);
+	}
+
+	void VertexProcessor::setBorderColor(unsigned int sampler, const Color<float> &borderColor)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setBorderColor(borderColor);
+	}
+
+	void VertexProcessor::setMaxAnisotropy(unsigned int sampler, float maxAnisotropy)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setMaxAnisotropy(maxAnisotropy);
+	}
+
+	void VertexProcessor::setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering)
+	{
+		// Sets high-precision filtering on vertex sampler 'sampler'.
+		// Uses the same bound and slot (TEXTURE_IMAGE_UNITS + sampler) as every other
+		// vertex sampler setter in this class; indexing context->sampler[sampler]
+		// directly would modify a different sampler slot.
+		if(sampler < VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			context->sampler[TEXTURE_IMAGE_UNITS + sampler].setHighPrecisionFiltering(highPrecisionFiltering);
+		}
+		else ASSERT(false);
+	}
+
+	void VertexProcessor::setSwizzleR(unsigned int sampler, SwizzleType swizzleR)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setSwizzleR(swizzleR);
+	}
+
+	void VertexProcessor::setSwizzleG(unsigned int sampler, SwizzleType swizzleG)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setSwizzleG(swizzleG);
+	}
+
+	void VertexProcessor::setSwizzleB(unsigned int sampler, SwizzleType swizzleB)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setSwizzleB(swizzleB);
+	}
+
+	void VertexProcessor::setSwizzleA(unsigned int sampler, SwizzleType swizzleA)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setSwizzleA(swizzleA);
+	}
+
+	void VertexProcessor::setCompareFunc(unsigned int sampler, CompareFunc compFunc)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setCompareFunc(compFunc);
+	}
+
+	void VertexProcessor::setBaseLevel(unsigned int sampler, int baseLevel)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setBaseLevel(baseLevel);
+	}
+
+	void VertexProcessor::setMaxLevel(unsigned int sampler, int maxLevel)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setMaxLevel(maxLevel);
+	}
+
+	void VertexProcessor::setMinLod(unsigned int sampler, float minLod)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setMinLod(minLod);
+	}
+
+	void VertexProcessor::setMaxLod(unsigned int sampler, float maxLod)
+	{
+		if(sampler >= VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			ASSERT(false);
+			return;
+		}
+
+		// Vertex sampler n lives at context->sampler[TEXTURE_IMAGE_UNITS + n].
+		context->sampler[TEXTURE_IMAGE_UNITS + sampler].setMaxLod(maxLod);
+	}
+
+	void VertexProcessor::setSyncRequired(unsigned int sampler, bool isSincRequired)
+	{
+		// Sets the sync-required flag on vertex sampler 'sampler'.
+		// Uses the same bound and slot (TEXTURE_IMAGE_UNITS + sampler) as every other
+		// vertex sampler setter in this class; indexing context->sampler[sampler]
+		// directly would modify a different sampler slot.
+		if(sampler < VERTEX_TEXTURE_IMAGE_UNITS)
+		{
+			context->sampler[TEXTURE_IMAGE_UNITS + sampler].setSyncRequired(isSincRequired);
+		}
+		else ASSERT(false);
+	}
+
+	void VertexProcessor::setPointSize(float pointSize)   // Stores the point size via replicate(), unlike the plain-float min/max/scale setters
+	{
+		point.pointSize = replicate(pointSize);
+	}
+
+	void VertexProcessor::setPointSizeMin(float pointSizeMin)   // Stores the minimum point size
+	{
+		point.pointSizeMin = pointSizeMin;
+	}
+
+	void VertexProcessor::setPointSizeMax(float pointSizeMax)   // Stores the maximum point size
+	{
+		point.pointSizeMax = pointSizeMax;
+	}
+
+	void VertexProcessor::setPointScaleA(float pointScaleA)   // Stores point scale coefficient A
+	{
+		point.pointScaleA = pointScaleA;
+	}
+
+	void VertexProcessor::setPointScaleB(float pointScaleB)   // Stores point scale coefficient B
+	{
+		point.pointScaleB = pointScaleB;
+	}
+
+	void VertexProcessor::setPointScaleC(float pointScaleC)   // Stores point scale coefficient C
+	{
+		point.pointScaleC = pointScaleC;
+	}
+
+	void VertexProcessor::setTransformFeedbackQueryEnabled(bool enable)   // Stores the flag in the context; update() copies it into the routine state
+	{
+		context->transformFeedbackQueryEnabled = enable;
+	}
+
+	void VertexProcessor::enableTransformFeedback(uint64_t enable)   // Stores the 64-bit value in the context; update() copies it into the routine state
+	{
+		context->transformFeedbackEnabled = enable;
+	}
+
+	const Matrix &VertexProcessor::getModelTransform(int i)   // Returns PBVM[i] after refreshing it; i is not range-checked
+	{
+		updateTransform();   // Recombines any dirty matrices first
+		return PBVM[i];
+	}
+
+	const Matrix &VertexProcessor::getViewTransform()   // Returns the combined PBV matrix after refreshing it
+	{
+		updateTransform();   // Recombines any dirty matrices first
+		return PBV;
+	}
+
+	bool VertexProcessor::isFixedFunction()   // True when no vertex shader is set on the context
+	{
+		return !context->vertexShader;
+	}
+
+	void VertexProcessor::setTransform(const Matrix &M, int i)
+	{
+		// Store the transpose of M in slot i: element [row][col] receives M[col][row].
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				ff.transformT[i][row][col] = M[col][row];
+			}
+		}
+	}
+
+	void VertexProcessor::setCameraTransform(const Matrix &M, int i)
+	{
+		// Store the transpose of M in slot i: element [row][col] receives M[col][row].
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				ff.cameraTransformT[i][row][col] = M[col][row];
+			}
+		}
+	}
+
+	void VertexProcessor::setNormalTransform(const Matrix &M, int i)
+	{
+		// Store the transpose of M in slot i: element [row][col] receives M[col][row].
+		for(int row = 0; row < 4; row++)
+		{
+			for(int col = 0; col < 4; col++)
+			{
+				ff.normalTransformT[i][row][col] = M[col][row];
+			}
+		}
+	}
+
+	void VertexProcessor::updateTransform()
+	{
+		if(!updateMatrix)
+		{
+			return;   // No matrix changed since the last update
+		}
+
+		// Indexed blending uses all 12 model matrices; otherwise at least one is active.
+		const int activeMatrices = context->indexedVertexBlendEnable ? 12 : max(context->vertexBlendMatrixCount, 1);
+
+		// A dirty projection or base matrix invalidates PB; a dirty PB or view matrix invalidates PBV.
+		const bool rebuildPB = updateProjectionMatrix || updateBaseMatrix;
+		const bool rebuildPBV = rebuildPB || updateViewMatrix;
+
+		if(rebuildPB)
+		{
+			PB = P * B;
+		}
+
+		if(rebuildPBV)
+		{
+			PBV = PB * V;
+		}
+
+		updateProjectionMatrix = false;
+		updateBaseMatrix = false;
+		updateViewMatrix = false;
+
+		for(int m = 0; m < activeMatrices; m++)
+		{
+			// A model slot is recombined when PBV changed or when the slot itself is dirty.
+			if(rebuildPBV || updateModelMatrix[m])
+			{
+				PBVM[m] = PBV * M[m];
+				updateModelMatrix[m] = false;
+			}
+
+			Matrix modelView = B * V * M[m];
+
+			// Upload the full transform, the B * V * M product, and ~! of that product for normals.
+			setTransform(PBVM[m], m);
+			setCameraTransform(modelView, m);
+			setNormalTransform(~!modelView, m);
+		}
+
+		updateMatrix = false;
+	}
+
+	void VertexProcessor::setRoutineCacheSize(int cacheSize)
+	{
+		// Replaces the routine cache with a new one, so previously cached routines are dropped.
+		delete routineCache;
+
+		// The requested size is clamped to 1..65536; the cache is named only when precacheVertex is set.
+		const char *cacheName = precacheVertex ? "sw-vertex" : 0;
+		routineCache = new RoutineCache<State>(clamp(cacheSize, 1, 65536), cacheName);
+	}
+
+	const VertexProcessor::State VertexProcessor::update(DrawType drawType)   // Refreshes fixed-function constants and captures the state that selects a vertex routine
+	{
+		if(isFixedFunction())   // Matrices and light positions are only refreshed when no vertex shader is set
+		{
+			updateTransform();
+
+			if(updateLighting)
+			{
+				for(int i = 0; i < 8; i++)
+				{
+					if(context->vertexLightActive(i))
+					{
+						// Light position in camera coordinates
+						setLightViewPosition(i, B * V * context->getLightPosition(i));
+					}
+				}
+
+				updateLighting = false;
+			}
+		}
+
+		State state;   // Zero-filled by State's constructor
+
+		if(context->vertexShader)
+		{
+			state.shaderID = context->vertexShader->getSerialID();
+		}
+		else
+		{
+			state.shaderID = 0;
+		}
+
+		state.fixedFunction = !context->vertexShader && context->pixelShaderModel() < 0x0300;
+		state.textureSampling = context->vertexShader ? context->vertexShader->containsTextureSampling() : false;
+		state.positionRegister = context->vertexShader ? context->vertexShader->getPositionRegister() : Pos;
+		state.pointSizeRegister = context->vertexShader ? context->vertexShader->getPointSizeRegister() : Pts;
+
+		state.vertexBlendMatrixCount = context->vertexBlendMatrixCountActive();
+		state.indexedVertexBlendEnable = context->indexedVertexBlendActive();
+		state.vertexNormalActive = context->vertexNormalActive();
+		state.normalizeNormals = context->normalizeNormalsActive();
+		state.vertexLightingActive = context->vertexLightingActive();
+		state.diffuseActive = context->diffuseActive();
+		state.specularActive = context->specularActive();
+		state.vertexSpecularActive = context->vertexSpecularActive();
+
+		state.vertexLightActive = context->vertexLightActive(0) << 0 |   // One bit per light, bit n for light n
+		                          context->vertexLightActive(1) << 1 |
+		                          context->vertexLightActive(2) << 2 |
+		                          context->vertexLightActive(3) << 3 |
+		                          context->vertexLightActive(4) << 4 |
+		                          context->vertexLightActive(5) << 5 |
+		                          context->vertexLightActive(6) << 6 |
+		                          context->vertexLightActive(7) << 7;
+
+		state.vertexDiffuseMaterialSourceActive = context->vertexDiffuseMaterialSourceActive();
+		state.vertexSpecularMaterialSourceActive = context->vertexSpecularMaterialSourceActive();
+		state.vertexAmbientMaterialSourceActive = context->vertexAmbientMaterialSourceActive();
+		state.vertexEmissiveMaterialSourceActive = context->vertexEmissiveMaterialSourceActive();
+		state.fogActive = context->fogActive();
+		state.vertexFogMode = context->vertexFogModeActive();
+		state.rangeFogActive = context->rangeFogActive();
+		state.localViewerActive = context->localViewerActive();
+		state.pointSizeActive = context->pointSizeActive();
+		state.pointScaleActive = context->pointScaleActive();
+
+		state.preTransformed = context->preTransformed;
+		state.superSampling = context->getSuperSampleCount() > 1;
+		state.multiSampling = context->getMultiSampleCount() > 1;
+
+		state.transformFeedbackQueryEnabled = context->transformFeedbackQueryEnabled;
+		state.transformFeedbackEnabled = context->transformFeedbackEnabled;
+
+		// Note: Quads aren't handled for verticesPerPrimitive, but verticesPerPrimitive is used for transform feedback,
+		//       which is an OpenGL ES 3.0 feature, and OpenGL ES 3.0 doesn't support quads as a primitive type.
+		DrawType type = static_cast<DrawType>(static_cast<unsigned int>(drawType) & 0xF);   // Keep only the low four bits of the draw type
+		state.verticesPerPrimitive = 1 + (type >= DRAW_LINELIST) + (type >= DRAW_TRIANGLELIST);   // 1, 2 or 3 depending on how 'type' compares with the two thresholds
+
+		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
+		{
+			state.input[i].type = context->input[i].type;
+			state.input[i].count = context->input[i].count;
+			state.input[i].normalized = context->input[i].normalized;
+			state.input[i].attribType = context->vertexShader ? context->vertexShader->getAttribType(i) : VertexShader::ATTRIBTYPE_FLOAT;
+		}
+
+		if(!context->vertexShader)   // Fixed-function texture stage state
+		{
+			for(int i = 0; i < 8; i++)
+			{
+			//	state.textureState[i].vertexTextureActive = context->vertexTextureActive(i, 0);
+				state.textureState[i].texGenActive = context->texGenActive(i);
+				state.textureState[i].textureTransformCountActive = context->textureTransformCountActive(i);
+				state.textureState[i].texCoordIndexActive = context->texCoordIndexActive(i);
+			}
+		}
+		else
+		{
+			for(unsigned int i = 0; i < VERTEX_TEXTURE_IMAGE_UNITS; i++)
+			{
+				if(context->vertexShader->usesSampler(i))   // Samplers the shader does not use keep their zeroed state
+				{
+					state.sampler[i] = context->sampler[TEXTURE_IMAGE_UNITS + i].samplerState();
+				}
+			}
+		}
+
+		if(context->vertexShader)   // FIXME: Also when pre-transformed?
+		{
+			for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+			{
+				state.output[i].xWrite = context->vertexShader->getOutput(i, 0).active();
+				state.output[i].yWrite = context->vertexShader->getOutput(i, 1).active();
+				state.output[i].zWrite = context->vertexShader->getOutput(i, 2).active();
+				state.output[i].wWrite = context->vertexShader->getOutput(i, 3).active();
+			}
+		}
+		else if(!context->preTransformed || context->pixelShaderModel() < 0x0300)   // No vertex shader: output masks come from the fixed-function state
+		{
+			state.output[Pos].write = 0xF;
+
+			if(context->diffuseActive() && (context->lightingEnable || context->input[Color0]))
+			{
+				state.output[C0].write = 0xF;
+			}
+
+			if(context->specularActive())
+			{
+				state.output[C1].write = 0xF;
+			}
+
+			for(int stage = 0; stage < 8; stage++)
+			{
+				if(context->texCoordActive(stage, 0)) state.output[T0 + stage].write |= 0x01;
+				if(context->texCoordActive(stage, 1)) state.output[T0 + stage].write |= 0x02;
+				if(context->texCoordActive(stage, 2)) state.output[T0 + stage].write |= 0x04;
+				if(context->texCoordActive(stage, 3)) state.output[T0 + stage].write |= 0x08;
+			}
+
+			if(context->fogActive())
+			{
+				state.output[Fog].xWrite = true;
+			}
+
+			if(context->pointSizeActive())
+			{
+				state.output[Pts].yWrite = true;
+			}
+		}
+		else   // Pre-transformed input with pixel shader model >= 0x0300: outputs mirror the inputs that are present
+		{
+			state.output[Pos].write = 0xF;
+
+			for(int i = 0; i < 2; i++)
+			{
+				if(context->input[Color0 + i])
+				{
+					state.output[C0 + i].write = 0xF;
+				}
+			}
+
+			for(int i = 0; i < 8; i++)
+			{
+				if(context->input[TexCoord0 + i])
+				{
+					state.output[T0 + i].write = 0xF;
+				}
+			}
+
+			if(context->input[PointSize])
+			{
+				state.output[Pts].yWrite = true;
+			}
+		}
+
+		if(context->vertexShaderModel() < 0x0300)   // Below shader model 0x0300 the color and fog outputs are flagged for clamping
+		{
+			state.output[C0].clamp = 0xF;
+			state.output[C1].clamp = 0xF;
+			state.output[Fog].xClamp = true;
+		}
+
+		state.hash = state.computeHash();   // Computed last so it covers every field set above
+
+		return state;
+	}
+
+	Routine *VertexProcessor::routine(const State &state)
+	{
+		// A routine already cached for this state is returned as is.
+		if(Routine *cached = routineCache->query(state))
+		{
+			return cached;
+		}
+
+		// Otherwise generate one: fixed-function pipeline or shader program, as the state dictates.
+		VertexRoutine *generator = nullptr;
+
+		if(state.fixedFunction)
+		{
+			generator = new VertexPipeline(state);
+		}
+		else
+		{
+			generator = new VertexProgram(state, context->vertexShader);
+		}
+
+		generator->generate();
+		Routine *compiled = (*generator)(L"VertexRoutine_%0.8X", state.shaderID);
+		delete generator;
+
+		// Remember the new routine for later queries with the same state.
+		routineCache->add(state, compiled);
+
+		return compiled;
+	}
+}
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
new file mode 100644
index 0000000..277a155
--- /dev/null
+++ b/src/Device/VertexProcessor.hpp
@@ -0,0 +1,352 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexProcessor_hpp
+#define sw_VertexProcessor_hpp
+
+#include "Matrix.hpp"
+#include "Context.hpp"
+#include "RoutineCache.hpp"
+#include "Shader/VertexShader.hpp"
+
+namespace sw
+{
+	struct DrawData;
+
+	struct VertexCache   // Fixed-capacity store of Vertex data with one tag per row. FIXME: Variable size
+	{
+		void clear();   // Defined elsewhere; presumably invalidates the tags - TODO confirm against the .cpp
+
+		Vertex vertex[16][4];   // 16 rows of 4 Vertex objects each
+		unsigned int tag[16];   // One tag per row of 'vertex' (presumably the index of the cached vertex - TODO confirm)
+
+		int drawCall;   // NOTE(review): looks like the draw call the cached contents belong to - confirm
+	};
+
+	struct VertexTask   // Passed by pointer to vertex routines (see VertexProcessor::RoutinePointer).
+	{
+		unsigned int vertexCount;   // Presumably the number of vertices covered by this task - TODO confirm
+		unsigned int primitiveStart;   // Presumably the first primitive covered by this task - TODO confirm
+		VertexCache vertexCache;   // Held by value, so every task has its own cache
+	};
+
+	class VertexProcessor   // Holds vertex-stage state and constants, and creates/caches vertex routines keyed by State.
+	{
+	public:
+		struct States   // Description of the vertex stage configuration, mostly packed into bit-fields.
+		{
+			unsigned int computeHash();   // update() stores the result in State::hash
+
+			uint64_t shaderID;   // Also formatted into the routine name by VertexProcessor::routine()
+
+			bool fixedFunction             : 1;   // TODO: Eliminate by querying shader.
+			bool textureSampling           : 1;   // TODO: Eliminate by querying shader.
+			unsigned int positionRegister  : BITS(MAX_VERTEX_OUTPUTS);   // TODO: Eliminate by querying shader.
+			unsigned int pointSizeRegister : BITS(MAX_VERTEX_OUTPUTS);   // TODO: Eliminate by querying shader.
+
+			unsigned int vertexBlendMatrixCount               : 3;
+			bool indexedVertexBlendEnable                     : 1;
+			bool vertexNormalActive                           : 1;
+			bool normalizeNormals                             : 1;
+			bool vertexLightingActive                         : 1;
+			bool diffuseActive                                : 1;
+			bool specularActive                               : 1;
+			bool vertexSpecularActive                         : 1;
+			unsigned int vertexLightActive                    : 8;   // 8 bits; presumably one per light (light arrays below have 8 entries) - confirm
+			MaterialSource vertexDiffuseMaterialSourceActive  : BITS(MATERIAL_LAST);
+			MaterialSource vertexSpecularMaterialSourceActive : BITS(MATERIAL_LAST);
+			MaterialSource vertexAmbientMaterialSourceActive  : BITS(MATERIAL_LAST);
+			MaterialSource vertexEmissiveMaterialSourceActive : BITS(MATERIAL_LAST);
+			bool fogActive                                    : 1;
+			FogMode vertexFogMode                             : BITS(FOG_LAST);
+			bool rangeFogActive                               : 1;
+			bool localViewerActive                            : 1;
+			bool pointSizeActive                              : 1;
+			bool pointScaleActive                             : 1;
+			bool transformFeedbackQueryEnabled                : 1;
+			uint64_t transformFeedbackEnabled                 : 64;   // Full 64-bit mask (see enableTransformFeedback(uint64_t))
+			unsigned char verticesPerPrimitive                : 2; // 1 (points), 2 (lines) or 3 (triangles)
+
+			bool preTransformed : 1;
+			bool superSampling  : 1;
+			bool multiSampling  : 1;
+
+			struct TextureState   // Texture coordinate generation/transform state for one stage
+			{
+				TexGen texGenActive                       : BITS(TEXGEN_LAST);
+				unsigned char textureTransformCountActive : 3;
+				unsigned char texCoordIndexActive         : 3;
+			};
+
+			TextureState textureState[8];   // One entry per texture stage
+
+			Sampler::State sampler[VERTEX_TEXTURE_IMAGE_UNITS];
+
+			struct Input   // Format of one vertex input stream
+			{
+				operator bool() const   // Returns true if stream contains data
+				{
+					return count != 0;
+				}
+
+				StreamType type    : BITS(STREAMTYPE_LAST);
+				unsigned int count : 3;   // 0 means the stream is unused (see operator bool)
+				bool normalized    : 1;
+				unsigned int attribType : BITS(VertexShader::ATTRIBTYPE_LAST);
+			};
+
+			struct Output   // Write and clamp masks of one vertex output register
+			{
+				union
+				{
+					unsigned char write : 4;   // 4-bit mask sharing storage with the per-component flags below
+
+					struct
+					{
+						unsigned char xWrite : 1;
+						unsigned char yWrite : 1;
+						unsigned char zWrite : 1;
+						unsigned char wWrite : 1;
+					};
+				};
+
+				union
+				{
+					unsigned char clamp : 4;   // 4-bit mask sharing storage with the per-component flags below
+
+					struct
+					{
+						unsigned char xClamp : 1;
+						unsigned char yClamp : 1;
+						unsigned char zClamp : 1;
+						unsigned char wClamp : 1;
+					};
+				};
+			};
+
+			Input input[MAX_VERTEX_INPUTS];
+			Output output[MAX_VERTEX_OUTPUTS];
+		};
+
+		struct State : States   // States plus its hash value; used as the routine cache key
+		{
+			State();
+
+			bool operator==(const State &state) const;
+
+			unsigned int hash;   // Assigned from computeHash() at the end of update()
+		};
+
+		struct FixedFunction   // Constant data for the fixed-function path (instance: 'ff')
+		{
+			float4 transformT[12][4];   // 12 entries, same count as M[12] / PBVM[12] below
+			float4 cameraTransformT[12][4];
+			float4 normalTransformT[12][4];
+			float4 textureTransform[8][4];
+
+			float4 lightPosition[8];   // Light arrays hold 8 entries each
+			float4 lightAmbient[8];
+			float4 lightSpecular[8];
+			float4 lightDiffuse[8];
+			float4 attenuationConstant[8];
+			float4 attenuationLinear[8];
+			float4 attenuationQuadratic[8];
+			float lightRange[8];
+			float4 materialDiffuse;
+			float4 materialSpecular;
+			float materialShininess;
+			float4 globalAmbient;
+			float4 materialEmission;
+			float4 materialAmbient;
+		};
+
+		struct PointSprite   // Point size parameters (instance: 'point')
+		{
+			float4 pointSize;
+			float pointSizeMin;
+			float pointSizeMax;
+			float pointScaleA;
+			float pointScaleB;
+			float pointScaleC;
+		};
+
+		typedef void (*RoutinePointer)(Vertex *output, unsigned int *batch, VertexTask *vertexTask, DrawData *draw);   // Presumably the entry point signature of generated routines - confirm
+
+		VertexProcessor(Context *context);
+
+		virtual ~VertexProcessor();
+
+		void setInputStream(int index, const Stream &stream);
+		void resetInputStreams(bool preTransformed);
+
+		void setFloatConstant(unsigned int index, const float value[4]);
+		void setIntegerConstant(unsigned int index, const int integer[4]);
+		void setBooleanConstant(unsigned int index, int boolean);
+
+		void setUniformBuffer(int index, sw::Resource* uniformBuffer, int offset);
+		void lockUniformBuffers(byte** u, sw::Resource* uniformBuffers[]);
+
+		void setTransformFeedbackBuffer(int index, sw::Resource* transformFeedbackBuffer, int offset, unsigned int reg, unsigned int row, unsigned int col, unsigned int stride);
+		void lockTransformFeedbackBuffers(byte** t, unsigned int* v, unsigned int* r, unsigned int* c, unsigned int* s, sw::Resource* transformFeedbackBuffers[]);
+
+		// Transformations
+		void setModelMatrix(const Matrix &M, int i = 0);
+		void setViewMatrix(const Matrix &V);
+		void setBaseMatrix(const Matrix &B);
+		void setProjectionMatrix(const Matrix &P);
+
+		// Lighting
+		void setLightingEnable(bool lightingEnable);
+		void setLightEnable(unsigned int light, bool lightEnable);
+		void setSpecularEnable(bool specularEnable);
+
+		void setGlobalAmbient(const Color<float> &globalAmbient);
+		void setLightPosition(unsigned int light, const Point &lightPosition);
+		void setLightViewPosition(unsigned int light, const Point &lightPosition);
+		void setLightDiffuse(unsigned int light, const Color<float> &lightDiffuse);
+		void setLightSpecular(unsigned int light, const Color<float> &lightSpecular);
+		void setLightAmbient(unsigned int light, const Color<float> &lightAmbient);
+		void setLightAttenuation(unsigned int light, float constant, float linear, float quadratic);
+		void setLightRange(unsigned int light, float lightRange);
+
+		void setInstanceID(int instanceID);
+
+		void setFogEnable(bool fogEnable);
+		void setVertexFogMode(FogMode fogMode);
+		void setRangeFogEnable(bool enable);
+
+		void setColorVertexEnable(bool colorVertexEnable);
+		void setDiffuseMaterialSource(MaterialSource diffuseMaterialSource);
+		void setSpecularMaterialSource(MaterialSource specularMaterialSource);
+		void setAmbientMaterialSource(MaterialSource ambientMaterialSource);
+		void setEmissiveMaterialSource(MaterialSource emissiveMaterialSource);
+
+		void setMaterialEmission(const Color<float> &emission);
+		void setMaterialAmbient(const Color<float> &materialAmbient);
+		void setMaterialDiffuse(const Color<float> &diffuseColor);
+		void setMaterialSpecular(const Color<float> &specularColor);
+		void setMaterialShininess(float specularPower);
+
+		void setIndexedVertexBlendEnable(bool indexedVertexBlendEnable);
+		void setVertexBlendMatrixCount(unsigned int vertexBlendMatrixCount);
+
+		void setTextureWrap(unsigned int stage, int mask);
+		void setTexGen(unsigned int stage, TexGen texGen);
+		void setLocalViewer(bool localViewer);
+		void setNormalizeNormals(bool normalizeNormals);
+		void setTextureMatrix(int stage, const Matrix &T);
+		void setTextureTransform(int stage, int count, bool project);
+
+		void setTextureFilter(unsigned int sampler, FilterType textureFilter);   // Setters from here to setSyncRequired() take a sampler (or stage) index first
+		void setMipmapFilter(unsigned int sampler, MipmapType mipmapFilter);
+		void setGatherEnable(unsigned int sampler, bool enable);
+		void setAddressingModeU(unsigned int sampler, AddressingMode addressingMode);
+		void setAddressingModeV(unsigned int sampler, AddressingMode addressingMode);
+		void setAddressingModeW(unsigned int sampler, AddressingMode addressingMode);
+		void setReadSRGB(unsigned int sampler, bool sRGB);
+		void setMipmapLOD(unsigned int sampler, float bias);
+		void setBorderColor(unsigned int sampler, const Color<float> &borderColor);
+		void setMaxAnisotropy(unsigned int stage, float maxAnisotropy);
+		void setHighPrecisionFiltering(unsigned int sampler, bool highPrecisionFiltering);
+		void setSwizzleR(unsigned int sampler, SwizzleType swizzleR);
+		void setSwizzleG(unsigned int sampler, SwizzleType swizzleG);
+		void setSwizzleB(unsigned int sampler, SwizzleType swizzleB);
+		void setSwizzleA(unsigned int sampler, SwizzleType swizzleA);
+		void setCompareFunc(unsigned int sampler, CompareFunc compare);
+		void setBaseLevel(unsigned int sampler, int baseLevel);
+		void setMaxLevel(unsigned int sampler, int maxLevel);
+		void setMinLod(unsigned int sampler, float minLod);
+		void setMaxLod(unsigned int sampler, float maxLod);
+		void setSyncRequired(unsigned int sampler, bool isSincRequired);   // NOTE(review): 'isSincRequired' looks like a misspelling of 'isSyncRequired'
+
+		void setPointSize(float pointSize);
+		void setPointSizeMin(float pointSizeMin);
+		void setPointSizeMax(float pointSizeMax);
+		void setPointScaleA(float pointScaleA);
+		void setPointScaleB(float pointScaleB);
+		void setPointScaleC(float pointScaleC);
+
+		void setTransformFeedbackQueryEnabled(bool enable);
+		void enableTransformFeedback(uint64_t enable);
+
+	protected:
+		const Matrix &getModelTransform(int i);
+		const Matrix &getViewTransform();
+
+		const State update(DrawType drawType);   // Builds the State for the current context, including its hash
+		Routine *routine(const State &state);   // Looks the state up in routineCache and generates a routine on a miss
+
+		bool isFixedFunction();
+		void setRoutineCacheSize(int cacheSize);
+
+		// Shader constants
+		float4 c[VERTEX_UNIFORM_VECTORS + 1];   // One extra for indices out of range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}
+		int4 i[16];
+		bool b[16];
+
+		PointSprite point;
+		FixedFunction ff;
+
+	private:
+		struct UniformBufferInfo   // Buffer/offset pair for one uniform buffer slot
+		{
+			UniformBufferInfo();
+
+			Resource* buffer;
+			int offset;
+		};
+		UniformBufferInfo uniformBufferInfo[MAX_UNIFORM_BUFFER_BINDINGS];
+
+		struct TransformFeedbackInfo   // Fields mirror the arguments of setTransformFeedbackBuffer()
+		{
+			TransformFeedbackInfo();
+
+			Resource* buffer;
+			unsigned int offset;
+			unsigned int reg;
+			unsigned int row;
+			unsigned int col;
+			unsigned int stride;
+		};
+		TransformFeedbackInfo transformFeedbackInfo[MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS];
+
+		void updateTransform();
+		void setTransform(const Matrix &M, int i);
+		void setCameraTransform(const Matrix &M, int i);
+		void setNormalTransform(const Matrix &M, int i);
+
+		Context *const context;   // The pointer itself is const, so it is fixed at construction
+
+		RoutineCache<State> *routineCache;   // Queried and filled by routine()
+
+	protected:
+		Matrix M[12];      // Model/Geometry/World matrix
+		Matrix V;          // View/Camera/Eye matrix
+		Matrix B;          // Base matrix
+		Matrix P;          // Projection matrix
+		Matrix PB;         // P * B
+		Matrix PBV;        // P * B * V
+		Matrix PBVM[12];   // P * B * V * M
+
+		// Update hierarchy
+		bool updateMatrix;
+		bool updateModelMatrix[12];   // One flag per entry of M[12]
+		bool updateViewMatrix;
+		bool updateBaseMatrix;
+		bool updateProjectionMatrix;
+		bool updateLighting;
+	};
+}
+
+#endif   // sw_VertexProcessor_hpp
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
new file mode 100644
index 0000000..06dda32
--- /dev/null
+++ b/src/Pipeline/Constants.cpp
@@ -0,0 +1,362 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Constants.hpp"
+
+#include "Common/Math.hpp"
+#include "Common/Half.hpp"
+
+#include <memory.h>
+
+namespace sw
+{
+	Constants constants;
+
+	Constants::Constants()   // Fills in every lookup table of this object; nothing is computed lazily afterwards.
+	{
+		static const unsigned int transposeBit0[16] =   // Entry n: bit k of n is placed at bit 0 of hex digit k
+		{
+			0x00000000,
+			0x00000001,
+			0x00000010,
+			0x00000011,
+			0x00000100,
+			0x00000101,
+			0x00000110,
+			0x00000111,
+			0x00001000,
+			0x00001001,
+			0x00001010,
+			0x00001011,
+			0x00001100,
+			0x00001101,
+			0x00001110,
+			0x00001111
+		};
+
+		static const unsigned int transposeBit1[16] =   // Same pattern, one bit higher within each hex digit
+		{
+			0x00000000,
+			0x00000002,
+			0x00000020,
+			0x00000022,
+			0x00000200,
+			0x00000202,
+			0x00000220,
+			0x00000222,
+			0x00002000,
+			0x00002002,
+			0x00002020,
+			0x00002022,
+			0x00002200,
+			0x00002202,
+			0x00002220,
+			0x00002222
+		};
+
+		static const unsigned int transposeBit2[16] =   // Same pattern, two bits higher within each hex digit
+		{
+			0x00000000,
+			0x00000004,
+			0x00000040,
+			0x00000044,
+			0x00000400,
+			0x00000404,
+			0x00000440,
+			0x00000444,
+			0x00004000,
+			0x00004004,
+			0x00004040,
+			0x00004044,
+			0x00004400,
+			0x00004404,
+			0x00004440,
+			0x00004444
+		};
+
+		memcpy(&this->transposeBit0, transposeBit0, sizeof(transposeBit0));   // The locals shadow the members, hence this->
+		memcpy(&this->transposeBit1, transposeBit1, sizeof(transposeBit1));
+		memcpy(&this->transposeBit2, transposeBit2, sizeof(transposeBit2));
+
+		static const ushort4 cWeight[17] =   // Entry n is roughly 0x10000 / n, capped at 0xFFFF; entry 0 duplicates entry 1
+		{
+			{0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF},   // 0xFFFF / 1  = 0xFFFF
+			{0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF},   // 0xFFFF / 1  = 0xFFFF
+			{0x8000, 0x8000, 0x8000, 0x8000},   // 0xFFFF / 2  = 0x8000
+			{0x5555, 0x5555, 0x5555, 0x5555},   // 0xFFFF / 3  = 0x5555
+			{0x4000, 0x4000, 0x4000, 0x4000},   // 0xFFFF / 4  = 0x4000
+			{0x3333, 0x3333, 0x3333, 0x3333},   // 0xFFFF / 5  = 0x3333
+			{0x2AAA, 0x2AAA, 0x2AAA, 0x2AAA},   // 0xFFFF / 6  = 0x2AAA
+			{0x2492, 0x2492, 0x2492, 0x2492},   // 0xFFFF / 7  = 0x2492
+			{0x2000, 0x2000, 0x2000, 0x2000},   // 0xFFFF / 8  = 0x2000
+			{0x1C71, 0x1C71, 0x1C71, 0x1C71},   // 0xFFFF / 9  = 0x1C71
+			{0x1999, 0x1999, 0x1999, 0x1999},   // 0xFFFF / 10 = 0x1999
+			{0x1745, 0x1745, 0x1745, 0x1745},   // 0xFFFF / 11 = 0x1745
+			{0x1555, 0x1555, 0x1555, 0x1555},   // 0xFFFF / 12 = 0x1555
+			{0x13B1, 0x13B1, 0x13B1, 0x13B1},   // 0xFFFF / 13 = 0x13B1
+			{0x1249, 0x1249, 0x1249, 0x1249},   // 0xFFFF / 14 = 0x1249
+			{0x1111, 0x1111, 0x1111, 0x1111},   // 0xFFFF / 15 = 0x1111
+			{0x1000, 0x1000, 0x1000, 0x1000},   // 0xFFFF / 16 = 0x1000
+		};
+
+		static const float4 uvWeight[17] =   // Entry n is 1 / n in all four lanes; entry 0 duplicates entry 1
+		{
+			{1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f},
+			{1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f,  1.0f / 1.0f},
+			{1.0f / 2.0f,  1.0f / 2.0f,  1.0f / 2.0f,  1.0f / 2.0f},
+			{1.0f / 3.0f,  1.0f / 3.0f,  1.0f / 3.0f,  1.0f / 3.0f},
+			{1.0f / 4.0f,  1.0f / 4.0f,  1.0f / 4.0f,  1.0f / 4.0f},
+			{1.0f / 5.0f,  1.0f / 5.0f,  1.0f / 5.0f,  1.0f / 5.0f},
+			{1.0f / 6.0f,  1.0f / 6.0f,  1.0f / 6.0f,  1.0f / 6.0f},
+			{1.0f / 7.0f,  1.0f / 7.0f,  1.0f / 7.0f,  1.0f / 7.0f},
+			{1.0f / 8.0f,  1.0f / 8.0f,  1.0f / 8.0f,  1.0f / 8.0f},
+			{1.0f / 9.0f,  1.0f / 9.0f,  1.0f / 9.0f,  1.0f / 9.0f},
+			{1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f, 1.0f / 10.0f},
+			{1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f, 1.0f / 11.0f},
+			{1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f, 1.0f / 12.0f},
+			{1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f, 1.0f / 13.0f},
+			{1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f, 1.0f / 14.0f},
+			{1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f},
+			{1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f},
+		};
+
+		static const float4 uvStart[17] =   // Entry n is -(n - 1) / (2 * n) in all four lanes; entry 0 duplicates entry 1
+		{
+			{-0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f},
+			{-0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f,   -0.0f / 2.0f},
+			{-1.0f / 4.0f,   -1.0f / 4.0f,   -1.0f / 4.0f,   -1.0f / 4.0f},
+			{-2.0f / 6.0f,   -2.0f / 6.0f,   -2.0f / 6.0f,   -2.0f / 6.0f},
+			{-3.0f / 8.0f,   -3.0f / 8.0f,   -3.0f / 8.0f,   -3.0f / 8.0f},
+			{-4.0f / 10.0f,  -4.0f / 10.0f,  -4.0f / 10.0f,  -4.0f / 10.0f},
+			{-5.0f / 12.0f,  -5.0f / 12.0f,  -5.0f / 12.0f,  -5.0f / 12.0f},
+			{-6.0f / 14.0f,  -6.0f / 14.0f,  -6.0f / 14.0f,  -6.0f / 14.0f},
+			{-7.0f / 16.0f,  -7.0f / 16.0f,  -7.0f / 16.0f,  -7.0f / 16.0f},
+			{-8.0f / 18.0f,  -8.0f / 18.0f,  -8.0f / 18.0f,  -8.0f / 18.0f},
+			{-9.0f / 20.0f,  -9.0f / 20.0f,  -9.0f / 20.0f,  -9.0f / 20.0f},
+			{-10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f, -10.0f / 22.0f},
+			{-11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f, -11.0f / 24.0f},
+			{-12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f, -12.0f / 26.0f},
+			{-13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f, -13.0f / 28.0f},
+			{-14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f, -14.0f / 30.0f},
+			{-15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f, -15.0f / 32.0f},
+		};
+
+		memcpy(&this->cWeight, cWeight, sizeof(cWeight));
+		memcpy(&this->uvWeight, uvWeight, sizeof(uvWeight));
+		memcpy(&this->uvStart, uvStart, sizeof(uvStart));
+
+		static const unsigned int occlusionCount[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};   // Number of set bits in the 4-bit index
+
+		memcpy(&this->occlusionCount, &occlusionCount, sizeof(occlusionCount));
+
+		for(int i = 0; i < 16; i++)   // i is a 4-bit mask; -(bit) yields all ones for a set bit and zero otherwise
+		{
+			maskB4Q[i][0] = -(i >> 0 & 1);
+			maskB4Q[i][1] = -(i >> 1 & 1);
+			maskB4Q[i][2] = -(i >> 2 & 1);
+			maskB4Q[i][3] = -(i >> 3 & 1);
+			maskB4Q[i][4] = -(i >> 0 & 1);   // Elements 4..7 repeat the pattern of elements 0..3
+			maskB4Q[i][5] = -(i >> 1 & 1);
+			maskB4Q[i][6] = -(i >> 2 & 1);
+			maskB4Q[i][7] = -(i >> 3 & 1);
+
+			invMaskB4Q[i][0] = ~maskB4Q[i][0];   // Every invMask* table is the bitwise complement of its mask* table
+			invMaskB4Q[i][1] = ~maskB4Q[i][1];
+			invMaskB4Q[i][2] = ~maskB4Q[i][2];
+			invMaskB4Q[i][3] = ~maskB4Q[i][3];
+			invMaskB4Q[i][4] = ~maskB4Q[i][4];
+			invMaskB4Q[i][5] = ~maskB4Q[i][5];
+			invMaskB4Q[i][6] = ~maskB4Q[i][6];
+			invMaskB4Q[i][7] = ~maskB4Q[i][7];
+
+			maskW4Q[i][0] = -(i >> 0 & 1);
+			maskW4Q[i][1] = -(i >> 1 & 1);
+			maskW4Q[i][2] = -(i >> 2 & 1);
+			maskW4Q[i][3] = -(i >> 3 & 1);
+
+			invMaskW4Q[i][0] = ~maskW4Q[i][0];
+			invMaskW4Q[i][1] = ~maskW4Q[i][1];
+			invMaskW4Q[i][2] = ~maskW4Q[i][2];
+			invMaskW4Q[i][3] = ~maskW4Q[i][3];
+
+			maskD4X[i][0] = -(i >> 0 & 1);
+			maskD4X[i][1] = -(i >> 1 & 1);
+			maskD4X[i][2] = -(i >> 2 & 1);
+			maskD4X[i][3] = -(i >> 3 & 1);
+
+			invMaskD4X[i][0] = ~maskD4X[i][0];
+			invMaskD4X[i][1] = ~maskD4X[i][1];
+			invMaskD4X[i][2] = ~maskD4X[i][2];
+			invMaskD4X[i][3] = ~maskD4X[i][3];
+
+			maskQ0Q[i] = -(i >> 0 & 1);   // maskQkQ: a single value selected by bit k of i
+			maskQ1Q[i] = -(i >> 1 & 1);
+			maskQ2Q[i] = -(i >> 2 & 1);
+			maskQ3Q[i] = -(i >> 3 & 1);
+
+			invMaskQ0Q[i] = ~maskQ0Q[i];
+			invMaskQ1Q[i] = ~maskQ1Q[i];
+			invMaskQ2Q[i] = ~maskQ2Q[i];
+			invMaskQ3Q[i] = ~maskQ3Q[i];
+
+			maskX0X[i][0] = maskX0X[i][1] = maskX0X[i][2] = maskX0X[i][3] = -(i >> 0 & 1);   // maskXkX: bit k of i replicated to all four elements
+			maskX1X[i][0] = maskX1X[i][1] = maskX1X[i][2] = maskX1X[i][3] = -(i >> 1 & 1);
+			maskX2X[i][0] = maskX2X[i][1] = maskX2X[i][2] = maskX2X[i][3] = -(i >> 2 & 1);
+			maskX3X[i][0] = maskX3X[i][1] = maskX3X[i][2] = maskX3X[i][3] = -(i >> 3 & 1);
+
+			invMaskX0X[i][0] = invMaskX0X[i][1] = invMaskX0X[i][2] = invMaskX0X[i][3] = ~maskX0X[i][0];
+			invMaskX1X[i][0] = invMaskX1X[i][1] = invMaskX1X[i][2] = invMaskX1X[i][3] = ~maskX1X[i][0];
+			invMaskX2X[i][0] = invMaskX2X[i][1] = invMaskX2X[i][2] = invMaskX2X[i][3] = ~maskX2X[i][0];
+			invMaskX3X[i][0] = invMaskX3X[i][1] = invMaskX3X[i][2] = invMaskX3X[i][3] = ~maskX3X[i][0];
+
+			maskD01Q[i][0] = -(i >> 0 & 1);   // *01* tables use bits 0 and 1 of i, *23* tables use bits 2 and 3
+			maskD01Q[i][1] = -(i >> 1 & 1);
+			maskD23Q[i][0] = -(i >> 2 & 1);
+			maskD23Q[i][1] = -(i >> 3 & 1);
+
+			invMaskD01Q[i][0] = ~maskD01Q[i][0];
+			invMaskD01Q[i][1] = ~maskD01Q[i][1];
+			invMaskD23Q[i][0] = ~maskD23Q[i][0];
+			invMaskD23Q[i][1] = ~maskD23Q[i][1];
+
+			maskQ01X[i][0] = -(i >> 0 & 1);
+			maskQ01X[i][1] = -(i >> 1 & 1);
+			maskQ23X[i][0] = -(i >> 2 & 1);
+			maskQ23X[i][1] = -(i >> 3 & 1);
+
+			invMaskQ01X[i][0] = ~maskQ01X[i][0];
+			invMaskQ01X[i][1] = ~maskQ01X[i][1];
+			invMaskQ23X[i][0] = ~maskQ23X[i][0];
+			invMaskQ23X[i][1] = ~maskQ23X[i][1];
+		}
+
+		for(int i = 0; i < 8; i++)   // Bits 0/1/2 of i enable the low 5-bit, middle 6-bit and high 5-bit fields of a 16-bit value
+		{
+			mask565Q[i][0] =
+			mask565Q[i][1] =
+			mask565Q[i][2] =
+			mask565Q[i][3] = (i & 0x1 ? 0x001F : 0) | (i & 0x2 ? 0x07E0 : 0) | (i & 0x4 ? 0xF800 : 0);
+		}
+
+		for(int i = 0; i < 4; i++)   // Two-bit masks: elements 0/2 follow bit 0 of i, elements 1/3 follow bit 1
+		{
+			maskW01Q[i][0] =  -(i >> 0 & 1);
+			maskW01Q[i][1] =  -(i >> 1 & 1);
+			maskW01Q[i][2] =  -(i >> 0 & 1);
+			maskW01Q[i][3] =  -(i >> 1 & 1);
+
+			maskD01X[i][0] =  -(i >> 0 & 1);
+			maskD01X[i][1] =  -(i >> 1 & 1);
+			maskD01X[i][2] =  -(i >> 0 & 1);
+			maskD01X[i][3] =  -(i >> 1 & 1);
+		}
+
+		for(int i = 0; i < 256; i++)   // 8-bit input normalized by 0xFF, result scaled to 16 bits and rounded
+		{
+			sRGBtoLinear8_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0xFF) * 0xFFFF + 0.5f);
+		}
+
+		for(int i = 0; i < 64; i++)   // Same conversion for 6-bit input
+		{
+			sRGBtoLinear6_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0x3F) * 0xFFFF + 0.5f);
+		}
+
+		for(int i = 0; i < 32; i++)   // Same conversion for 5-bit input
+		{
+			sRGBtoLinear5_16[i] = (unsigned short)(sw::sRGBtoLinear((float)i / 0x1F) * 0xFFFF + 0.5f);
+		}
+
+		for(int i = 0; i < 0x1000; i++)   // 12-bit input, both directions; results are clamped to [0, 0xFFFF] before truncation
+		{
+			linearToSRGB12_16[i] = (unsigned short)(clamp(sw::linearToSRGB((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+			sRGBtoLinear12_16[i] = (unsigned short)(clamp(sw::sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));
+		}
+
+		for(int q = 0; q < 4; q++)   // q indexes the offset tables, c is a 4-bit mask, i is the lane
+		{
+			for(int c = 0; c < 16; c++)
+			{
+				for(int i = 0; i < 4; i++)
+				{
+					const float X[4] = {+0.3125f, -0.3125f, -0.1250f, +0.1250f};   // NOTE(review): opposite sign of the member X table filled further down - presumably intentional, confirm
+					const float Y[4] = {+0.1250f, -0.1250f, +0.3125f, -0.3125f};   // NOTE(review): opposite sign of the member Y table filled further down - presumably intentional, confirm
+
+					sampleX[q][c][i] = c & (1 << i) ? X[q] : 0.0f;   // Lane i gets the offset only when bit i of c is set
+					sampleY[q][c][i] = c & (1 << i) ? Y[q] : 0.0f;
+					weight[c][i] = c & (1 << i) ? 1.0f : 0.0f;   // Does not depend on q, so the same value is rewritten on every q iteration
+				}
+			}
+		}
+
+		const int Xf[4] = {-5, +5, +2, -2};   // Fragment offsets
+		const int Yf[4] = {-2, +2, -5, +5};   // Fragment offsets
+
+		memcpy(&this->Xf, &Xf, sizeof(Xf));
+		memcpy(&this->Yf, &Yf, sizeof(Yf));
+
+		static const float4 X[4] = {{-0.3125f, -0.3125f, -0.3125f, -0.3125f},   // Equal to Xf / 16, replicated to four lanes
+					                {+0.3125f, +0.3125f, +0.3125f, +0.3125f},
+					                {+0.1250f, +0.1250f, +0.1250f, +0.1250f},
+					                {-0.1250f, -0.1250f, -0.1250f, -0.1250f}};
+
+		static const float4 Y[4] = {{-0.1250f, -0.1250f, -0.1250f, -0.1250f},   // Equal to Yf / 16, replicated to four lanes
+		                            {+0.1250f, +0.1250f, +0.1250f, +0.1250f},
+		                            {-0.3125f, -0.3125f, -0.3125f, -0.3125f},
+		                            {+0.3125f, +0.3125f, +0.3125f, +0.3125f}};
+
+		memcpy(&this->X, &X, sizeof(X));
+		memcpy(&this->Y, &Y, sizeof(Y));
+
+		const dword maxX[16] = {0x00000000, 0x00000001, 0x00000100, 0x00000101, 0x00010000, 0x00010001, 0x00010100, 0x00010101, 0x01000000, 0x01000001, 0x01000100, 0x01000101, 0x01010000, 0x01010001, 0x01010100, 0x01010101};   // Bit k of the index goes to bit 0 of byte k
+		const dword maxY[16] = {0x00000000, 0x00000002, 0x00000200, 0x00000202, 0x00020000, 0x00020002, 0x00020200, 0x00020202, 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02020000, 0x02020002, 0x02020200, 0x02020202};   // ... bit 1 of byte k
+		const dword maxZ[16] = {0x00000000, 0x00000004, 0x00000400, 0x00000404, 0x00040000, 0x00040004, 0x00040400, 0x00040404, 0x04000000, 0x04000004, 0x04000400, 0x04000404, 0x04040000, 0x04040004, 0x04040400, 0x04040404};   // ... bit 2 of byte k
+		const dword minX[16] = {0x00000000, 0x00000008, 0x00000800, 0x00000808, 0x00080000, 0x00080008, 0x00080800, 0x00080808, 0x08000000, 0x08000008, 0x08000800, 0x08000808, 0x08080000, 0x08080008, 0x08080800, 0x08080808};   // ... bit 3 of byte k
+		const dword minY[16] = {0x00000000, 0x00000010, 0x00001000, 0x00001010, 0x00100000, 0x00100010, 0x00101000, 0x00101010, 0x10000000, 0x10000010, 0x10001000, 0x10001010, 0x10100000, 0x10100010, 0x10101000, 0x10101010};   // ... bit 4 of byte k
+		const dword minZ[16] = {0x00000000, 0x00000020, 0x00002000, 0x00002020, 0x00200000, 0x00200020, 0x00202000, 0x00202020, 0x20000000, 0x20000020, 0x20002000, 0x20002020, 0x20200000, 0x20200020, 0x20202000, 0x20202020};   // ... bit 5 of byte k
+		const dword fini[16] = {0x00000000, 0x00000080, 0x00008000, 0x00008080, 0x00800000, 0x00800080, 0x00808000, 0x00808080, 0x80000000, 0x80000080, 0x80008000, 0x80008080, 0x80800000, 0x80800080, 0x80808000, 0x80808080};   // ... bit 7 of byte k
+
+		memcpy(&this->maxX, &maxX, sizeof(maxX));
+		memcpy(&this->maxY, &maxY, sizeof(maxY));
+		memcpy(&this->maxZ, &maxZ, sizeof(maxZ));
+		memcpy(&this->minX, &minX, sizeof(minX));
+		memcpy(&this->minY, &minY, sizeof(minY));
+		memcpy(&this->minZ, &minZ, sizeof(minZ));
+		memcpy(&this->fini, &fini, sizeof(fini));
+
+		static const dword4 maxPos = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFE};   // As IEEE-754 bit patterns: largest finite float, with the last element one step below it
+
+		memcpy(&this->maxPos, &maxPos, sizeof(maxPos));
+
+		static const float4 unscaleByte = {1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF, 1.0f / 0xFF};   // unscale*: reciprocal of the divisor shown, in all four lanes
+		static const float4 unscaleSByte = {1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F, 1.0f / 0x7F};
+		static const float4 unscaleShort = {1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF};
+		static const float4 unscaleUShort = {1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF};
+		static const float4 unscaleInt = {1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF, 1.0f / 0x7FFFFFFF};
+		static const float4 unscaleUInt = {1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF, 1.0f / 0xFFFFFFFF};
+		static const float4 unscaleFixed = {1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000, 1.0f / 0x00010000};
+
+		memcpy(&this->unscaleByte, &unscaleByte, sizeof(unscaleByte));
+		memcpy(&this->unscaleSByte, &unscaleSByte, sizeof(unscaleSByte));
+		memcpy(&this->unscaleShort, &unscaleShort, sizeof(unscaleShort));
+		memcpy(&this->unscaleUShort, &unscaleUShort, sizeof(unscaleUShort));
+		memcpy(&this->unscaleInt, &unscaleInt, sizeof(unscaleInt));
+		memcpy(&this->unscaleUInt, &unscaleUInt, sizeof(unscaleUInt));
+		memcpy(&this->unscaleFixed, &unscaleFixed, sizeof(unscaleFixed));
+
+		for(int i = 0; i <= 0xFFFF; i++)   // One table entry for every 16-bit pattern
+		{
+			half2float[i] = (float)reinterpret_cast<half&>(i);   // NOTE(review): reads the first bytes of the int as a half, so this relies on little-endian layout (and on half being 16 bits) - confirm
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
new file mode 100644
index 0000000..6b70e04
--- /dev/null
+++ b/src/Pipeline/Constants.hpp
@@ -0,0 +1,113 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Constants_hpp
+#define sw_Constants_hpp
+
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	struct Constants   // Lookup tables filled in by the constructor; one instance is declared as sw::constants.
+	{
+		Constants();
+
+		unsigned int transposeBit0[16];   // Indexed by a 4-bit value
+		unsigned int transposeBit1[16];
+		unsigned int transposeBit2[16];
+
+		ushort4 cWeight[17];   // 17 entries: indices 0..16
+		float4 uvWeight[17];
+		float4 uvStart[17];
+
+		unsigned int occlusionCount[16];   // Set-bit count of the 4-bit index
+
+		byte8 maskB4Q[16];   // mask*/invMask* tables with 16 entries are indexed by a 4-bit mask
+		byte8 invMaskB4Q[16];
+		word4 maskW4Q[16];
+		word4 invMaskW4Q[16];
+		dword4 maskD4X[16];
+		dword4 invMaskD4X[16];
+		qword maskQ0Q[16];
+		qword maskQ1Q[16];
+		qword maskQ2Q[16];
+		qword maskQ3Q[16];
+		qword invMaskQ0Q[16];
+		qword invMaskQ1Q[16];
+		qword invMaskQ2Q[16];
+		qword invMaskQ3Q[16];
+		dword4 maskX0X[16];
+		dword4 maskX1X[16];
+		dword4 maskX2X[16];
+		dword4 maskX3X[16];
+		dword4 invMaskX0X[16];
+		dword4 invMaskX1X[16];
+		dword4 invMaskX2X[16];
+		dword4 invMaskX3X[16];
+		dword2 maskD01Q[16];
+		dword2 maskD23Q[16];
+		dword2 invMaskD01Q[16];
+		dword2 invMaskD23Q[16];
+		qword2 maskQ01X[16];
+		qword2 maskQ23X[16];
+		qword2 invMaskQ01X[16];
+		qword2 invMaskQ23X[16];
+		word4 maskW01Q[4];   // Indexed by a 2-bit mask
+		dword4 maskD01X[4];   // Indexed by a 2-bit mask
+		word4 mask565Q[8];   // Indexed by a 3-bit mask, one bit per field of a 5-6-5 packed value
+
+		unsigned short sRGBtoLinear8_16[256];   // Tables named A<bits>_16 map a <bits>-wide input to a 16-bit result
+		unsigned short sRGBtoLinear6_16[64];
+		unsigned short sRGBtoLinear5_16[32];
+
+		unsigned short linearToSRGB12_16[4096];
+		unsigned short sRGBtoLinear12_16[4096];
+
+		// Centroid parameters
+		float4 sampleX[4][16];   // [offset index][4-bit mask], one lane per mask bit
+		float4 sampleY[4][16];
+		float4 weight[16];   // [4-bit mask]: 1.0 in lanes whose mask bit is set, 0.0 otherwise
+
+		// Fragment offsets
+		int Xf[4];   // Integer offsets; X/Y below hold the same offsets divided by 16
+		int Yf[4];
+
+		float4 X[4];
+		float4 Y[4];
+
+		dword maxX[16];   // Indexed by a 4-bit value; each index bit lands in its own byte
+		dword maxY[16];
+		dword maxZ[16];
+		dword minX[16];
+		dword minY[16];
+		dword minZ[16];
+		dword fini[16];
+
+		dword4 maxPos;   // Stored as raw 32-bit patterns
+
+		float4 unscaleByte;   // unscale*: reciprocal of the corresponding maximum integer value (2^16 for unscaleFixed)
+		float4 unscaleSByte;
+		float4 unscaleShort;
+		float4 unscaleUShort;
+		float4 unscaleInt;
+		float4 unscaleUInt;
+		float4 unscaleFixed;
+
+		float half2float[65536];   // One float for every 16-bit half pattern
+	};
+
+	extern Constants constants;
+}
+
+#endif   // sw_Constants_hpp
diff --git a/src/Pipeline/PixelPipeline.cpp b/src/Pipeline/PixelPipeline.cpp
new file mode 100644
index 0000000..d4faebd
--- /dev/null
+++ b/src/Pipeline/PixelPipeline.cpp
@@ -0,0 +1,1959 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelPipeline.hpp"
+#include "SamplerCore.hpp"
+#include "Renderer/Renderer.hpp"
+
+namespace sw
+{
+	extern bool postBlendSRGB;   // Declaration only (no definition in this file); rasterOperation() checks it together with state.writeSRGB.
+
+	void PixelPipeline::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
+	{
+		// Diffuse is taken from v[0]; a component whose bit is clear in state.color[0].component defaults to 0x1000.
+		if(!(state.color[0].component & 0x1)) diffuse.x = Short4(0x1000); else diffuse.x = convertFixed12(v[0].x);
+		if(!(state.color[0].component & 0x2)) diffuse.y = Short4(0x1000); else diffuse.y = convertFixed12(v[0].y);
+		if(!(state.color[0].component & 0x4)) diffuse.z = Short4(0x1000); else diffuse.z = convertFixed12(v[0].z);
+		if(!(state.color[0].component & 0x8)) diffuse.w = Short4(0x1000); else diffuse.w = convertFixed12(v[0].w);
+		// Specular is taken from v[1]; a component whose bit is clear in state.color[1].component defaults to 0x0000.
+		if(!(state.color[1].component & 0x1)) specular.x = Short4(0x0000); else specular.x = convertFixed12(v[1].x);
+		if(!(state.color[1].component & 0x2)) specular.y = Short4(0x0000); else specular.y = convertFixed12(v[1].y);
+		if(!(state.color[1].component & 0x4)) specular.z = Short4(0x0000); else specular.z = convertFixed12(v[1].z);
+		if(!(state.color[1].component & 0x8)) specular.w = Short4(0x0000); else specular.w = convertFixed12(v[1].w);
+	}
+
+	void PixelPipeline::fixedFunction()
+	{
+		// The cascade starts from the diffuse color; 'temp' starts out as all zeros.
+		current = diffuse;
+		Vector4s temp(0x0000, 0x0000, 0x0000, 0x0000);
+
+		// Walk the (up to eight) texture stages in order, stopping at the first disabled one.
+		for(int index = 0; index < 8; index++)
+		{
+			const TextureStage::State &stageState = state.textureStage[index];
+			if(stageState.stageOperation == TextureStage::STAGE_DISABLE) break;
+
+			// The texture is only sampled when the stage uses it; blendTexture() receives the variable either way.
+			Vector4s texel;
+			if(stageState.usesTexture)
+			{
+				texel = sampleTexture(index, index);
+			}
+			blendTexture(temp, texel, index);
+		}
+
+		// Hand the cascade result and the specular color to specularPixel().
+		specularPixel(current, specular);
+	}
+
+	void PixelPipeline::applyShader(Int cMask[4])   // Processes every shader instruction in order, or runs fixedFunction() when no shader is set.
+	{
+		if(!shader)
+		{
+			fixedFunction();
+			return;
+		}
+		// State carried across instructions while walking the shader.
+		int pad = 0;        // Count number of texm3x3pad instructions
+		Vector4s dPairing;   // Destination for first pairing instruction
+
+		for(size_t i = 0; i < shader->getLength(); i++)
+		{
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
+
+			//	#ifndef NDEBUG   // FIXME: Centralize debug output control
+			//		shader->printInstruction(i, "debug.txt");
+			//	#endif
+			// DCL/DEF/DEFI/DEFB instructions are skipped here; nothing below runs for them.
+			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
+			{
+				continue;
+			}
+
+			const Dst &dst = instruction->dst;
+			const Src &src0 = instruction->src[0];
+			const Src &src1 = instruction->src[1];
+			const Src &src2 = instruction->src[2];
+
+			unsigned short shaderModel = shader->getShaderModel();
+			bool pairing = i + 1 < shader->getLength() && shader->getInstruction(i + 1)->coissue;   // First instruction of pair
+			bool coissue = instruction->coissue;                                                              // Second instruction of pair
+
+			Vector4s d;
+			Vector4s s0;
+			Vector4s s1;
+			Vector4s s2;
+			// Only the source operands that are not PARAMETER_VOID are fetched.
+			if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
+			if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
+			if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
+			// x/y/z/w come from v[2 + n]: n is dst.index when shaderModel < 0x0104, and src0.index otherwise.
+			Float4 x = shaderModel < 0x0104 ? v[2 + dst.index].x : v[2 + src0.index].x;
+			Float4 y = shaderModel < 0x0104 ? v[2 + dst.index].y : v[2 + src0.index].y;
+			Float4 z = shaderModel < 0x0104 ? v[2 + dst.index].z : v[2 + src0.index].z;
+			Float4 w = shaderModel < 0x0104 ? v[2 + dst.index].w : v[2 + src0.index].w;
+
+			switch(opcode)
+			{
+			case Shader::OPCODE_PS_1_0: break;
+			case Shader::OPCODE_PS_1_1: break;
+			case Shader::OPCODE_PS_1_2: break;
+			case Shader::OPCODE_PS_1_3: break;
+			case Shader::OPCODE_PS_1_4: break;
+
+			case Shader::OPCODE_DEF:    break;
+
+			case Shader::OPCODE_NOP:    break;
+			case Shader::OPCODE_MOV: MOV(d, s0);         break;
+			case Shader::OPCODE_ADD: ADD(d, s0, s1);     break;
+			case Shader::OPCODE_SUB: SUB(d, s0, s1);     break;
+			case Shader::OPCODE_MAD: MAD(d, s0, s1, s2); break;
+			case Shader::OPCODE_MUL: MUL(d, s0, s1);     break;
+			case Shader::OPCODE_DP3: DP3(d, s0, s1);     break;
+			case Shader::OPCODE_DP4: DP4(d, s0, s1);     break;
+			case Shader::OPCODE_LRP: LRP(d, s0, s1, s2); break;
+			case Shader::OPCODE_TEXCOORD:
+				if(shaderModel < 0x0104)
+				{
+					TEXCOORD(d, x, y, z, dst.index);
+			}
+				else
+				{
+					if((src0.swizzle & 0x30) == 0x20)   // .xyz
+					{
+						TEXCRD(d, x, y, z, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+					}
+					else   // .xyw
+					{
+						TEXCRD(d, x, y, w, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+					}
+				}
+				break;
+			case Shader::OPCODE_TEXKILL:
+				if(shaderModel < 0x0104)
+				{
+					TEXKILL(cMask, x, y, z);
+				}
+				else if(shaderModel == 0x0104)
+				{
+					if(dst.type == Shader::PARAMETER_TEXTURE)
+					{
+						TEXKILL(cMask, x, y, z);
+					}
+					else
+					{
+						TEXKILL(cMask, rs[dst.index]);
+					}
+				}
+				else ASSERT(false);
+				break;
+			case Shader::OPCODE_TEX:
+				if(shaderModel < 0x0104)
+				{
+					TEX(d, x, y, z, dst.index, false);
+				}
+				else if(shaderModel == 0x0104)
+				{
+					if(src0.type == Shader::PARAMETER_TEXTURE)
+					{
+						if((src0.swizzle & 0x30) == 0x20)   // .xyz
+						{
+							TEX(d, x, y, z, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+						}
+						else   // .xyw
+						{
+							TEX(d, x, y, w, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+						}
+					}
+					else
+					{
+						TEXLD(d, s0, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
+					}
+				}
+				else ASSERT(false);
+				break;
+			case Shader::OPCODE_TEXBEM:       TEXBEM(d, s0, x, y, z, dst.index);                                             break;
+			case Shader::OPCODE_TEXBEML:      TEXBEML(d, s0, x, y, z, dst.index);                                            break;
+			case Shader::OPCODE_TEXREG2AR:    TEXREG2AR(d, s0, dst.index);                                                   break;
+			case Shader::OPCODE_TEXREG2GB:    TEXREG2GB(d, s0, dst.index);                                                   break;
+			case Shader::OPCODE_TEXM3X2PAD:   TEXM3X2PAD(x, y, z, s0, 0, src0.modifier == Shader::MODIFIER_SIGN);            break;
+			case Shader::OPCODE_TEXM3X2TEX:   TEXM3X2TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
+			case Shader::OPCODE_TEXM3X3PAD:   TEXM3X3PAD(x, y, z, s0, pad++ % 2, src0.modifier == Shader::MODIFIER_SIGN);    break;   // pad alternates 0, 1 across successive texm3x3pad instructions
+			case Shader::OPCODE_TEXM3X3TEX:   TEXM3X3TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
+			case Shader::OPCODE_TEXM3X3SPEC:  TEXM3X3SPEC(d, x, y, z, dst.index, s0, s1);                                    break;
+			case Shader::OPCODE_TEXM3X3VSPEC: TEXM3X3VSPEC(d, x, y, z, dst.index, s0);                                       break;
+			case Shader::OPCODE_CND:          CND(d, s0, s1, s2);                                                            break;
+			case Shader::OPCODE_TEXREG2RGB:   TEXREG2RGB(d, s0, dst.index);                                                  break;
+			case Shader::OPCODE_TEXDP3TEX:    TEXDP3TEX(d, x, y, z, dst.index, s0);                                          break;
+			case Shader::OPCODE_TEXM3X2DEPTH: TEXM3X2DEPTH(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);          break;
+			case Shader::OPCODE_TEXDP3:       TEXDP3(d, x, y, z, s0);                                                        break;
+			case Shader::OPCODE_TEXM3X3:      TEXM3X3(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);               break;
+			case Shader::OPCODE_TEXDEPTH:     TEXDEPTH();                                                                    break;
+			case Shader::OPCODE_CMP0:         CMP(d, s0, s1, s2);                                                            break;
+			case Shader::OPCODE_BEM:          BEM(d, s0, s1, dst.index);                                                     break;
+			case Shader::OPCODE_PHASE:                                                                                       break;
+			case Shader::OPCODE_END:                                                                                         break;
+			default:
+				ASSERT(false);
+			}
+			// Post-process and write back d for instructions that have a destination; TEXKILL is excluded.
+			if(dst.type != Shader::PARAMETER_VOID && opcode != Shader::OPCODE_TEXKILL)
+			{
+				if(dst.shift > 0)   // Positive shift: double the masked components one to three times with saturating adds
+				{
+					if(dst.mask & 0x1) { d.x = AddSat(d.x, d.x); if(dst.shift > 1) d.x = AddSat(d.x, d.x); if(dst.shift > 2) d.x = AddSat(d.x, d.x); }
+					if(dst.mask & 0x2) { d.y = AddSat(d.y, d.y); if(dst.shift > 1) d.y = AddSat(d.y, d.y); if(dst.shift > 2) d.y = AddSat(d.y, d.y); }
+					if(dst.mask & 0x4) { d.z = AddSat(d.z, d.z); if(dst.shift > 1) d.z = AddSat(d.z, d.z); if(dst.shift > 2) d.z = AddSat(d.z, d.z); }
+					if(dst.mask & 0x8) { d.w = AddSat(d.w, d.w); if(dst.shift > 1) d.w = AddSat(d.w, d.w); if(dst.shift > 2) d.w = AddSat(d.w, d.w); }
+				}
+				else if(dst.shift < 0)   // Negative shift: shift the masked components right by -dst.shift
+				{
+					if(dst.mask & 0x1) d.x = d.x >> -dst.shift;
+					if(dst.mask & 0x2) d.y = d.y >> -dst.shift;
+					if(dst.mask & 0x4) d.z = d.z >> -dst.shift;
+					if(dst.mask & 0x8) d.w = d.w >> -dst.shift;
+				}
+				// Saturate clamps the masked components to [0x0000, 0x1000].
+				if(dst.saturate)
+				{
+					if(dst.mask & 0x1) { d.x = Min(d.x, Short4(0x1000)); d.x = Max(d.x, Short4(0x0000)); }
+					if(dst.mask & 0x2) { d.y = Min(d.y, Short4(0x1000)); d.y = Max(d.y, Short4(0x0000)); }
+					if(dst.mask & 0x4) { d.z = Min(d.z, Short4(0x1000)); d.z = Max(d.z, Short4(0x0000)); }
+					if(dst.mask & 0x8) { d.w = Min(d.w, Short4(0x1000)); d.w = Max(d.w, Short4(0x0000)); }
+				}
+				// First instruction of a pair: keep the masked result in dPairing instead of writing it now.
+				if(pairing)
+				{
+					if(dst.mask & 0x1) dPairing.x = d.x;
+					if(dst.mask & 0x2) dPairing.y = d.y;
+					if(dst.mask & 0x4) dPairing.z = d.z;
+					if(dst.mask & 0x8) dPairing.w = d.w;
+				}
+				// Second instruction of a pair: write the held result to the previous instruction's destination.
+				if(coissue)
+				{
+					const Dst &dst = shader->getInstruction(i - 1)->dst;   // Shadows the outer dst inside this scope only
+
+					writeDestination(dPairing, dst);
+				}
+				// Any instruction that is not the first of a pair writes its own result to its own destination.
+				if(!pairing)
+				{
+					writeDestination(d, dst);
+				}
+			}
+		}
+		// After the last instruction, clamp each component of 'current' to [0x0000, 0x0FFF].
+		current.x = Min(current.x, Short4(0x0FFF)); current.x = Max(current.x, Short4(0x0000));
+		current.y = Min(current.y, Short4(0x0FFF)); current.y = Max(current.y, Short4(0x0000));
+		current.z = Min(current.z, Short4(0x0FFF)); current.z = Max(current.z, Short4(0x0000));
+		current.w = Min(current.w, Short4(0x0FFF)); current.w = Max(current.w, Short4(0x0000));
+	}
+
+	Bool PixelPipeline::alphaTest(Int cMask[4])
+	{
+		// With the alpha test inactive the result is true and cMask is left untouched.
+		if(!state.alphaTestActive()) return true;
+
+		Int aMask;
+		switch(state.transparencyAntialiasing)
+		{
+		case TRANSPARENCY_NONE:
+			// One mask is derived from current.w and ANDed into every sample's coverage mask.
+			PixelRoutine::alphaTest(aMask, current.w);
+			for(unsigned int sample = 0; sample < state.multiSample; sample++)
+			{
+				cMask[sample] &= aMask;
+			}
+			break;
+		case TRANSPARENCY_ALPHA_TO_COVERAGE:
+			{
+				// current.w scaled by 1/0x1000 is the float alpha passed to alphaToCoverage().
+				Float4 alpha = Float4(current.w) * Float4(1.0f / 0x1000);
+				alphaToCoverage(cMask, alpha);
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// The result is whether the OR of all sample coverage masks is non-zero.
+		Int pass = cMask[0];
+		for(unsigned int sample = 1; sample < state.multiSample; sample++)
+		{
+			pass = pass | cMask[sample];
+		}
+		return pass != 0x0;
+	}
+
+	void PixelPipeline::rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])   // Fogs, blends and writes 'current' to color buffer 0, once per sample.
+	{
+		if(!state.colorWriteActive(0))
+		{
+			return;
+		}
+
+		Vector4f oC;
+		// The target format picks the path: Vector4s for the integer formats, Vector4f (oC) for the 32-bit float formats.
+		switch(state.targetFormat[0])
+		{
+		case FORMAT_R5G6B5:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_A8:
+		case FORMAT_G16R16:
+		case FORMAT_A16B16G16R16:
+			if(!postBlendSRGB && state.writeSRGB)   // NOTE(review): presumably postBlendSRGB defers the sRGB conversion until after blending - confirm
+			{
+				linearToSRGB12_16(current);
+			}
+			else   // Otherwise each component is just shifted left by four bits
+			{
+				current.x <<= 4;
+				current.y <<= 4;
+				current.z <<= 4;
+				current.w <<= 4;
+			}
+			// For R5G6B5 only the top 5, 6 and 5 bits of x, y and z are kept.
+			if(state.targetFormat[0] == FORMAT_R5G6B5)
+			{
+				current.x &= Short4(0xF800u);
+				current.y &= Short4(0xFC00u);
+				current.z &= Short4(0xF800u);
+			}
+
+			fogBlend(current, fog);
+			// One pass per sample; sample q's buffer starts q * colorSliceB[0] bytes past cBuffer[0].
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
+				Vector4s color = current;
+				// Samples whose bit is clear in multiSampleMask are not blended or written.
+				if(state.multiSampleMask & (1 << q))
+				{
+					alphaBlend(0, buffer, color, x);
+					logicOperation(0, buffer, color, x);
+					writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+				}
+			}
+			break;
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+	//	case FORMAT_X32B32G32R32F_UNSIGNED:   // Not renderable in any fixed-function API.
+			convertSigned12(oC, current);
+			PixelRoutine::fogBlend(oC, fog);
+			// Same per-sample loop as the integer path, but without the logicOperation() step.
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
+				Vector4f color = oC;
+
+				if(state.multiSampleMask & (1 << q))
+				{
+					alphaBlend(0, buffer, color, x);
+					writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+				}
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	void PixelPipeline::blendTexture(Vector4s &temp, Vector4s &texture, int stage)
+	{
+		Vector4s *arg1 = nullptr;
+		Vector4s *arg2 = nullptr;
+		Vector4s *arg3 = nullptr;
+		Vector4s res;
+
+		Vector4s constant;
+		Vector4s tfactor;
+
+		const TextureStage::State &textureStage = state.textureStage[stage];
+
+		if(textureStage.firstArgument == TextureStage::SOURCE_CONSTANT ||
+		   textureStage.firstArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
+		   textureStage.secondArgument == TextureStage::SOURCE_CONSTANT ||
+		   textureStage.secondArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
+		   textureStage.thirdArgument == TextureStage::SOURCE_CONSTANT ||
+		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_CONSTANT)
+		{
+			constant.x = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[0]));
+			constant.y = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[1]));
+			constant.z = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[2]));
+			constant.w = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[3]));
+		}
+
+		if(textureStage.firstArgument == TextureStage::SOURCE_TFACTOR ||
+		   textureStage.firstArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
+		   textureStage.secondArgument == TextureStage::SOURCE_TFACTOR ||
+		   textureStage.secondArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
+		   textureStage.thirdArgument == TextureStage::SOURCE_TFACTOR ||
+		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_TFACTOR)
+		{
+			tfactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[0]));
+			tfactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[1]));
+			tfactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[2]));
+			tfactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]));
+		}
+
+		// Premodulate
+		if(stage > 0 && textureStage.usesTexture)
+		{
+			if(state.textureStage[stage - 1].stageOperation == TextureStage::STAGE_PREMODULATE)
+			{
+				current.x = MulHigh(current.x, texture.x) << 4;
+				current.y = MulHigh(current.y, texture.y) << 4;
+				current.z = MulHigh(current.z, texture.z) << 4;
+			}
+
+			if(state.textureStage[stage - 1].stageOperationAlpha == TextureStage::STAGE_PREMODULATE)
+			{
+				current.w = MulHigh(current.w, texture.w) << 4;
+			}
+		}
+
+		if(luminance)
+		{
+			texture.x = MulHigh(texture.x, L) << 4;
+			texture.y = MulHigh(texture.y, L) << 4;
+			texture.z = MulHigh(texture.z, L) << 4;
+
+			luminance = false;
+		}
+
+		switch(textureStage.firstArgument)
+		{
+		case TextureStage::SOURCE_TEXTURE:	arg1 = &texture;    break;
+		case TextureStage::SOURCE_CONSTANT:	arg1 = &constant;   break;
+		case TextureStage::SOURCE_CURRENT:	arg1 = &current;  break;
+		case TextureStage::SOURCE_DIFFUSE:	arg1 = &diffuse;  break;
+		case TextureStage::SOURCE_SPECULAR:	arg1 = &specular; break;
+		case TextureStage::SOURCE_TEMP:		arg1 = &temp;       break;
+		case TextureStage::SOURCE_TFACTOR:	arg1 = &tfactor;    break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.secondArgument)
+		{
+		case TextureStage::SOURCE_TEXTURE:	arg2 = &texture;    break;
+		case TextureStage::SOURCE_CONSTANT:	arg2 = &constant;   break;
+		case TextureStage::SOURCE_CURRENT:	arg2 = &current;  break;
+		case TextureStage::SOURCE_DIFFUSE:	arg2 = &diffuse;  break;
+		case TextureStage::SOURCE_SPECULAR:	arg2 = &specular; break;
+		case TextureStage::SOURCE_TEMP:		arg2 = &temp;       break;
+		case TextureStage::SOURCE_TFACTOR:	arg2 = &tfactor;    break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.thirdArgument)
+		{
+		case TextureStage::SOURCE_TEXTURE:	arg3 = &texture;    break;
+		case TextureStage::SOURCE_CONSTANT:	arg3 = &constant;   break;
+		case TextureStage::SOURCE_CURRENT:	arg3 = &current;  break;
+		case TextureStage::SOURCE_DIFFUSE:	arg3 = &diffuse;  break;
+		case TextureStage::SOURCE_SPECULAR:	arg3 = &specular; break;
+		case TextureStage::SOURCE_TEMP:		arg3 = &temp;       break;
+		case TextureStage::SOURCE_TFACTOR:	arg3 = &tfactor;    break;
+		default:
+			ASSERT(false);
+		}
+
+		Vector4s mod1;
+		Vector4s mod2;
+		Vector4s mod3;
+
+		switch(textureStage.firstModifier)
+		{
+		case TextureStage::MODIFIER_COLOR:
+			break;
+		case TextureStage::MODIFIER_INVCOLOR:
+			mod1.x = SubSat(Short4(0x1000), arg1->x);
+			mod1.y = SubSat(Short4(0x1000), arg1->y);
+			mod1.z = SubSat(Short4(0x1000), arg1->z);
+			mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+			arg1 = &mod1;
+			break;
+		case TextureStage::MODIFIER_ALPHA:
+			mod1.x = arg1->w;
+			mod1.y = arg1->w;
+			mod1.z = arg1->w;
+			mod1.w = arg1->w;
+
+			arg1 = &mod1;
+			break;
+		case TextureStage::MODIFIER_INVALPHA:
+			mod1.x = SubSat(Short4(0x1000), arg1->w);
+			mod1.y = SubSat(Short4(0x1000), arg1->w);
+			mod1.z = SubSat(Short4(0x1000), arg1->w);
+			mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+			arg1 = &mod1;
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.secondModifier)
+		{
+		case TextureStage::MODIFIER_COLOR:
+			break;
+		case TextureStage::MODIFIER_INVCOLOR:
+			mod2.x = SubSat(Short4(0x1000), arg2->x);
+			mod2.y = SubSat(Short4(0x1000), arg2->y);
+			mod2.z = SubSat(Short4(0x1000), arg2->z);
+			mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+			arg2 = &mod2;
+			break;
+		case TextureStage::MODIFIER_ALPHA:
+			mod2.x = arg2->w;
+			mod2.y = arg2->w;
+			mod2.z = arg2->w;
+			mod2.w = arg2->w;
+
+			arg2 = &mod2;
+			break;
+		case TextureStage::MODIFIER_INVALPHA:
+			mod2.x = SubSat(Short4(0x1000), arg2->w);
+			mod2.y = SubSat(Short4(0x1000), arg2->w);
+			mod2.z = SubSat(Short4(0x1000), arg2->w);
+			mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+			arg2 = &mod2;
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.thirdModifier)
+		{
+		case TextureStage::MODIFIER_COLOR:
+			break;
+		case TextureStage::MODIFIER_INVCOLOR:
+			mod3.x = SubSat(Short4(0x1000), arg3->x);
+			mod3.y = SubSat(Short4(0x1000), arg3->y);
+			mod3.z = SubSat(Short4(0x1000), arg3->z);
+			mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+			arg3 = &mod3;
+			break;
+		case TextureStage::MODIFIER_ALPHA:
+			mod3.x = arg3->w;
+			mod3.y = arg3->w;
+			mod3.z = arg3->w;
+			mod3.w = arg3->w;
+
+			arg3 = &mod3;
+			break;
+		case TextureStage::MODIFIER_INVALPHA:
+			mod3.x = SubSat(Short4(0x1000), arg3->w);
+			mod3.y = SubSat(Short4(0x1000), arg3->w);
+			mod3.z = SubSat(Short4(0x1000), arg3->w);
+			mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+			arg3 = &mod3;
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.stageOperation)
+		{
+		case TextureStage::STAGE_DISABLE:
+			break;
+		case TextureStage::STAGE_SELECTARG1: // Arg1
+			res.x = arg1->x;
+			res.y = arg1->y;
+			res.z = arg1->z;
+			break;
+		case TextureStage::STAGE_SELECTARG2: // Arg2
+			res.x = arg2->x;
+			res.y = arg2->y;
+			res.z = arg2->z;
+			break;
+		case TextureStage::STAGE_SELECTARG3: // Arg3
+			res.x = arg3->x;
+			res.y = arg3->y;
+			res.z = arg3->z;
+			break;
+		case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
+			res.x = MulHigh(arg1->x, arg2->x) << 4;
+			res.y = MulHigh(arg1->y, arg2->y) << 4;
+			res.z = MulHigh(arg1->z, arg2->z) << 4;
+			break;
+		case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
+			res.x = MulHigh(arg1->x, arg2->x) << 5;
+			res.y = MulHigh(arg1->y, arg2->y) << 5;
+			res.z = MulHigh(arg1->z, arg2->z) << 5;
+			break;
+		case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
+			res.x = MulHigh(arg1->x, arg2->x) << 6;
+			res.y = MulHigh(arg1->y, arg2->y) << 6;
+			res.z = MulHigh(arg1->z, arg2->z) << 6;
+			break;
+		case TextureStage::STAGE_ADD: // Arg1 + Arg2
+			res.x = AddSat(arg1->x, arg2->x);
+			res.y = AddSat(arg1->y, arg2->y);
+			res.z = AddSat(arg1->z, arg2->z);
+			break;
+		case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
+			res.x = AddSat(arg1->x, arg2->x);
+			res.y = AddSat(arg1->y, arg2->y);
+			res.z = AddSat(arg1->z, arg2->z);
+
+			res.x = SubSat(res.x, Short4(0x0800));
+			res.y = SubSat(res.y, Short4(0x0800));
+			res.z = SubSat(res.z, Short4(0x0800));
+			break;
+		case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
+			res.x = AddSat(arg1->x, arg2->x);
+			res.y = AddSat(arg1->y, arg2->y);
+			res.z = AddSat(arg1->z, arg2->z);
+
+			res.x = SubSat(res.x, Short4(0x0800));
+			res.y = SubSat(res.y, Short4(0x0800));
+			res.z = SubSat(res.z, Short4(0x0800));
+
+			res.x = AddSat(res.x, res.x);
+			res.y = AddSat(res.y, res.y);
+			res.z = AddSat(res.z, res.z);
+			break;
+		case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
+			res.x = SubSat(arg1->x, arg2->x);
+			res.y = SubSat(arg1->y, arg2->y);
+			res.z = SubSat(arg1->z, arg2->z);
+			break;
+		case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
+			{
+				Short4 tmp;
+
+				tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(arg1->x, arg2->x); res.x = SubSat(res.x, tmp);
+				tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(arg1->y, arg2->y); res.y = SubSat(res.y, tmp);
+				tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(arg1->z, arg2->z); res.z = SubSat(res.z, tmp);
+			}
+			break;
+		case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
+			res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg3->x);
+			res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg3->y);
+			res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg3->z);
+			break;
+		case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
+			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, arg3->x) << 4; res.x = AddSat(res.x, arg2->x);
+			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, arg3->y) << 4; res.y = AddSat(res.y, arg2->y);
+			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, arg3->z) << 4; res.z = AddSat(res.z, arg2->z);
+			break;
+		case TextureStage::STAGE_DOT3: // 2 * (Arg1.x - 0.5) * 2 * (Arg2.x - 0.5) + 2 * (Arg1.y - 0.5) * 2 * (Arg2.y - 0.5) + 2 * (Arg1.z - 0.5) * 2 * (Arg2.z - 0.5)
+			{
+				Short4 tmp;
+
+				res.x = SubSat(arg1->x, Short4(0x0800)); tmp = SubSat(arg2->x, Short4(0x0800)); res.x = MulHigh(res.x, tmp);
+				res.y = SubSat(arg1->y, Short4(0x0800)); tmp = SubSat(arg2->y, Short4(0x0800)); res.y = MulHigh(res.y, tmp);
+				res.z = SubSat(arg1->z, Short4(0x0800)); tmp = SubSat(arg2->z, Short4(0x0800)); res.z = MulHigh(res.z, tmp);
+
+				res.x = res.x << 6;
+				res.y = res.y << 6;
+				res.z = res.z << 6;
+
+				res.x = AddSat(res.x, res.y);
+				res.x = AddSat(res.x, res.z);
+
+				// Clamp to [0, 1]
+				res.x = Max(res.x, Short4(0x0000));
+				res.x = Min(res.x, Short4(0x1000));
+
+				res.y = res.x;
+				res.z = res.x;
+				res.w = res.x;
+			}
+			break;
+		case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, current.w) << 4; res.x = AddSat(res.x, arg2->x);
+			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, current.w) << 4; res.y = AddSat(res.y, arg2->y);
+			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, current.w) << 4; res.z = AddSat(res.z, arg2->z);
+			break;
+		case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, diffuse.w) << 4; res.x = AddSat(res.x, arg2->x);
+			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, diffuse.w) << 4; res.y = AddSat(res.y, arg2->y);
+			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, diffuse.w) << 4; res.z = AddSat(res.z, arg2->z);
+			break;
+		case TextureStage::STAGE_BLENDFACTORALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.x = AddSat(res.x, arg2->x);
+			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.y = AddSat(res.y, arg2->y);
+			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.z = AddSat(res.z, arg2->z);
+			break;
+		case TextureStage::STAGE_BLENDTEXTUREALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, texture.w) << 4; res.x = AddSat(res.x, arg2->x);
+			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, texture.w) << 4; res.y = AddSat(res.y, arg2->y);
+			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, texture.w) << 4; res.z = AddSat(res.z, arg2->z);
+			break;
+		case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
+			res.x = SubSat(Short4(0x1000), texture.w); res.x = MulHigh(res.x, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
+			res.y = SubSat(Short4(0x1000), texture.w); res.y = MulHigh(res.y, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
+			res.z = SubSat(Short4(0x1000), texture.w); res.z = MulHigh(res.z, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
+			break;
+		case TextureStage::STAGE_PREMODULATE:
+			res.x = arg1->x;
+			res.y = arg1->y;
+			res.z = arg1->z;
+			break;
+		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR: // Arg1 + Arg1.w * Arg2
+			res.x = MulHigh(arg1->w, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
+			res.y = MulHigh(arg1->w, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
+			res.z = MulHigh(arg1->w, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
+			break;
+		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA: // Arg1 * Arg2 + Arg1.w
+			res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg1->w);
+			res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg1->w);
+			res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg1->w);
+			break;
+		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR: // (1 - Arg1.w) * Arg2 + Arg1
+			{
+				Short4 tmp;
+
+				res.x = AddSat(arg1->x, arg2->x); tmp = MulHigh(arg1->w, arg2->x) << 4; res.x = SubSat(res.x, tmp);
+				res.y = AddSat(arg1->y, arg2->y); tmp = MulHigh(arg1->w, arg2->y) << 4; res.y = SubSat(res.y, tmp);
+				res.z = AddSat(arg1->z, arg2->z); tmp = MulHigh(arg1->w, arg2->z) << 4; res.z = SubSat(res.z, tmp);
+			}
+			break;
+		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA: // (1 - Arg1) * Arg2 + Arg1.w
+			{
+				Short4 tmp;
+
+				res.x = AddSat(arg1->w, arg2->x); tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = SubSat(res.x, tmp);
+				res.y = AddSat(arg1->w, arg2->y); tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = SubSat(res.y, tmp);
+				res.z = AddSat(arg1->w, arg2->z); tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = SubSat(res.z, tmp);
+			}
+			break;
+		case TextureStage::STAGE_BUMPENVMAP:
+			{
+				du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
+				dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
+
+				Float4 du2;
+				Float4 dv2;
+
+				du2 = du;
+				dv2 = dv;
+				du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
+				dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
+				du += dv2;
+				dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
+				du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
+				dv += du2;
+
+				perturbate = true;
+
+				res.x = current.x;
+				res.y = current.y;
+				res.z = current.z;
+				res.w = current.w;
+			}
+			break;
+		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+			{
+				du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
+				dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
+
+				Float4 du2;
+				Float4 dv2;
+
+				du2 = du;
+				dv2 = dv;
+
+				du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
+				dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
+				du += dv2;
+				dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
+				du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
+				dv += du2;
+
+				perturbate = true;
+
+				L = texture.z;
+				L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
+				L = L << 4;
+				L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
+				L = Max(L, Short4(0x0000));
+				L = Min(L, Short4(0x1000));
+
+				luminance = true;
+
+				res.x = current.x;
+				res.y = current.y;
+				res.z = current.z;
+				res.w = current.w;
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		if(textureStage.stageOperation != TextureStage::STAGE_DOT3)
+		{
+			switch(textureStage.firstArgumentAlpha)
+			{
+			case TextureStage::SOURCE_TEXTURE:	arg1 = &texture;		break;
+			case TextureStage::SOURCE_CONSTANT:	arg1 = &constant;		break;
+			case TextureStage::SOURCE_CURRENT:	arg1 = &current;		break;
+			case TextureStage::SOURCE_DIFFUSE:	arg1 = &diffuse;		break;
+			case TextureStage::SOURCE_SPECULAR:	arg1 = &specular;		break;
+			case TextureStage::SOURCE_TEMP:		arg1 = &temp;			break;
+			case TextureStage::SOURCE_TFACTOR:	arg1 = &tfactor;		break;
+			default:
+				ASSERT(false);
+			}
+
+			switch(textureStage.secondArgumentAlpha)
+			{
+			case TextureStage::SOURCE_TEXTURE:	arg2 = &texture;		break;
+			case TextureStage::SOURCE_CONSTANT:	arg2 = &constant;		break;
+			case TextureStage::SOURCE_CURRENT:	arg2 = &current;		break;
+			case TextureStage::SOURCE_DIFFUSE:	arg2 = &diffuse;		break;
+			case TextureStage::SOURCE_SPECULAR:	arg2 = &specular;		break;
+			case TextureStage::SOURCE_TEMP:		arg2 = &temp;			break;
+			case TextureStage::SOURCE_TFACTOR:	arg2 = &tfactor;		break;
+			default:
+				ASSERT(false);
+			}
+
+			switch(textureStage.thirdArgumentAlpha)
+			{
+			case TextureStage::SOURCE_TEXTURE:	arg3 = &texture;		break;
+			case TextureStage::SOURCE_CONSTANT:	arg3 = &constant;		break;
+			case TextureStage::SOURCE_CURRENT:	arg3 = &current;		break;
+			case TextureStage::SOURCE_DIFFUSE:	arg3 = &diffuse;		break;
+			case TextureStage::SOURCE_SPECULAR:	arg3 = &specular;		break;
+			case TextureStage::SOURCE_TEMP:		arg3 = &temp;			break;
+			case TextureStage::SOURCE_TFACTOR:	arg3 = &tfactor;		break;
+			default:
+				ASSERT(false);
+			}
+
+			switch(textureStage.firstModifierAlpha)   // FIXME: Check if actually used
+			{
+			case TextureStage::MODIFIER_COLOR:
+				break;
+			case TextureStage::MODIFIER_INVCOLOR:
+				mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+				arg1 = &mod1;
+				break;
+			case TextureStage::MODIFIER_ALPHA:
+				// Redudant
+				break;
+			case TextureStage::MODIFIER_INVALPHA:
+				mod1.w = SubSat(Short4(0x1000), arg1->w);
+
+				arg1 = &mod1;
+				break;
+			default:
+				ASSERT(false);
+			}
+
+			switch(textureStage.secondModifierAlpha)   // FIXME: Check if actually used
+			{
+			case TextureStage::MODIFIER_COLOR:
+				break;
+			case TextureStage::MODIFIER_INVCOLOR:
+				mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+				arg2 = &mod2;
+				break;
+			case TextureStage::MODIFIER_ALPHA:
+				// Redudant
+				break;
+			case TextureStage::MODIFIER_INVALPHA:
+				mod2.w = SubSat(Short4(0x1000), arg2->w);
+
+				arg2 = &mod2;
+				break;
+			default:
+				ASSERT(false);
+			}
+
+			switch(textureStage.thirdModifierAlpha)   // FIXME: Check if actually used
+			{
+			case TextureStage::MODIFIER_COLOR:
+				break;
+			case TextureStage::MODIFIER_INVCOLOR:
+				mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+				arg3 = &mod3;
+				break;
+			case TextureStage::MODIFIER_ALPHA:
+				// Redudant
+				break;
+			case TextureStage::MODIFIER_INVALPHA:
+				mod3.w = SubSat(Short4(0x1000), arg3->w);
+
+				arg3 = &mod3;
+				break;
+			default:
+				ASSERT(false);
+			}
+
+			switch(textureStage.stageOperationAlpha)
+			{
+			case TextureStage::STAGE_DISABLE:
+				break;
+			case TextureStage::STAGE_SELECTARG1: // Arg1
+				res.w = arg1->w;
+				break;
+			case TextureStage::STAGE_SELECTARG2: // Arg2
+				res.w = arg2->w;
+				break;
+			case TextureStage::STAGE_SELECTARG3: // Arg3
+				res.w = arg3->w;
+				break;
+			case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
+				res.w = MulHigh(arg1->w, arg2->w) << 4;
+				break;
+			case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
+				res.w = MulHigh(arg1->w, arg2->w) << 5;
+				break;
+			case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
+				res.w = MulHigh(arg1->w, arg2->w) << 6;
+				break;
+			case TextureStage::STAGE_ADD: // Arg1 + Arg2
+				res.w = AddSat(arg1->w, arg2->w);
+				break;
+			case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
+				res.w = AddSat(arg1->w, arg2->w);
+				res.w = SubSat(res.w, Short4(0x0800));
+				break;
+			case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
+				res.w = AddSat(arg1->w, arg2->w);
+				res.w = SubSat(res.w, Short4(0x0800));
+				res.w = AddSat(res.w, res.w);
+				break;
+			case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
+				res.w = SubSat(arg1->w, arg2->w);
+				break;
+			case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
+				{
+					Short4 tmp;
+
+					tmp = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(arg1->w, arg2->w); res.w = SubSat(res.w, tmp);
+				}
+				break;
+			case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
+				res.w = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(res.w, arg3->w);
+				break;
+			case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
+				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, arg3->w) << 4; res.w = AddSat(res.w, arg2->w);
+				break;
+			case TextureStage::STAGE_DOT3:
+				break;   // Already computed in color channel
+			case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
+				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, current.w) << 4; res.w = AddSat(res.w, arg2->w);
+				break;
+			case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
+				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, diffuse.w) << 4; res.w = AddSat(res.w, arg2->w);
+				break;
+			case TextureStage::STAGE_BLENDFACTORALPHA:
+				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.w = AddSat(res.w, arg2->w);
+				break;
+			case TextureStage::STAGE_BLENDTEXTUREALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
+				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, texture.w) << 4; res.w = AddSat(res.w, arg2->w);
+				break;
+			case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
+				res.w = SubSat(Short4(0x1000), texture.w); res.w = MulHigh(res.w, arg2->w) << 4; res.w = AddSat(res.w, arg1->w);
+				break;
+			case TextureStage::STAGE_PREMODULATE:
+				res.w = arg1->w;
+				break;
+			case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+			case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+			case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+			case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+			case TextureStage::STAGE_BUMPENVMAP:
+			case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+				break;   // Invalid alpha operations
+			default:
+				ASSERT(false);
+			}
+		}
+
+		// Clamp result to [0, 1]
+
+		switch(textureStage.stageOperation)
+		{
+		case TextureStage::STAGE_DISABLE:
+		case TextureStage::STAGE_SELECTARG1:
+		case TextureStage::STAGE_SELECTARG2:
+		case TextureStage::STAGE_SELECTARG3:
+		case TextureStage::STAGE_MODULATE:
+		case TextureStage::STAGE_MODULATE2X:
+		case TextureStage::STAGE_MODULATE4X:
+		case TextureStage::STAGE_ADD:
+		case TextureStage::STAGE_MULTIPLYADD:
+		case TextureStage::STAGE_LERP:
+		case TextureStage::STAGE_BLENDCURRENTALPHA:
+		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+		case TextureStage::STAGE_BLENDFACTORALPHA:
+		case TextureStage::STAGE_BLENDTEXTUREALPHA:
+		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+		case TextureStage::STAGE_DOT3:   // Already clamped
+		case TextureStage::STAGE_PREMODULATE:
+		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+		case TextureStage::STAGE_BUMPENVMAP:
+		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+			if(state.textureStage[stage].cantUnderflow)
+			{
+				break;   // Can't go below zero
+			}
+		case TextureStage::STAGE_ADDSIGNED:
+		case TextureStage::STAGE_ADDSIGNED2X:
+		case TextureStage::STAGE_SUBTRACT:
+		case TextureStage::STAGE_ADDSMOOTH:
+			res.x = Max(res.x, Short4(0x0000));
+			res.y = Max(res.y, Short4(0x0000));
+			res.z = Max(res.z, Short4(0x0000));
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.stageOperationAlpha)
+		{
+		case TextureStage::STAGE_DISABLE:
+		case TextureStage::STAGE_SELECTARG1:
+		case TextureStage::STAGE_SELECTARG2:
+		case TextureStage::STAGE_SELECTARG3:
+		case TextureStage::STAGE_MODULATE:
+		case TextureStage::STAGE_MODULATE2X:
+		case TextureStage::STAGE_MODULATE4X:
+		case TextureStage::STAGE_ADD:
+		case TextureStage::STAGE_MULTIPLYADD:
+		case TextureStage::STAGE_LERP:
+		case TextureStage::STAGE_BLENDCURRENTALPHA:
+		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+		case TextureStage::STAGE_BLENDFACTORALPHA:
+		case TextureStage::STAGE_BLENDTEXTUREALPHA:
+		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+		case TextureStage::STAGE_DOT3:   // Already clamped
+		case TextureStage::STAGE_PREMODULATE:
+		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+		case TextureStage::STAGE_BUMPENVMAP:
+		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+			if(state.textureStage[stage].cantUnderflow)
+			{
+				break;   // Can't go below zero
+			}
+		case TextureStage::STAGE_ADDSIGNED:
+		case TextureStage::STAGE_ADDSIGNED2X:
+		case TextureStage::STAGE_SUBTRACT:
+		case TextureStage::STAGE_ADDSMOOTH:
+			res.w = Max(res.w, Short4(0x0000));
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.stageOperation)
+		{
+		case TextureStage::STAGE_DISABLE:
+		case TextureStage::STAGE_SELECTARG1:
+		case TextureStage::STAGE_SELECTARG2:
+		case TextureStage::STAGE_SELECTARG3:
+		case TextureStage::STAGE_MODULATE:
+		case TextureStage::STAGE_SUBTRACT:
+		case TextureStage::STAGE_ADDSMOOTH:
+		case TextureStage::STAGE_LERP:
+		case TextureStage::STAGE_BLENDCURRENTALPHA:
+		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+		case TextureStage::STAGE_BLENDFACTORALPHA:
+		case TextureStage::STAGE_BLENDTEXTUREALPHA:
+		case TextureStage::STAGE_DOT3:   // Already clamped
+		case TextureStage::STAGE_PREMODULATE:
+		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+		case TextureStage::STAGE_BUMPENVMAP:
+		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+			break;   // Can't go above one
+		case TextureStage::STAGE_MODULATE2X:
+		case TextureStage::STAGE_MODULATE4X:
+		case TextureStage::STAGE_ADD:
+		case TextureStage::STAGE_ADDSIGNED:
+		case TextureStage::STAGE_ADDSIGNED2X:
+		case TextureStage::STAGE_MULTIPLYADD:
+		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+			res.x = Min(res.x, Short4(0x1000));
+			res.y = Min(res.y, Short4(0x1000));
+			res.z = Min(res.z, Short4(0x1000));
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.stageOperationAlpha)
+		{
+		case TextureStage::STAGE_DISABLE:
+		case TextureStage::STAGE_SELECTARG1:
+		case TextureStage::STAGE_SELECTARG2:
+		case TextureStage::STAGE_SELECTARG3:
+		case TextureStage::STAGE_MODULATE:
+		case TextureStage::STAGE_SUBTRACT:
+		case TextureStage::STAGE_ADDSMOOTH:
+		case TextureStage::STAGE_LERP:
+		case TextureStage::STAGE_BLENDCURRENTALPHA:
+		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
+		case TextureStage::STAGE_BLENDFACTORALPHA:
+		case TextureStage::STAGE_BLENDTEXTUREALPHA:
+		case TextureStage::STAGE_DOT3:   // Already clamped
+		case TextureStage::STAGE_PREMODULATE:
+		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
+		case TextureStage::STAGE_BUMPENVMAP:
+		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
+			break;   // Can't go above one
+		case TextureStage::STAGE_MODULATE2X:
+		case TextureStage::STAGE_MODULATE4X:
+		case TextureStage::STAGE_ADD:
+		case TextureStage::STAGE_ADDSIGNED:
+		case TextureStage::STAGE_ADDSIGNED2X:
+		case TextureStage::STAGE_MULTIPLYADD:
+		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
+		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
+		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
+			res.w = Min(res.w, Short4(0x1000));
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		switch(textureStage.destinationArgument)
+		{
+		case TextureStage::DESTINATION_CURRENT:
+			current.x = res.x;
+			current.y = res.y;
+			current.z = res.z;
+			current.w = res.w;
+			break;
+		case TextureStage::DESTINATION_TEMP:
+			temp.x = res.x;
+			temp.y = res.y;
+			temp.z = res.z;
+			temp.w = res.w;
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Blends the fog color into the red, green and blue channels of 'current';
+	// alpha is not modified. Per channel the result is
+	//   MulHigh(current, fog) + MulHigh(0xFFFF - fog, fogColor)
+	// where 'fog' is 'f' converted by convertFixed16(). No-op when fog is inactive.
+	void PixelPipeline::fogBlend(Vector4s &current, Float4 &f)
+	{
+		if(state.fogActive)
+		{
+			if(state.pixelFogMode != FOG_NONE)
+			{
+				pixelFog(f);
+			}
+
+			UShort4 factor = convertFixed16(f, true);
+			UShort4 complement = UShort4(0xFFFFu) - factor;
+
+			for(int channel = 0; channel < 3; channel++)
+			{
+				// Attenuate the pixel color, then add the complementary share of the fog color.
+				current[channel] = As<Short4>(MulHigh(As<UShort4>(current[channel]), factor));
+				current[channel] += As<Short4>(MulHigh(complement, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[channel]))));
+			}
+		}
+	}
+
+	// Adds the specular color to the RGB channels of 'current' with saturation.
+	// Alpha is not modified. No-op unless state.specularAdd is set.
+	void PixelPipeline::specularPixel(Vector4s &current, Vector4s &specular)
+	{
+		if(state.specularAdd)
+		{
+			for(int channel = 0; channel < 3; channel++)
+			{
+				current[channel] = AddSat(current[channel], specular[channel]);
+			}
+		}
+	}
+
+	// Samples texture 'stage' using interpolant v[2 + coordinates] as coordinates.
+	// If a perturbation is pending ('perturbate' set), du/dv are added to the
+	// first two coordinates and the flag is cleared.
+	Vector4s PixelPipeline::sampleTexture(int coordinates, int stage, bool project)
+	{
+		const int interpolant = 2 + coordinates;
+
+		Float4 s = v[interpolant].x;
+		Float4 t = v[interpolant].y;
+		Float4 r = v[interpolant].z;
+		Float4 q = v[interpolant].w;
+
+		if(perturbate)
+		{
+			perturbate = false;   // The offset applies to a single lookup only
+
+			s += du;
+			t += dv;
+		}
+
+		return sampleTexture(stage, s, t, r, q, project);
+	}
+
+	// Samples the texture bound to 'stage' at (u, v, w). When 'project' is set
+	// the three coordinates are first multiplied by reciprocal(q). 'q' is passed
+	// to the sampler twice in both cases; the gradient vectors are left
+	// default-constructed.
+	Vector4s PixelPipeline::sampleTexture(int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project)
+	{
+		Vector4s texel;
+
+		#if PERF_PROFILE
+			Long texTime = Ticks();
+		#endif
+
+		Vector4f dsx;
+		Vector4f dsy;
+
+		Pointer<Byte> texture = data + OFFSET(DrawData, mipmap) + stage * sizeof(Texture);
+		SamplerCore sampler(constants, state.sampler[stage]);
+
+		if(project)
+		{
+			Float4 rq = reciprocal(q);
+
+			Float4 u_q = u * rq;
+			Float4 v_q = v * rq;
+			Float4 w_q = w * rq;
+
+			texel = sampler.sampleTexture(texture, u_q, v_q, w_q, q, q, dsx, dsy);
+		}
+		else
+		{
+			texel = sampler.sampleTexture(texture, u, v, w, q, q, dsx, dsy);
+		}
+
+		#if PERF_PROFILE
+			cycles[PERF_TEX] += Ticks() - texTime;
+		#endif
+
+		return texel;
+	}
+
+	// Converts floating-point values to 4.12 fixed-point (1.0 maps to 0x1000),
+	// rounding via RoundShort4(). The input is not clamped here.
+	Short4 PixelPipeline::convertFixed12(RValue<Float4> cf)
+	{
+		Float4 scaled = cf * Float4(0x1000);
+
+		return RoundShort4(scaled);
+	}
+
+	// Converts all four components of 'cf' to 4.12 fixed-point and stores them in 'cs'.
+	void PixelPipeline::convertFixed12(Vector4s &cs, Vector4f &cf)
+	{
+		// Same arithmetic as the single-component overload, written out per component.
+		cs.x = RoundShort4(cf.x * Float4(0x1000));
+		cs.y = RoundShort4(cf.y * Float4(0x1000));
+		cs.z = RoundShort4(cf.z * Float4(0x1000));
+		cs.w = RoundShort4(cf.w * Float4(0x1000));
+	}
+
+	// Converts fixed-point values to floating-point by scaling with 1 / 0x0FFE.
+	Float4 PixelPipeline::convertSigned12(Short4 &cs)
+	{
+		Float4 value = Float4(cs);
+
+		return value * Float4(1.0f / 0x0FFE);
+	}
+
+	// Converts all four fixed-point components of 'cs' to floating-point in 'cf'.
+	void PixelPipeline::convertSigned12(Vector4f &cf, Vector4s &cs)
+	{
+		// Same arithmetic as the single-component overload, written out per component.
+		cf.x = Float4(cs.x) * Float4(1.0f / 0x0FFE);
+		cf.y = Float4(cs.y) * Float4(1.0f / 0x0FFE);
+		cf.z = Float4(cs.z) * Float4(1.0f / 0x0FFE);
+		cf.w = Float4(cs.w) * Float4(1.0f / 0x0FFE);
+	}
+
+	// Stores 'd' into the register addressed by 'dst', honoring the write mask
+	// (bit 0 = x, bit 1 = y, bit 2 = z, bit 3 = w). Input and color output
+	// registers both map onto the vs[] array. Constants and unknown register
+	// types are not writable and trigger an assert.
+	void PixelPipeline::writeDestination(Vector4s &d, const Dst &dst)
+	{
+		Vector4s *reg;
+
+		switch(dst.type)
+		{
+		case Shader::PARAMETER_TEMP:
+			reg = &rs[dst.index];
+			break;
+		case Shader::PARAMETER_INPUT:
+		case Shader::PARAMETER_COLOROUT:
+			reg = &vs[dst.index];
+			break;
+		case Shader::PARAMETER_TEXTURE:
+			reg = &ts[dst.index];
+			break;
+		case Shader::PARAMETER_CONST:
+		default:
+			ASSERT(false);
+			return;
+		}
+
+		for(int component = 0; component < 4; component++)
+		{
+			if(dst.mask & (1 << component))
+			{
+				(*reg)[component] = d[component];
+			}
+		}
+	}
+
+	// Reads the source operand 'src' and returns it with its swizzle and source
+	// modifier applied. Constants are loaded from DrawData::ps.cW; void and
+	// float literal operands (and unknown types) return rs[0] as a dummy.
+	Vector4s PixelPipeline::fetchRegister(const Src &src)
+	{
+		Vector4s *reg;
+		int i = src.index;
+
+		Vector4s c;
+
+		// Constants are not kept in a register array; load them into a local first.
+		if(src.type == Shader::PARAMETER_CONST)
+		{
+			c.x = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][0]));
+			c.y = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][1]));
+			c.z = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][2]));
+			c.w = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][3]));
+		}
+
+		switch(src.type)
+		{
+		case Shader::PARAMETER_TEMP:          reg = &rs[i]; break;
+		case Shader::PARAMETER_INPUT:         reg = &vs[i]; break;
+		case Shader::PARAMETER_CONST:         reg = &c;       break;
+		case Shader::PARAMETER_TEXTURE:       reg = &ts[i]; break;
+		case Shader::PARAMETER_VOID:          return rs[0]; // Dummy
+		case Shader::PARAMETER_FLOAT4LITERAL: return rs[0]; // Dummy
+		default: ASSERT(false); return rs[0];
+		}
+
+		// Swizzle: two bits per destination component select the source component.
+		const Short4 &x = (*reg)[(src.swizzle >> 0) & 0x3];
+		const Short4 &y = (*reg)[(src.swizzle >> 2) & 0x3];
+		const Short4 &z = (*reg)[(src.swizzle >> 4) & 0x3];
+		const Short4 &w = (*reg)[(src.swizzle >> 6) & 0x3];
+
+		Vector4s mod;
+
+		// In the 4.12 fixed-point format used here 0x1000 is 1.0 and 0x0800 is 0.5.
+		switch(src.modifier)
+		{
+		case Shader::MODIFIER_NONE:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
+			break;
+		case Shader::MODIFIER_BIAS:   // x - 0.5
+			mod.x = SubSat(x, Short4(0x0800));
+			mod.y = SubSat(y, Short4(0x0800));
+			mod.z = SubSat(z, Short4(0x0800));
+			mod.w = SubSat(w, Short4(0x0800));
+			break;
+		case Shader::MODIFIER_BIAS_NEGATE:   // 0.5 - x
+			mod.x = SubSat(Short4(0x0800), x);
+			mod.y = SubSat(Short4(0x0800), y);
+			mod.z = SubSat(Short4(0x0800), z);
+			mod.w = SubSat(Short4(0x0800), w);
+			break;
+		case Shader::MODIFIER_COMPLEMENT:   // 1 - x
+			mod.x = SubSat(Short4(0x1000), x);
+			mod.y = SubSat(Short4(0x1000), y);
+			mod.z = SubSat(Short4(0x1000), z);
+			mod.w = SubSat(Short4(0x1000), w);
+			break;
+		case Shader::MODIFIER_NEGATE:   // -x
+			mod.x = -x;
+			mod.y = -y;
+			mod.z = -z;
+			mod.w = -w;
+			break;
+		case Shader::MODIFIER_X2:   // 2 * x, saturating
+			mod.x = AddSat(x, x);
+			mod.y = AddSat(y, y);
+			mod.z = AddSat(z, z);
+			mod.w = AddSat(w, w);
+			break;
+		case Shader::MODIFIER_X2_NEGATE:   // -(2 * x)
+			mod.x = -AddSat(x, x);
+			mod.y = -AddSat(y, y);
+			mod.z = -AddSat(z, z);
+			mod.w = -AddSat(w, w);
+			break;
+		case Shader::MODIFIER_SIGN:   // 2 * (x - 0.5)
+			mod.x = SubSat(x, Short4(0x0800));
+			mod.y = SubSat(y, Short4(0x0800));
+			mod.z = SubSat(z, Short4(0x0800));
+			mod.w = SubSat(w, Short4(0x0800));
+			mod.x = AddSat(mod.x, mod.x);
+			mod.y = AddSat(mod.y, mod.y);
+			mod.z = AddSat(mod.z, mod.z);
+			mod.w = AddSat(mod.w, mod.w);
+			break;
+		case Shader::MODIFIER_SIGN_NEGATE:   // 2 * (0.5 - x)
+			mod.x = SubSat(Short4(0x0800), x);
+			mod.y = SubSat(Short4(0x0800), y);
+			mod.z = SubSat(Short4(0x0800), z);
+			mod.w = SubSat(Short4(0x0800), w);
+			mod.x = AddSat(mod.x, mod.x);
+			mod.y = AddSat(mod.y, mod.y);
+			mod.z = AddSat(mod.z, mod.z);
+			mod.w = AddSat(mod.w, mod.w);
+			break;
+		case Shader::MODIFIER_DZ:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
+			// Projection performed by texture sampler
+			break;
+		case Shader::MODIFIER_DW:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
+			// Projection performed by texture sampler
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// Doubled constants are additionally clamped to [-1, 1].
+		if(src.type == Shader::PARAMETER_CONST && (src.modifier == Shader::MODIFIER_X2 || src.modifier == Shader::MODIFIER_X2_NEGATE))
+		{
+			mod.x = Min(mod.x, Short4(0x1000)); mod.x = Max(mod.x, Short4(-0x1000));
+			mod.y = Min(mod.y, Short4(0x1000)); mod.y = Max(mod.y, Short4(-0x1000));
+			mod.z = Min(mod.z, Short4(0x1000)); mod.z = Max(mod.z, Short4(-0x1000));
+			mod.w = Min(mod.w, Short4(0x1000)); mod.w = Max(mod.w, Short4(-0x1000));
+		}
+
+		return mod;
+	}
+
+	// dst = src0, copied component by component.
+	void PixelPipeline::MOV(Vector4s &dst, Vector4s &src0)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = src0[component];
+		}
+	}
+
+	// dst = src0 + src1, per component with saturation.
+	void PixelPipeline::ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = AddSat(src0[component], src1[component]);
+		}
+	}
+
+	// dst = src0 - src1, per component with saturation.
+	void PixelPipeline::SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = SubSat(src0[component], src1[component]);
+		}
+	}
+
+	// dst = src0 * src1 + src2, per component in 4.12 fixed-point.
+	// The MulHigh() result is doubled four times with saturation (a saturating
+	// multiply by 16) before src2 is added.
+	void PixelPipeline::MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+	{
+		// FIXME: Long fixed-point multiply fixup
+		for(int component = 0; component < 4; component++)
+		{
+			Short4 &result = dst[component];
+
+			result = MulHigh(src0[component], src1[component]);
+
+			for(int doubling = 0; doubling < 4; doubling++)
+			{
+				result = AddSat(result, result);
+			}
+
+			result = AddSat(result, src2[component]);
+		}
+	}
+
+	// dst = src0 * src1, per component in 4.12 fixed-point.
+	// The MulHigh() result is doubled four times with saturation (a saturating
+	// multiply by 16).
+	void PixelPipeline::MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+	{
+		// FIXME: Long fixed-point multiply fixup
+		for(int component = 0; component < 4; component++)
+		{
+			Short4 &result = dst[component];
+
+			result = MulHigh(src0[component], src1[component]);
+
+			for(int doubling = 0; doubling < 4; doubling++)
+			{
+				result = AddSat(result, result);
+			}
+		}
+	}
+
+	// Three-component dot product of src0 and src1 in 4.12 fixed-point,
+	// accumulated with saturation and replicated to all four components of dst.
+	void PixelPipeline::DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+	{
+		Short4 sum;
+
+		// FIXME: Long fixed-point multiply fixup
+		for(int component = 0; component < 3; component++)
+		{
+			Short4 product = MulHigh(src0[component], src1[component]);
+
+			for(int doubling = 0; doubling < 4; doubling++)
+			{
+				product = AddSat(product, product);
+			}
+
+			if(component == 0)
+			{
+				sum = product;
+			}
+			else
+			{
+				sum = AddSat(sum, product);
+			}
+		}
+
+		// dst is only written once all products have been read from the sources.
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = sum;
+		}
+	}
+
+	// Four-component dot product of src0 and src1 in 4.12 fixed-point,
+	// accumulated with saturation and replicated to all four components of dst.
+	void PixelPipeline::DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1)
+	{
+		Short4 sum;
+
+		// FIXME: Long fixed-point multiply fixup
+		for(int component = 0; component < 4; component++)
+		{
+			Short4 product = MulHigh(src0[component], src1[component]);
+
+			for(int doubling = 0; doubling < 4; doubling++)
+			{
+				product = AddSat(product, product);
+			}
+
+			if(component == 0)
+			{
+				sum = product;
+			}
+			else
+			{
+				sum = AddSat(sum, product);
+			}
+		}
+
+		// dst is only written once all products have been read from the sources.
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = sum;
+		}
+	}
+
+	// Linear interpolation per component: dst = src0 * (src1 - src2) + src2,
+	// in 4.12 fixed-point with saturating arithmetic.
+	void PixelPipeline::LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+	{
+		// FIXME: Long fixed-point multiply fixup
+		for(int component = 0; component < 4; component++)
+		{
+			Short4 &result = dst[component];
+
+			result = SubSat(src1[component], src2[component]);
+			result = MulHigh(result, src0[component]);
+
+			for(int doubling = 0; doubling < 4; doubling++)
+			{
+				result = AddSat(result, result);
+			}
+
+			result = AddSat(result, src2[component]);
+		}
+	}
+
+	// Writes the texture coordinates to dst.xyz as 4.12 fixed-point values after
+	// clamping them to [0, 1]. A component that the interpolant does not provide
+	// (its bit in 'component' is clear) is set to zero. dst.w is always 1.0 (0x1000).
+	void PixelPipeline::TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
+	{
+		Float4 *source[3] = {&u, &v, &s};
+
+		for(int component = 0; component < 3; component++)
+		{
+			if(state.interpolant[2 + coordinate].component & (1 << component))
+			{
+				Float4 clamped;
+
+				clamped = Max(*source[component], Float4(0.0f));
+				clamped = Min(clamped, Float4(1.0f));
+				dst[component] = convertFixed12(clamped);
+			}
+			else
+			{
+				dst[component] = Short4(0x0000);
+			}
+		}
+
+		dst.w = Short4(0x1000);
+	}
+
+	// Writes the texture coordinates to dst.xyz as 4.12 fixed-point values: each
+	// is scaled by 0x1000 and clamped to the signed 16-bit range before rounding.
+	// A component that the interpolant does not provide (its bit in 'component'
+	// is clear) is set to zero. dst.w is not written. When 'project' is set,
+	// u and v are first multiplied by the approximate reciprocal of s; the s
+	// coordinate itself is left unprojected.
+	void PixelPipeline::TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
+	{
+		Float4 uw = u;
+		Float4 vw = v;
+		Float4 sw = s;
+
+		if(project)
+		{
+			// Compute the reciprocal once and reuse it for both coordinates.
+			Float4 rcp = Rcp_pp(s);
+
+			uw *= rcp;
+			vw *= rcp;
+		}
+
+		if(state.interpolant[2 + coordinate].component & 0x01)
+		{
+			uw *= Float4(0x1000);
+			uw = Max(uw, Float4(-0x8000));
+			uw = Min(uw, Float4(0x7FFF));
+			dst.x = RoundShort4(uw);
+		}
+		else
+		{
+			dst.x = Short4(0x0000);
+		}
+
+		if(state.interpolant[2 + coordinate].component & 0x02)
+		{
+			vw *= Float4(0x1000);
+			vw = Max(vw, Float4(-0x8000));
+			vw = Min(vw, Float4(0x7FFF));
+			dst.y = RoundShort4(vw);
+		}
+		else
+		{
+			dst.y = Short4(0x0000);
+		}
+
+		if(state.interpolant[2 + coordinate].component & 0x04)
+		{
+			sw *= Float4(0x1000);
+			sw = Max(sw, Float4(-0x8000));
+			sw = Min(sw, Float4(0x7FFF));
+			dst.z = RoundShort4(sw);
+		}
+		else
+		{
+			dst.z = Short4(0x0000);
+		}
+	}
+
+	// Computes the dot product of (u, v, s) with src.xyz through TEXM3X3PAD
+	// (component 0), converts it to 4.12 fixed-point and replicates it to all
+	// four components of dst.
+	void PixelPipeline::TEXDP3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src)
+	{
+		TEXM3X3PAD(u, v, s, src, 0, false);
+
+		Short4 dot = RoundShort4(u_ * Float4(0x1000));
+
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = dot;
+		}
+	}
+
+	// Computes the dot product of (u, v, s) with src0.xyz through TEXM3X3PAD
+	// (component 0, result in u_) and uses it as the first coordinate of a
+	// lookup in texture 'stage'; the remaining coordinates are zero.
+	void PixelPipeline::TEXDP3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0)
+	{
+		TEXM3X3PAD(u, v, s, src0, 0, false);
+
+		// Only u_ carries data; clear the other member temporaries before sampling.
+		v_ = Float4(0.0f);
+		w_ = Float4(0.0f);
+
+		dst = sampleTexture(stage, u_, v_, w_, w_);
+	}
+
+	// Clears the coverage mask bits of pixels for which any of u, v or s fails
+	// the CmpNLT(coordinate, 0) test. Applied to every multisample mask.
+	void PixelPipeline::TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s)
+	{
+		// A pixel survives only if all three coordinates pass the test.
+		Int keep = SignMask(CmpNLT(u, Float4(0.0f)));
+		keep &= SignMask(CmpNLT(v, Float4(0.0f)));
+		keep &= SignMask(CmpNLT(s, Float4(0.0f)));
+
+		for(unsigned int sample = 0; sample < state.multiSample; sample++)
+		{
+			cMask[sample] &= keep;
+		}
+	}
+
+	// Clears the coverage mask bits of pixels for which any of src.x, src.y or
+	// src.z has its sign bit set. Applied to every multisample mask.
+	void PixelPipeline::TEXKILL(Int cMask[4], Vector4s &src)
+	{
+		// OR-ing the components merges their sign bits.
+		Short4 signs = src.x | src.y | src.z;
+
+		// Invert the low four bits so that a set bit means "keep this pixel".
+		Int keep = SignMask(PackSigned(signs, signs)) ^ 0x0000000F;
+
+		for(unsigned int sample = 0; sample < state.multiSample; sample++)
+		{
+			cMask[sample] &= keep;
+		}
+	}
+
+	// Samples texture 'sampler' at (u, v, s) into dst. 's' is also passed as the
+	// fourth coordinate, so it is the divisor when 'project' is set.
+	void PixelPipeline::TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
+	{
+		dst = sampleTexture(sampler, u, v, s, s, project);
+	}
+
+	// Samples texture 'sampler' using the fixed-point register src.xyz as
+	// coordinates, after converting them to floating-point (scale 1 / 0x0FFE).
+	// The third coordinate doubles as the projection divisor.
+	void PixelPipeline::TEXLD(Vector4s &dst, Vector4s &src, int sampler, bool project)
+	{
+		Float4 coordU = Float4(src.x) * Float4(1.0f / 0x0FFE);
+		Float4 coordV = Float4(src.y) * Float4(1.0f / 0x0FFE);
+		Float4 coordS = Float4(src.z) * Float4(1.0f / 0x0FFE);
+
+		dst = sampleTexture(sampler, coordU, coordV, coordS, coordS, project);
+	}
+
+	// Bump environment mapping: perturbs (u, v) by src.xy transformed with the
+	// stage's 2x2 bump map matrix, then samples texture 'stage':
+	//   u' = u + src.x * M[0][0] + src.y * M[1][0]
+	//   v' = v + src.y * M[1][1] + src.x * M[0][1]
+	void PixelPipeline::TEXBEM(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+	{
+		// Bump map sample converted from fixed-point to floating-point
+		Float4 bumpX = Float4(src.x) * Float4(1.0f / 0x0FFE);
+		Float4 bumpY = Float4(src.y) * Float4(1.0f / 0x0FFE);
+
+		Float4 offsetU = bumpX * *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
+		Float4 crossU = bumpY * *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
+		offsetU += crossU;
+
+		Float4 offsetV = bumpY * *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
+		Float4 crossV = bumpX * *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
+		offsetV += crossV;
+
+		Float4 perturbedU = u + offsetU;
+		Float4 perturbedV = v + offsetV;
+
+		dst = sampleTexture(stage, perturbedU, perturbedV, s, s);
+	}
+
+	// Bump environment mapping with luminance: samples texture 'stage' with the
+	// perturbed coordinates exactly like TEXBEM, then scales the sampled RGB by
+	//   L = clamp(src.z * luminanceScale + luminanceOffset, 0, 1)
+	// in 4.12 fixed-point. Alpha is not scaled.
+	void PixelPipeline::TEXBEML(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+	{
+		// The coordinate perturbation and lookup are shared with TEXBEM.
+		TEXBEM(dst, src, u, v, s, stage);
+
+		Short4 L;
+
+		// src.z is read after dst has been written, as in the lookup-then-scale order above.
+		L = src.z;
+		L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
+		L = L << 4;
+		L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
+		L = Max(L, Short4(0x0000));
+		L = Min(L, Short4(0x1000));
+
+		dst.x = MulHigh(dst.x, L); dst.x = dst.x << 4;
+		dst.y = MulHigh(dst.y, L); dst.y = dst.y << 4;
+		dst.z = MulHigh(dst.z, L); dst.z = dst.z << 4;
+	}
+
+	// Samples texture 'stage' using the alpha and red components of src0 as the
+	// (u, v) coordinates; src0.z provides the third and fourth coordinates.
+	void PixelPipeline::TEXREG2AR(Vector4s &dst, Vector4s &src0, int stage)
+	{
+		Float4 alpha = Float4(src0.w) * Float4(1.0f / 0x0FFE);
+		Float4 red = Float4(src0.x) * Float4(1.0f / 0x0FFE);
+		Float4 blue = Float4(src0.z) * Float4(1.0f / 0x0FFE);
+
+		dst = sampleTexture(stage, alpha, red, blue, blue);
+	}
+
+	// Samples texture 'stage' using the green and blue components of src0 as the
+	// (u, v) coordinates; a copy of v provides the third and fourth coordinates.
+	void PixelPipeline::TEXREG2GB(Vector4s &dst, Vector4s &src0, int stage)
+	{
+		Float4 green = Float4(src0.y) * Float4(1.0f / 0x0FFE);
+		Float4 blue = Float4(src0.z) * Float4(1.0f / 0x0FFE);
+		Float4 third = blue;
+
+		dst = sampleTexture(stage, green, blue, third, third);
+	}
+
+	// Samples texture 'stage' using the red, green and blue components of src0
+	// as the (u, v, s) coordinates; blue is also passed as the fourth coordinate.
+	void PixelPipeline::TEXREG2RGB(Vector4s &dst, Vector4s &src0, int stage)
+	{
+		Float4 red = Float4(src0.x) * Float4(1.0f / 0x0FFE);
+		Float4 green = Float4(src0.y) * Float4(1.0f / 0x0FFE);
+		Float4 blue = Float4(src0.z) * Float4(1.0f / 0x0FFE);
+
+		dst = sampleTexture(stage, red, green, blue, blue);
+	}
+
+	// Completes the second matrix row (component 1, result in v_) and writes
+	// u_ / v_ to the output depth, using an approximate reciprocal. 'dst' is
+	// not written.
+	void PixelPipeline::TEXM3X2DEPTH(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling)
+	{
+		TEXM3X2PAD(u, v, s, src, 1, signedScaling);
+
+		// z / w
+		// FIXME: Set result to 1.0 when division by zero
+		Float4 rcpW = Rcp_pp(v_);
+		u_ *= rcpW;
+
+		oDepth = u_;
+	}
+
+	// Computes one row of a 3x2 matrix multiply. The row computation is the same
+	// as for the 3x3 case, so this simply forwards to TEXM3X3PAD.
+	void PixelPipeline::TEXM3X2PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
+	{
+		TEXM3X3PAD(u, v, s, src0, component, signedScaling);
+	}
+
+	// Completes the second matrix row (component 1, result in v_) and samples
+	// texture 'stage' at (u_, v_); the third and fourth coordinates are zero.
+	void PixelPipeline::TEXM3X2TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
+	{
+		TEXM3X2PAD(u, v, s, src0, 1, signedScaling);
+
+		// Only two rows exist; clear the third member temporary before sampling.
+		w_ = Float4(0.0f);
+
+		dst = sampleTexture(stage, u_, v_, w_, w_);
+	}
+
+	// Completes the third matrix row (component 2, result in w_) and writes the
+	// transformed vector (u_, v_, w_) to dst.xyz in 4.12 fixed-point.
+	// dst.w is set to 1.0 (0x1000).
+	void PixelPipeline::TEXM3X3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling)
+	{
+		TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
+
+		Float4 *row[3] = {&u_, &v_, &w_};
+
+		for(int component = 0; component < 3; component++)
+		{
+			dst[component] = RoundShort4(*row[component] * Float4(0x1000));
+		}
+
+		dst.w = Short4(0x1000);
+	}
+
+	// Computes one row of a matrix multiply: the dot product of the texture
+	// coordinates (u, v, s) with src0.xyz, scaled by 1 / 0x1000, stored in
+	// u_, v_ or w_ depending on 'component' (0, 1 or 2). The float copies of
+	// src0 (U, V, W) are cached and only refreshed for the first row or when
+	// 'signedScaling' differs from the previous call.
+	void PixelPipeline::TEXM3X3PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
+	{
+		const bool reload = (component == 0) || (previousScaling != signedScaling);   // FIXME: Other source modifiers?
+
+		if(reload)
+		{
+			U = Float4(src0.x);
+			V = Float4(src0.y);
+			W = Float4(src0.z);
+
+			previousScaling = signedScaling;
+		}
+
+		Float4 dot = U * u + V * v + W * s;
+
+		dot *= Float4(1.0f / 0x1000);
+
+		if(component == 0)
+		{
+			u_ = dot;
+		}
+		else if(component == 1)
+		{
+			v_ = dot;
+		}
+		else if(component == 2)
+		{
+			w_ = dot;
+		}
+		else
+		{
+			ASSERT(false);
+		}
+	}
+
+	// Completes the third matrix row (component 2) to obtain the vector
+	// N = (u_, v_, w_), reflects the eye vector taken from src1.xyz about it and
+	// samples texture 'stage' with the reflection vector.
+	// Note: the member temporaries u_, v_ and w_ are overwritten in the process.
+	void PixelPipeline::TEXM3X3SPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1)
+	{
+		TEXM3X3PAD(u, v, s, src0, 2, false);
+
+		Float4 E[3];   // Eye vector
+
+		E[0] = Float4(src1.x) * Float4(1.0f / 0x0FFE);
+		E[1] = Float4(src1.y) * Float4(1.0f / 0x0FFE);
+		E[2] = Float4(src1.z) * Float4(1.0f / 0x0FFE);
+
+		// Reflection
+		Float4 u__;
+		Float4 v__;
+		Float4 w__;
+
+		// (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
+		u__ = u_ * E[0];
+		v__ = v_ * E[1];
+		w__ = w_ * E[2];
+		u__ += v__ + w__;   // N . E
+		u__ += u__;         // 2 * (N . E)
+		v__ = u__;
+		w__ = u__;
+		u__ *= u_;          // 2 * (N . E) * N, per component
+		v__ *= v_;
+		w__ *= w_;
+		u_ *= u_;
+		v_ *= v_;
+		w_ *= w_;
+		u_ += v_ + w_;      // N . N (u_ no longer holds the first row)
+		u__ -= E[0] * u_;
+		v__ -= E[1] * u_;
+		w__ -= E[2] * u_;
+
+		dst = sampleTexture(stage, u__, v__, w__, w__);
+	}
+
+	// Completes the third matrix row (component 2, result in w_) and samples
+	// texture 'stage' at (u_, v_, w_); w_ is also passed as the fourth coordinate.
+	void PixelPipeline::TEXM3X3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
+	{
+		TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
+
+		dst = sampleTexture(stage, u_, v_, w_, w_);
+	}
+
+	// Completes the third matrix row (component 2) to obtain the vector
+	// N = (u_, v_, w_), reflects a per-pixel eye vector about it and samples
+	// texture 'stage' with the reflection vector. The eye vector is read from
+	// the w components of the interpolants for this stage and the two before it.
+	// Note: the member temporaries u_, v_ and w_ are overwritten in the process.
+	void PixelPipeline::TEXM3X3VSPEC(Vector4s &dst, Float4 &x, Float4 &y, Float4 &z, int stage, Vector4s &src0)
+	{
+		TEXM3X3PAD(x, y, z, src0, 2, false);
+
+		Float4 E[3];   // Eye vector
+
+		E[0] = v[2 + stage - 2].w;
+		E[1] = v[2 + stage - 1].w;
+		E[2] = v[2 + stage - 0].w;
+
+		// Reflection
+		Float4 u__;
+		Float4 v__;
+		Float4 w__;
+
+		// (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
+		u__ = u_ * E[0];
+		v__ = v_ * E[1];
+		w__ = w_ * E[2];
+		u__ += v__ + w__;   // N . E
+		u__ += u__;         // 2 * (N . E)
+		v__ = u__;
+		w__ = u__;
+		u__ *= u_;          // 2 * (N . E) * N, per component
+		v__ *= v_;
+		w__ *= w_;
+		u_ *= u_;
+		v_ *= v_;
+		w_ *= w_;
+		u_ += v_ + w_;      // N . N (u_ no longer holds the first row)
+		u__ -= E[0] * u_;
+		v__ -= E[1] * u_;
+		w__ -= E[2] * u_;
+
+		dst = sampleTexture(stage, u__, v__, w__, w__);
+	}
+
+	// Writes rs[5].x / rs[5].y to the output depth, using an approximate
+	// reciprocal. The member temporaries u_ and v_ are overwritten.
+	void PixelPipeline::TEXDEPTH()
+	{
+		Vector4s &r5 = rs[5];
+
+		u_ = Float4(r5.x);
+		v_ = Float4(r5.y);
+
+		// z / w
+		// FIXME: Set result to 1.0 when division by zero
+		Float4 rcpW = Rcp_pp(v_);
+		u_ *= rcpW;
+
+		oDepth = u_;
+	}
+
+	// Conditional select per component: dst = (src0 > 0.5) ? src1 : src2,
+	// where 0.5 is 0x0800 in 4.12 fixed-point.
+	void PixelPipeline::CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			Short4 mask = CmpGT(src0[component], Short4(0x0800));
+			Short4 whenTrue = src1[component] & mask;
+			Short4 whenFalse = ~mask & src2[component];
+
+			dst[component] = whenFalse | whenTrue;
+		}
+	}
+
+	// Compare per component: dst = (src0 < 0) ? src2 : src1.
+	void PixelPipeline::CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			Short4 negative = CmpGT(Short4(0x0000), src0[component]);
+			Short4 whenNegative = src2[component] & negative;
+			Short4 otherwise = ~negative & src1[component];
+
+			dst[component] = otherwise | whenNegative;
+		}
+	}
+
+	// Bump environment map transform of the first two components, in 4.12
+	// fixed-point with saturating additions:
+	//   dst.x = src0.x + BUMPENVMAT00(stage) * src1.x + BUMPENVMAT10(stage) * src1.y
+	//   dst.y = src0.y + BUMPENVMAT01(stage) * src1.x + BUMPENVMAT11(stage) * src1.y
+	// dst.z and dst.w are not written.
+	void PixelPipeline::BEM(Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage)
+	{
+		for(int column = 0; column < 2; column++)
+		{
+			// FIXME: Matrix components range? Overflow hazard.
+			Short4 fromX = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][column]))) << 4;
+			Short4 fromY = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][column]))) << 4;
+
+			Short4 sum = AddSat(fromX, fromY);
+
+			dst[column] = AddSat(sum, src0[column]);
+		}
+	}
+}
+
diff --git a/src/Pipeline/PixelPipeline.hpp b/src/Pipeline/PixelPipeline.hpp
new file mode 100644
index 0000000..66f0ec7
--- /dev/null
+++ b/src/Pipeline/PixelPipeline.hpp
@@ -0,0 +1,114 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelPipeline_hpp
+#define sw_PixelPipeline_hpp
+
+#include "PixelRoutine.hpp"
+
+namespace sw
+{
+	class PixelPipeline : public PixelRoutine   // PixelRoutine specialization whose registers are Short4-based (Vector4s)
+	{
+	public:
+		PixelPipeline(const PixelProcessor::State &state, const PixelShader *shader) :   // Binds the register aliases and clears all flags
+			PixelRoutine(state, shader), current(rs[0]), diffuse(vs[0]), specular(vs[1]), perturbate(false), luminance(false), previousScaling(false) {}
+		virtual ~PixelPipeline() {}
+
+	protected:   // Overrides of the PixelRoutine stages
+		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w);
+		virtual void applyShader(Int cMask[4]);
+		virtual Bool alphaTest(Int cMask[4]);
+		virtual void rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
+
+	private:
+		Vector4s &current;    // Alias of rs[0] (bound in the constructor)
+		Vector4s &diffuse;    // Alias of vs[0] (bound in the constructor)
+		Vector4s &specular;   // Alias of vs[1] (bound in the constructor)
+
+		Vector4s rs[6];
+		Vector4s vs[2];
+		Vector4s ts[6];
+
+		// bem(l) offsets and luminance
+		Float4 du;
+		Float4 dv;
+		Short4 L;
+
+		// texm3x3 temporaries
+		Float4 u_; // FIXME
+		Float4 v_; // FIXME
+		Float4 w_; // FIXME
+		Float4 U;  // FIXME
+		Float4 V;  // FIXME
+		Float4 W;  // FIXME
+
+		void fixedFunction();
+		void blendTexture(Vector4s &temp, Vector4s &texture, int stage);
+		void fogBlend(Vector4s &current, Float4 &fog);
+		void specularPixel(Vector4s &current, Vector4s &specular);
+
+		Vector4s sampleTexture(int coordinates, int sampler, bool project = false);
+		Vector4s sampleTexture(int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project = false);
+
+		Short4 convertFixed12(RValue<Float4> cf);
+		void convertFixed12(Vector4s &cs, Vector4f &cf);
+		Float4 convertSigned12(Short4 &cs);
+		void convertSigned12(Vector4f &cf, Vector4s &cs);
+
+		void writeDestination(Vector4s &d, const Dst &dst);
+		Vector4s fetchRegister(const Src &src);
+
+		// Instructions (one member function per shader opcode; operands are Vector4s registers)
+		void MOV(Vector4s &dst, Vector4s &src0);
+		void ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+		void SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+		void MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+		void MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+		void DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+		void DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1);
+		void LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+		void TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate);
+		void TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project);
+		void TEXDP3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src);
+		void TEXDP3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0);
+		void TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s);
+		void TEXKILL(Int cMask[4], Vector4s &dst);
+		void TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, bool project);
+		void TEXLD(Vector4s &dst, Vector4s &src, int stage, bool project);
+		void TEXBEM(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage);
+		void TEXBEML(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage);
+		void TEXREG2AR(Vector4s &dst, Vector4s &src0, int stage);
+		void TEXREG2GB(Vector4s &dst, Vector4s &src0, int stage);
+		void TEXREG2RGB(Vector4s &dst, Vector4s &src0, int stage);
+		void TEXM3X2DEPTH(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling);
+		void TEXM3X2PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling);
+		void TEXM3X2TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling);
+		void TEXM3X3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling);
+		void TEXM3X3PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling);
+		void TEXM3X3SPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1);
+		void TEXM3X3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling);
+		void TEXM3X3VSPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0);
+		void TEXDEPTH();
+		void CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+		void CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2);
+		void BEM(Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage);
+
+		bool perturbate;        // Initialized to false by the constructor
+		bool luminance;         // Initialized to false by the constructor
+		bool previousScaling;   // Initialized to false by the constructor
+	};
+}
+
+#endif
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
new file mode 100644
index 0000000..473712b
--- /dev/null
+++ b/src/Pipeline/PixelProgram.cpp
@@ -0,0 +1,1850 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelProgram.hpp"
+
+#include "SamplerCore.hpp"
+#include "Renderer/Primitive.hpp"
+#include "Renderer/Renderer.hpp"
+
+namespace sw
+{
+	extern bool postBlendSRGB;              // When false, rasterOperation() may apply linearToSRGB() to the color up front
+	extern bool booleanFaceRegister;        // When true, setBuiltins() turns vFace into a comparison mask instead of the raw area
+	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
+	extern bool fullPixelPositionRegister;  // When true, setBuiltins() also fills vPos.z and vPos.w
+
+	void PixelProgram::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
+	{
+		if(shader->getShaderModel() < 0x0300) return;   // vPos and vFace are only set up for shader model 3.0 and above
+
+		if(shader->isVPosDeclared())
+		{
+			// Per-lane offsets are added to the broadcast (x, y); halfIntegerCoordinates shifts them by 0.5.
+			if(halfIntegerCoordinates)
+			{
+				vPos.x = Float4(Float(x)) + Float4(0.5f, 1.5f, 0.5f, 1.5f);
+				vPos.y = Float4(Float(y)) + Float4(0.5f, 0.5f, 1.5f, 1.5f);
+			}
+			else
+			{
+				vPos.x = Float4(Float(x)) + Float4(0, 1, 0, 1);
+				vPos.y = Float4(Float(y)) + Float4(0, 0, 1, 1);
+			}
+
+			if(fullPixelPositionRegister)
+			{
+				vPos.z = z[0]; // FIXME: Centroid?
+				vPos.w = w;    // FIXME: Centroid?
+			}
+		}
+
+		if(shader->isVFaceDeclared())
+		{
+			Float4 facing = *Pointer<Float>(primitive + OFFSET(Primitive, area));   // Primitive area, broadcast to all lanes
+
+			if(booleanFaceRegister)
+			{
+				facing = As<Float4>(state.frontFaceCCW ? CmpNLT(facing, Float4(0.0f)) : CmpLT(facing, Float4(0.0f)));
+			}
+
+			vFace.x = facing;
+			vFace.y = facing;
+			vFace.z = facing;
+			vFace.w = facing;
+		}
+	}
+
+	void PixelProgram::applyShader(Int cMask[4])
+	{   // Processes every shader instruction in order, writes each result to its destination register, then finalizes c[] and oDepth.
+		enableIndex = 0;
+		stackIndex = 0;
+
+		if(shader->containsLeaveInstruction())
+		{
+			enableLeave = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+		}
+
+		for(int i = 0; i < RENDERTARGETS; i++)
+		{
+			if(state.targetFormat[i] != FORMAT_NULL)   // Only outputs with a non-null target format are zero-initialized
+			{
+				oC[i] = Vector4f(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+		}
+
+		// Create all call site return blocks up front
+		for(size_t i = 0; i < shader->getLength(); i++)
+		{
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
+
+			if(opcode == Shader::OPCODE_CALL || opcode == Shader::OPCODE_CALLNZ)
+			{
+				const Dst &dst = instruction->dst;
+
+				ASSERT(callRetBlock[dst.label].size() == dst.callSite);
+				callRetBlock[dst.label].push_back(Nucleus::createBasicBlock());
+			}
+		}
+
+		bool broadcastColor0 = true;
+
+		for(size_t i = 0; i < shader->getLength(); i++)
+		{
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
+
+			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)   // Declarations and constant definitions are skipped here
+			{
+				continue;
+			}
+
+			const Dst &dst = instruction->dst;
+			const Src &src0 = instruction->src[0];
+			const Src &src1 = instruction->src[1];
+			const Src &src2 = instruction->src[2];
+			const Src &src3 = instruction->src[3];
+			const Src &src4 = instruction->src[4];
+
+			bool predicate = instruction->predicate;
+			Control control = instruction->control;
+			bool pp = dst.partialPrecision;
+			bool project = instruction->project;
+			bool bias = instruction->bias;
+
+			Vector4f d;
+			Vector4f s0;
+			Vector4f s1;
+			Vector4f s2;
+			Vector4f s3;
+			Vector4f s4;
+
+			if(opcode == Shader::OPCODE_TEXKILL)   // Takes destination as input
+			{
+				if(dst.type == Shader::PARAMETER_TEXTURE)
+				{
+					d.x = v[2 + dst.index].x;
+					d.y = v[2 + dst.index].y;
+					d.z = v[2 + dst.index].z;
+					d.w = v[2 + dst.index].w;
+				}
+				else
+				{
+					d = r[dst.index];
+				}
+			}
+
+			if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
+			if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
+			if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
+			if(src3.type != Shader::PARAMETER_VOID) s3 = fetchRegister(src3);
+			if(src4.type != Shader::PARAMETER_VOID) s4 = fetchRegister(src4);
+
+			switch(opcode)
+			{
+			case Shader::OPCODE_PS_2_0:                                                    break;
+			case Shader::OPCODE_PS_2_x:                                                    break;
+			case Shader::OPCODE_PS_3_0:                                                    break;
+			case Shader::OPCODE_DEF:                                                       break;
+			case Shader::OPCODE_DCL:                                                       break;
+			case Shader::OPCODE_NOP:                                                       break;
+			case Shader::OPCODE_MOV:        mov(d, s0);                                    break;
+			case Shader::OPCODE_NEG:        neg(d, s0);                                    break;
+			case Shader::OPCODE_INEG:       ineg(d, s0);                                   break;
+			case Shader::OPCODE_F2B:        f2b(d, s0);                                    break;
+			case Shader::OPCODE_B2F:        b2f(d, s0);                                    break;
+			case Shader::OPCODE_F2I:        f2i(d, s0);                                    break;
+			case Shader::OPCODE_I2F:        i2f(d, s0);                                    break;
+			case Shader::OPCODE_F2U:        f2u(d, s0);                                    break;
+			case Shader::OPCODE_U2F:        u2f(d, s0);                                    break;
+			case Shader::OPCODE_I2B:        i2b(d, s0);                                    break;
+			case Shader::OPCODE_B2I:        b2i(d, s0);                                    break;
+			case Shader::OPCODE_ADD:        add(d, s0, s1);                                break;
+			case Shader::OPCODE_IADD:       iadd(d, s0, s1);                               break;
+			case Shader::OPCODE_SUB:        sub(d, s0, s1);                                break;
+			case Shader::OPCODE_ISUB:       isub(d, s0, s1);                               break;
+			case Shader::OPCODE_MUL:        mul(d, s0, s1);                                break;
+			case Shader::OPCODE_IMUL:       imul(d, s0, s1);                               break;
+			case Shader::OPCODE_MAD:        mad(d, s0, s1, s2);                            break;
+			case Shader::OPCODE_IMAD:       imad(d, s0, s1, s2);                           break;
+			case Shader::OPCODE_DP1:        dp1(d, s0, s1);                                break;
+			case Shader::OPCODE_DP2:        dp2(d, s0, s1);                                break;
+			case Shader::OPCODE_DP2ADD:     dp2add(d, s0, s1, s2);                         break;
+			case Shader::OPCODE_DP3:        dp3(d, s0, s1);                                break;
+			case Shader::OPCODE_DP4:        dp4(d, s0, s1);                                break;
+			case Shader::OPCODE_DET2:       det2(d, s0, s1);                               break;
+			case Shader::OPCODE_DET3:       det3(d, s0, s1, s2);                           break;
+			case Shader::OPCODE_DET4:       det4(d, s0, s1, s2, s3);                       break;
+			case Shader::OPCODE_CMP0:       cmp0(d, s0, s1, s2);                           break;
+			case Shader::OPCODE_ICMP:       icmp(d, s0, s1, control);                      break;
+			case Shader::OPCODE_UCMP:       ucmp(d, s0, s1, control);                      break;
+			case Shader::OPCODE_SELECT:     select(d, s0, s1, s2);                         break;
+			case Shader::OPCODE_EXTRACT:    extract(d.x, s0, s1.x);                        break;
+			case Shader::OPCODE_INSERT:     insert(d, s0, s1.x, s2.x);                     break;
+			case Shader::OPCODE_FRC:        frc(d, s0);                                    break;
+			case Shader::OPCODE_TRUNC:      trunc(d, s0);                                  break;
+			case Shader::OPCODE_FLOOR:      floor(d, s0);                                  break;
+			case Shader::OPCODE_ROUND:      round(d, s0);                                  break;
+			case Shader::OPCODE_ROUNDEVEN:  roundEven(d, s0);                              break;
+			case Shader::OPCODE_CEIL:       ceil(d, s0);                                   break;
+			case Shader::OPCODE_EXP2X:      exp2x(d, s0, pp);                              break;
+			case Shader::OPCODE_EXP2:       exp2(d, s0, pp);                               break;
+			case Shader::OPCODE_LOG2X:      log2x(d, s0, pp);                              break;
+			case Shader::OPCODE_LOG2:       log2(d, s0, pp);                               break;
+			case Shader::OPCODE_EXP:        exp(d, s0, pp);                                break;
+			case Shader::OPCODE_LOG:        log(d, s0, pp);                                break;
+			case Shader::OPCODE_RCPX:       rcpx(d, s0, pp);                               break;
+			case Shader::OPCODE_DIV:        div(d, s0, s1);                                break;
+			case Shader::OPCODE_IDIV:       idiv(d, s0, s1);                               break;
+			case Shader::OPCODE_UDIV:       udiv(d, s0, s1);                               break;
+			case Shader::OPCODE_MOD:        mod(d, s0, s1);                                break;
+			case Shader::OPCODE_IMOD:       imod(d, s0, s1);                               break;
+			case Shader::OPCODE_UMOD:       umod(d, s0, s1);                               break;
+			case Shader::OPCODE_SHL:        shl(d, s0, s1);                                break;
+			case Shader::OPCODE_ISHR:       ishr(d, s0, s1);                               break;
+			case Shader::OPCODE_USHR:       ushr(d, s0, s1);                               break;
+			case Shader::OPCODE_RSQX:       rsqx(d, s0, pp);                               break;
+			case Shader::OPCODE_SQRT:       sqrt(d, s0, pp);                               break;
+			case Shader::OPCODE_RSQ:        rsq(d, s0, pp);                                break;
+			case Shader::OPCODE_LEN2:       len2(d.x, s0, pp);                             break;
+			case Shader::OPCODE_LEN3:       len3(d.x, s0, pp);                             break;
+			case Shader::OPCODE_LEN4:       len4(d.x, s0, pp);                             break;
+			case Shader::OPCODE_DIST1:      dist1(d.x, s0, s1, pp);                        break;
+			case Shader::OPCODE_DIST2:      dist2(d.x, s0, s1, pp);                        break;
+			case Shader::OPCODE_DIST3:      dist3(d.x, s0, s1, pp);                        break;
+			case Shader::OPCODE_DIST4:      dist4(d.x, s0, s1, pp);                        break;
+			case Shader::OPCODE_MIN:        min(d, s0, s1);                                break;
+			case Shader::OPCODE_IMIN:       imin(d, s0, s1);                               break;
+			case Shader::OPCODE_UMIN:       umin(d, s0, s1);                               break;
+			case Shader::OPCODE_MAX:        max(d, s0, s1);                                break;
+			case Shader::OPCODE_IMAX:       imax(d, s0, s1);                               break;
+			case Shader::OPCODE_UMAX:       umax(d, s0, s1);                               break;
+			case Shader::OPCODE_LRP:        lrp(d, s0, s1, s2);                            break;
+			case Shader::OPCODE_STEP:       step(d, s0, s1);                               break;
+			case Shader::OPCODE_SMOOTH:     smooth(d, s0, s1, s2);                         break;
+			case Shader::OPCODE_ISINF:      isinf(d, s0);                                  break;
+			case Shader::OPCODE_ISNAN:      isnan(d, s0);                                  break;
+			case Shader::OPCODE_FLOATBITSTOINT:
+			case Shader::OPCODE_FLOATBITSTOUINT:
+			case Shader::OPCODE_INTBITSTOFLOAT:
+			case Shader::OPCODE_UINTBITSTOFLOAT: d = s0;                                   break;
+			case Shader::OPCODE_PACKSNORM2x16:   packSnorm2x16(d, s0);                     break;
+			case Shader::OPCODE_PACKUNORM2x16:   packUnorm2x16(d, s0);                     break;
+			case Shader::OPCODE_PACKHALF2x16:    packHalf2x16(d, s0);                      break;
+			case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0);                   break;
+			case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0);                   break;
+			case Shader::OPCODE_UNPACKHALF2x16:  unpackHalf2x16(d, s0);                    break;
+			case Shader::OPCODE_POWX:       powx(d, s0, s1, pp);                           break;
+			case Shader::OPCODE_POW:        pow(d, s0, s1, pp);                            break;
+			case Shader::OPCODE_SGN:        sgn(d, s0);                                    break;
+			case Shader::OPCODE_ISGN:       isgn(d, s0);                                   break;
+			case Shader::OPCODE_CRS:        crs(d, s0, s1);                                break;
+			case Shader::OPCODE_FORWARD1:   forward1(d, s0, s1, s2);                       break;
+			case Shader::OPCODE_FORWARD2:   forward2(d, s0, s1, s2);                       break;
+			case Shader::OPCODE_FORWARD3:   forward3(d, s0, s1, s2);                       break;
+			case Shader::OPCODE_FORWARD4:   forward4(d, s0, s1, s2);                       break;
+			case Shader::OPCODE_REFLECT1:   reflect1(d, s0, s1);                           break;
+			case Shader::OPCODE_REFLECT2:   reflect2(d, s0, s1);                           break;
+			case Shader::OPCODE_REFLECT3:   reflect3(d, s0, s1);                           break;
+			case Shader::OPCODE_REFLECT4:   reflect4(d, s0, s1);                           break;
+			case Shader::OPCODE_REFRACT1:   refract1(d, s0, s1, s2.x);                     break;
+			case Shader::OPCODE_REFRACT2:   refract2(d, s0, s1, s2.x);                     break;
+			case Shader::OPCODE_REFRACT3:   refract3(d, s0, s1, s2.x);                     break;
+			case Shader::OPCODE_REFRACT4:   refract4(d, s0, s1, s2.x);                     break;
+			case Shader::OPCODE_NRM2:       nrm2(d, s0, pp);                               break;
+			case Shader::OPCODE_NRM3:       nrm3(d, s0, pp);                               break;
+			case Shader::OPCODE_NRM4:       nrm4(d, s0, pp);                               break;
+			case Shader::OPCODE_ABS:        abs(d, s0);                                    break;
+			case Shader::OPCODE_IABS:       iabs(d, s0);                                   break;
+			case Shader::OPCODE_SINCOS:     sincos(d, s0, pp);                             break;
+			case Shader::OPCODE_COS:        cos(d, s0, pp);                                break;
+			case Shader::OPCODE_SIN:        sin(d, s0, pp);                                break;
+			case Shader::OPCODE_TAN:        tan(d, s0, pp);                                break;
+			case Shader::OPCODE_ACOS:       acos(d, s0, pp);                               break;
+			case Shader::OPCODE_ASIN:       asin(d, s0, pp);                               break;
+			case Shader::OPCODE_ATAN:       atan(d, s0, pp);                               break;
+			case Shader::OPCODE_ATAN2:      atan2(d, s0, s1, pp);                          break;
+			case Shader::OPCODE_COSH:       cosh(d, s0, pp);                               break;
+			case Shader::OPCODE_SINH:       sinh(d, s0, pp);                               break;
+			case Shader::OPCODE_TANH:       tanh(d, s0, pp);                               break;
+			case Shader::OPCODE_ACOSH:      acosh(d, s0, pp);                              break;
+			case Shader::OPCODE_ASINH:      asinh(d, s0, pp);                              break;
+			case Shader::OPCODE_ATANH:      atanh(d, s0, pp);                              break;
+			case Shader::OPCODE_M4X4:       M4X4(d, s0, src1);                             break;   // Matrix and texture ops receive src1 as a Src descriptor, not a fetched value
+			case Shader::OPCODE_M4X3:       M4X3(d, s0, src1);                             break;
+			case Shader::OPCODE_M3X4:       M3X4(d, s0, src1);                             break;
+			case Shader::OPCODE_M3X3:       M3X3(d, s0, src1);                             break;
+			case Shader::OPCODE_M3X2:       M3X2(d, s0, src1);                             break;
+			case Shader::OPCODE_TEX:        TEX(d, s0, src1, project, bias);               break;
+			case Shader::OPCODE_TEXLDD:     TEXGRAD(d, s0, src1, s2, s3);                  break;
+			case Shader::OPCODE_TEXLDL:     TEXLOD(d, s0, src1, s0.w);                     break;
+			case Shader::OPCODE_TEXLOD:     TEXLOD(d, s0, src1, s2.x);                     break;
+			case Shader::OPCODE_TEXSIZE:    TEXSIZE(d, s0.x, src1);                        break;
+			case Shader::OPCODE_TEXKILL:    TEXKILL(cMask, d, dst.mask);                   break;
+			case Shader::OPCODE_TEXOFFSET:  TEXOFFSET(d, s0, src1, s2);                    break;
+			case Shader::OPCODE_TEXLODOFFSET: TEXLODOFFSET(d, s0, src1, s2, s3.x);         break;
+			case Shader::OPCODE_TEXELFETCH: TEXELFETCH(d, s0, src1, s2.x);                 break;
+			case Shader::OPCODE_TEXELFETCHOFFSET: TEXELFETCHOFFSET(d, s0, src1, s2, s3.x); break;
+			case Shader::OPCODE_TEXGRAD:    TEXGRAD(d, s0, src1, s2, s3);                  break;
+			case Shader::OPCODE_TEXGRADOFFSET: TEXGRADOFFSET(d, s0, src1, s2, s3, s4);     break;
+			case Shader::OPCODE_TEXBIAS:    TEXBIAS(d, s0, src1, s2.x);                    break;
+			case Shader::OPCODE_TEXOFFSETBIAS: TEXOFFSETBIAS(d, s0, src1, s2, s3.x);       break;
+			case Shader::OPCODE_DISCARD:    DISCARD(cMask, instruction);                   break;
+			case Shader::OPCODE_DFDX:       DFDX(d, s0);                                   break;
+			case Shader::OPCODE_DFDY:       DFDY(d, s0);                                   break;
+			case Shader::OPCODE_FWIDTH:     FWIDTH(d, s0);                                 break;
+			case Shader::OPCODE_BREAK:      BREAK();                                       break;
+			case Shader::OPCODE_BREAKC:     BREAKC(s0, s1, control);                       break;
+			case Shader::OPCODE_BREAKP:     BREAKP(src0);                                  break;
+			case Shader::OPCODE_CONTINUE:   CONTINUE();                                    break;
+			case Shader::OPCODE_TEST:       TEST();                                        break;
+			case Shader::OPCODE_CALL:       CALL(dst.label, dst.callSite);                 break;
+			case Shader::OPCODE_CALLNZ:     CALLNZ(dst.label, dst.callSite, src0);         break;
+			case Shader::OPCODE_ELSE:       ELSE();                                        break;
+			case Shader::OPCODE_ENDIF:      ENDIF();                                       break;
+			case Shader::OPCODE_ENDLOOP:    ENDLOOP();                                     break;
+			case Shader::OPCODE_ENDREP:     ENDREP();                                      break;
+			case Shader::OPCODE_ENDWHILE:   ENDWHILE();                                    break;
+			case Shader::OPCODE_ENDSWITCH:  ENDSWITCH();                                   break;
+			case Shader::OPCODE_IF:         IF(src0);                                      break;
+			case Shader::OPCODE_IFC:        IFC(s0, s1, control);                          break;
+			case Shader::OPCODE_LABEL:      LABEL(dst.index);                              break;
+			case Shader::OPCODE_LOOP:       LOOP(src1);                                    break;
+			case Shader::OPCODE_REP:        REP(src0);                                     break;
+			case Shader::OPCODE_WHILE:      WHILE(src0);                                   break;
+			case Shader::OPCODE_SWITCH:     SWITCH();                                      break;
+			case Shader::OPCODE_RET:        RET();                                         break;
+			case Shader::OPCODE_LEAVE:      LEAVE();                                       break;
+			case Shader::OPCODE_CMP:        cmp(d, s0, s1, control);                       break;
+			case Shader::OPCODE_ALL:        all(d.x, s0);                                  break;
+			case Shader::OPCODE_ANY:        any(d.x, s0);                                  break;
+			case Shader::OPCODE_NOT:        bitwise_not(d, s0);                            break;
+			case Shader::OPCODE_OR:         bitwise_or(d, s0, s1);                         break;
+			case Shader::OPCODE_XOR:        bitwise_xor(d, s0, s1);                        break;
+			case Shader::OPCODE_AND:        bitwise_and(d, s0, s1);                        break;
+			case Shader::OPCODE_EQ:         equal(d, s0, s1);                              break;
+			case Shader::OPCODE_NE:         notEqual(d, s0, s1);                           break;
+			case Shader::OPCODE_END:                                                       break;
+			default:
+				ASSERT(false);
+			}
+
+			if(dst.type != Shader::PARAMETER_VOID && dst.type != Shader::PARAMETER_LABEL && opcode != Shader::OPCODE_TEXKILL && opcode != Shader::OPCODE_NOP)   // Write-back of d to the destination register
+			{
+				if(dst.saturate)   // Clamp the written components to [0, 1]
+				{
+					if(dst.x) d.x = Max(d.x, Float4(0.0f));
+					if(dst.y) d.y = Max(d.y, Float4(0.0f));
+					if(dst.z) d.z = Max(d.z, Float4(0.0f));
+					if(dst.w) d.w = Max(d.w, Float4(0.0f));
+
+					if(dst.x) d.x = Min(d.x, Float4(1.0f));
+					if(dst.y) d.y = Min(d.y, Float4(1.0f));
+					if(dst.z) d.z = Min(d.z, Float4(1.0f));
+					if(dst.w) d.w = Min(d.w, Float4(1.0f));
+				}
+
+				if(instruction->isPredicated())   // Read the destination's current value (pDst) so masked-off lanes keep it
+				{
+					Vector4f pDst;   // FIXME: Rename
+
+					switch(dst.type)
+					{
+					case Shader::PARAMETER_TEMP:
+						if(dst.rel.type == Shader::PARAMETER_VOID)
+						{
+							if(dst.x) pDst.x = r[dst.index].x;
+							if(dst.y) pDst.y = r[dst.index].y;
+							if(dst.z) pDst.z = r[dst.index].z;
+							if(dst.w) pDst.w = r[dst.index].w;
+						}
+						else if(!dst.rel.dynamic)
+						{
+							Int a = dst.index + relativeAddress(dst.rel);
+
+							if(dst.x) pDst.x = r[a].x;
+							if(dst.y) pDst.y = r[a].y;
+							if(dst.z) pDst.z = r[a].z;
+							if(dst.w) pDst.w = r[a].w;
+						}
+						else
+						{
+							Int4 a = dst.index + dynamicAddress(dst.rel);
+
+							if(dst.x) pDst.x = r[a].x;
+							if(dst.y) pDst.y = r[a].y;
+							if(dst.z) pDst.z = r[a].z;
+							if(dst.w) pDst.w = r[a].w;
+						}
+						break;
+					case Shader::PARAMETER_COLOROUT:
+						if(dst.rel.type == Shader::PARAMETER_VOID)
+						{
+							if(dst.x) pDst.x = oC[dst.index].x;
+							if(dst.y) pDst.y = oC[dst.index].y;
+							if(dst.z) pDst.z = oC[dst.index].z;
+							if(dst.w) pDst.w = oC[dst.index].w;
+						}
+						else if(!dst.rel.dynamic)
+						{
+							Int a = dst.index + relativeAddress(dst.rel);
+
+							if(dst.x) pDst.x = oC[a].x;
+							if(dst.y) pDst.y = oC[a].y;
+							if(dst.z) pDst.z = oC[a].z;
+							if(dst.w) pDst.w = oC[a].w;
+						}
+						else
+						{
+							Int4 a = dst.index + dynamicAddress(dst.rel);
+
+							if(dst.x) pDst.x = oC[a].x;
+							if(dst.y) pDst.y = oC[a].y;
+							if(dst.z) pDst.z = oC[a].z;
+							if(dst.w) pDst.w = oC[a].w;
+						}
+						break;
+					case Shader::PARAMETER_PREDICATE:
+						if(dst.x) pDst.x = p0.x;
+						if(dst.y) pDst.y = p0.y;
+						if(dst.z) pDst.z = p0.z;
+						if(dst.w) pDst.w = p0.w;
+						break;
+					case Shader::PARAMETER_DEPTHOUT:
+						pDst.x = oDepth;
+						break;
+					default:
+						ASSERT(false);
+					}
+
+					Int4 enable = enableMask(instruction);
+
+					Int4 xEnable = enable;
+					Int4 yEnable = enable;
+					Int4 zEnable = enable;
+					Int4 wEnable = enable;
+
+					if(predicate)
+					{
+						unsigned char pSwizzle = instruction->predicateSwizzle;   // Two bits per component select a p0 component
+
+						Float4 xPredicate = p0[(pSwizzle >> 0) & 0x03];
+						Float4 yPredicate = p0[(pSwizzle >> 2) & 0x03];
+						Float4 zPredicate = p0[(pSwizzle >> 4) & 0x03];
+						Float4 wPredicate = p0[(pSwizzle >> 6) & 0x03];
+
+						if(!instruction->predicateNot)
+						{
+							if(dst.x) xEnable = xEnable & As<Int4>(xPredicate);
+							if(dst.y) yEnable = yEnable & As<Int4>(yPredicate);
+							if(dst.z) zEnable = zEnable & As<Int4>(zPredicate);
+							if(dst.w) wEnable = wEnable & As<Int4>(wPredicate);
+						}
+						else
+						{
+							if(dst.x) xEnable = xEnable & ~As<Int4>(xPredicate);
+							if(dst.y) yEnable = yEnable & ~As<Int4>(yPredicate);
+							if(dst.z) zEnable = zEnable & ~As<Int4>(zPredicate);
+							if(dst.w) wEnable = wEnable & ~As<Int4>(wPredicate);
+						}
+					}
+
+					if(dst.x) d.x = As<Float4>(As<Int4>(d.x) & xEnable);   // Keep the new value in the bits set in the enable mask...
+					if(dst.y) d.y = As<Float4>(As<Int4>(d.y) & yEnable);
+					if(dst.z) d.z = As<Float4>(As<Int4>(d.z) & zEnable);
+					if(dst.w) d.w = As<Float4>(As<Int4>(d.w) & wEnable);
+
+					if(dst.x) d.x = As<Float4>(As<Int4>(d.x) | (As<Int4>(pDst.x) & ~xEnable));   // ...and the previous destination value in the remaining bits
+					if(dst.y) d.y = As<Float4>(As<Int4>(d.y) | (As<Int4>(pDst.y) & ~yEnable));
+					if(dst.z) d.z = As<Float4>(As<Int4>(d.z) | (As<Int4>(pDst.z) & ~zEnable));
+					if(dst.w) d.w = As<Float4>(As<Int4>(d.w) | (As<Int4>(pDst.w) & ~wEnable));
+				}
+
+				switch(dst.type)
+				{
+				case Shader::PARAMETER_TEMP:
+					if(dst.rel.type == Shader::PARAMETER_VOID)
+					{
+						if(dst.x) r[dst.index].x = d.x;
+						if(dst.y) r[dst.index].y = d.y;
+						if(dst.z) r[dst.index].z = d.z;
+						if(dst.w) r[dst.index].w = d.w;
+					}
+					else if(!dst.rel.dynamic)
+					{
+						Int a = dst.index + relativeAddress(dst.rel);
+
+						if(dst.x) r[a].x = d.x;
+						if(dst.y) r[a].y = d.y;
+						if(dst.z) r[a].z = d.z;
+						if(dst.w) r[a].w = d.w;
+					}
+					else
+					{
+						Int4 a = dst.index + dynamicAddress(dst.rel);
+
+						if(dst.x) r.scatter_x(a, d.x);
+						if(dst.y) r.scatter_y(a, d.y);
+						if(dst.z) r.scatter_z(a, d.z);
+						if(dst.w) r.scatter_w(a, d.w);
+					}
+					break;
+				case Shader::PARAMETER_COLOROUT:
+					if(dst.rel.type == Shader::PARAMETER_VOID)
+					{
+						broadcastColor0 = (dst.index == 0) && broadcastColor0;   // Cleared by a write to any index other than 0; relative writes below clear it too
+
+						if(dst.x) oC[dst.index].x = d.x;
+						if(dst.y) oC[dst.index].y = d.y;
+						if(dst.z) oC[dst.index].z = d.z;
+						if(dst.w) oC[dst.index].w = d.w;
+					}
+					else if(!dst.rel.dynamic)
+					{
+						broadcastColor0 = false;
+						Int a = dst.index + relativeAddress(dst.rel);
+
+						if(dst.x) oC[a].x = d.x;
+						if(dst.y) oC[a].y = d.y;
+						if(dst.z) oC[a].z = d.z;
+						if(dst.w) oC[a].w = d.w;
+					}
+					else
+					{
+						broadcastColor0 = false;
+						Int4 a = dst.index + dynamicAddress(dst.rel);
+
+						if(dst.x) oC.scatter_x(a, d.x);
+						if(dst.y) oC.scatter_y(a, d.y);
+						if(dst.z) oC.scatter_z(a, d.z);
+						if(dst.w) oC.scatter_w(a, d.w);
+					}
+					break;
+				case Shader::PARAMETER_PREDICATE:
+					if(dst.x) p0.x = d.x;
+					if(dst.y) p0.y = d.y;
+					if(dst.z) p0.z = d.z;
+					if(dst.w) p0.w = d.w;
+					break;
+				case Shader::PARAMETER_DEPTHOUT:
+					oDepth = d.x;
+					break;
+				default:
+					ASSERT(false);
+				}
+			}
+		}
+
+		if(currentLabel != -1)
+		{
+			Nucleus::setInsertBlock(returnBlock);
+		}
+
+		if(broadcastColor0)   // No color output other than oC[0] was written: replicate it to every render target
+		{
+			for(int i = 0; i < RENDERTARGETS; i++)
+			{
+				c[i] = oC[0];
+			}
+		}
+		else
+		{
+			for(int i = 0; i < RENDERTARGETS; i++)
+			{
+				c[i] = oC[i];
+			}
+		}
+
+		clampColor(c);
+
+		if(state.depthOverride)   // Clamp oDepth to [0, 1]
+		{
+			oDepth = Min(Max(oDepth, Float4(0.0f)), Float4(1.0f));
+		}
+	}
+
+	// Applies the alpha test, or alpha-to-coverage, to the per-sample coverage masks in cMask.
+	// Returns a Bool which is true when at least one sample still has a non-zero coverage mask.
+	// When the alpha test is inactive, cMask is left untouched and true is returned.
+	Bool PixelProgram::alphaTest(Int cMask[4])
+	{
+		if(!state.alphaTestActive())
+		{
+			return true;
+		}
+
+		switch(state.transparencyAntialiasing)
+		{
+		case TRANSPARENCY_NONE:
+			{
+				Int alphaMask;
+
+				// Alpha of color output 0, scaled by 0x1000 and rounded to 16-bit integers.
+				Short4 alpha = RoundShort4(c[0].w * Float4(0x1000));
+
+				PixelRoutine::alphaTest(alphaMask, alpha);
+
+				for(unsigned int sample = 0; sample < state.multiSample; sample++)
+				{
+					cMask[sample] &= alphaMask;
+				}
+			}
+			break;
+		case TRANSPARENCY_ALPHA_TO_COVERAGE:
+			alphaToCoverage(cMask, c[0].w);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// OR the masks of all samples together; a zero result means nothing is left to write.
+		Int anyCoverage = cMask[0];
+
+		for(unsigned int sample = 1; sample < state.multiSample; sample++)
+		{
+			anyCoverage |= cMask[sample];
+		}
+
+		return anyCoverage != 0x0;
+	}
+
+	// Writes the shader's color outputs (c[]) to every render target that has color writes active.
+	// Per target: optional linear-to-sRGB conversion, fog blending (target 0 only), then, for each
+	// sample enabled in the multisample mask, blending and the color write. The target format
+	// selects between a 16-bit-per-component path (Vector4s) and a floating-point path (Vector4f).
+	void PixelProgram::rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
+	{
+		for(int index = 0; index < RENDERTARGETS; index++)
+		{
+			if(!state.colorWriteActive(index))
+			{
+				continue;
+			}
+
+			// Convert here only when the conversion isn't deferred until after blending
+			// and the target isn't itself reported as sRGB. Alpha is left untouched.
+			if(!postBlendSRGB && state.writeSRGB && !isSRGB(index))
+			{
+				c[index].x = linearToSRGB(c[index].x);
+				c[index].y = linearToSRGB(c[index].y);
+				c[index].z = linearToSRGB(c[index].z);
+			}
+
+			// Fog is only blended into the first color output.
+			if(index == 0)
+			{
+				fogBlend(c[index], fog);
+			}
+
+			switch(state.targetFormat[index])
+			{
+			// Formats written through the Vector4s path (16-bit unsigned values per component).
+			case FORMAT_R5G6B5:
+			case FORMAT_X8R8G8B8:
+			case FORMAT_X8B8G8R8:
+			case FORMAT_A8R8G8B8:
+			case FORMAT_A8B8G8R8:
+			case FORMAT_SRGB8_X8:
+			case FORMAT_SRGB8_A8:
+			case FORMAT_G8R8:
+			case FORMAT_R8:
+			case FORMAT_A8:
+			case FORMAT_G16R16:
+			case FORMAT_A16B16G16R16:
+				for(unsigned int q = 0; q < state.multiSample; q++)
+				{
+					// Sample q's data is located q * colorSliceB[index] bytes past the target's base pointer.
+					Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+					Vector4s color;
+
+					if(state.targetFormat[index] == FORMAT_R5G6B5)
+					{
+						// NOTE(review): the 0xFBFF/0xFDFF scale factors presumably account for the
+						// 5- and 6-bit channel widths of this format - confirm.
+						color.x = UShort4(c[index].x * Float4(0xFBFF), false);
+						color.y = UShort4(c[index].y * Float4(0xFDFF), false);
+						color.z = UShort4(c[index].z * Float4(0xFBFF), false);
+						color.w = UShort4(c[index].w * Float4(0xFFFF), false);
+					}
+					else
+					{
+						color.x = convertFixed16(c[index].x, false);
+						color.y = convertFixed16(c[index].y, false);
+						color.z = convertFixed16(c[index].z, false);
+						color.w = convertFixed16(c[index].w, false);
+					}
+
+					// Samples disabled by the multisample mask are not written.
+					if(state.multiSampleMask & (1 << q))
+					{
+						alphaBlend(index, buffer, color, x);
+						logicOperation(index, buffer, color, x);
+						writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+					}
+				}
+				break;
+			// Formats written through the Vector4f path. No logic operation is applied on this path.
+			case FORMAT_R32F:
+			case FORMAT_G32R32F:
+			case FORMAT_X32B32G32R32F:
+			case FORMAT_A32B32G32R32F:
+			case FORMAT_X32B32G32R32F_UNSIGNED:
+			case FORMAT_R32I:
+			case FORMAT_G32R32I:
+			case FORMAT_A32B32G32R32I:
+			case FORMAT_R32UI:
+			case FORMAT_G32R32UI:
+			case FORMAT_A32B32G32R32UI:
+			case FORMAT_R16I:
+			case FORMAT_G16R16I:
+			case FORMAT_A16B16G16R16I:
+			case FORMAT_R16UI:
+			case FORMAT_G16R16UI:
+			case FORMAT_A16B16G16R16UI:
+			case FORMAT_R8I:
+			case FORMAT_G8R8I:
+			case FORMAT_A8B8G8R8I:
+			case FORMAT_R8UI:
+			case FORMAT_G8R8UI:
+			case FORMAT_A8B8G8R8UI:
+				for(unsigned int q = 0; q < state.multiSample; q++)
+				{
+					// Sample q's data is located q * colorSliceB[index] bytes past the target's base pointer.
+					Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+					Vector4f color = c[index];
+
+					// Samples disabled by the multisample mask are not written.
+					if(state.multiSampleMask & (1 << q))
+					{
+						alphaBlend(index, buffer, color, x);
+						writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+					}
+				}
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+	}
+
+	// Samples the texture selected by a sampler source operand and applies the operand's swizzle
+	// to the result. A sampler operand without relative addressing is sampled directly; otherwise
+	// the sampler index is fetched at run time and compared against every sampler the shader uses.
+	Vector4f PixelProgram::sampleTexture(const Src &sampler, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+	{
+		const bool directSampler = (sampler.type == Shader::PARAMETER_SAMPLER) && (sampler.rel.type == Shader::PARAMETER_VOID);
+
+		Vector4f texel;
+
+		if(directSampler)
+		{
+			texel = sampleTexture(sampler.index, uvwq, bias, dsx, dsy, offset, function);
+		}
+		else
+		{
+			// The index lives in the first lane of the fetched register's x component.
+			Int index = As<Int>(Float(fetchRegister(sampler).x.x));
+
+			for(int unit = 0; unit < TEXTURE_IMAGE_UNITS; unit++)
+			{
+				if(!shader->usesSampler(unit))
+				{
+					continue;
+				}
+
+				If(index == unit)
+				{
+					texel = sampleTexture(unit, uvwq, bias, dsx, dsy, offset, function);
+					// FIXME: When the sampler states are the same, we could use one sampler and just index the texture
+				}
+			}
+		}
+
+		// Each pair of swizzle bits selects the source component for one output component.
+		Vector4f swizzled;
+		swizzled.x = texel[(sampler.swizzle >> 0) & 0x3];
+		swizzled.y = texel[(sampler.swizzle >> 2) & 0x3];
+		swizzled.z = texel[(sampler.swizzle >> 4) & 0x3];
+		swizzled.w = texel[(sampler.swizzle >> 6) & 0x3];
+
+		return swizzled;
+	}
+
+	// Samples the texture of a sampler whose index is known when the routine is generated.
+	// The texture data is found at the sampler's slot in DrawData's mipmap array. When
+	// PERF_PROFILE is enabled, the elapsed ticks are accumulated into cycles[PERF_TEX].
+	Vector4f PixelProgram::sampleTexture(int samplerIndex, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+	{
+		#if PERF_PROFILE
+			Long startTicks = Ticks();
+		#endif
+
+		Pointer<Byte> textureData = data + OFFSET(DrawData, mipmap) + samplerIndex * sizeof(Texture);
+		Vector4f texel = SamplerCore(constants, state.sampler[samplerIndex]).sampleTexture(textureData, uvwq.x, uvwq.y, uvwq.z, uvwq.w, bias, dsx, dsy, offset, function);
+
+		#if PERF_PROFILE
+			cycles[PERF_TEX] += Ticks() - startTicks;
+		#endif
+
+		return texel;
+	}
+
+	// Clamps the color outputs to the range their render target format can represent.
+	// Targets without active color writes are skipped, except target 0 when the alpha
+	// test is active.
+	void PixelProgram::clampColor(Vector4f oC[RENDERTARGETS])
+	{
+		for(int target = 0; target < RENDERTARGETS; target++)
+		{
+			const bool neededForAlphaTest = (target == 0) && state.alphaTestActive();
+
+			if(!state.colorWriteActive(target) && !neededForAlphaTest)
+			{
+				continue;
+			}
+
+			switch(state.targetFormat[target])
+			{
+			case FORMAT_R5G6B5:
+			case FORMAT_A8R8G8B8:
+			case FORMAT_A8B8G8R8:
+			case FORMAT_X8R8G8B8:
+			case FORMAT_X8B8G8R8:
+			case FORMAT_SRGB8_X8:
+			case FORMAT_SRGB8_A8:
+			case FORMAT_G8R8:
+			case FORMAT_R8:
+			case FORMAT_A8:
+			case FORMAT_G16R16:
+			case FORMAT_A16B16G16R16:
+				// Clamp every component to [0, 1].
+				for(int component = 0; component < 4; component++)
+				{
+					oC[target][component] = Min(Max(oC[target][component], Float4(0.0f)), Float4(1.0f));
+				}
+				break;
+			case FORMAT_X32B32G32R32F_UNSIGNED:
+				// Only the lower bound is enforced.
+				for(int component = 0; component < 4; component++)
+				{
+					oC[target][component] = Max(oC[target][component], Float4(0.0f));
+				}
+				break;
+			// The remaining supported formats are passed through without clamping.
+			case FORMAT_NULL:
+			case FORMAT_R32F:
+			case FORMAT_G32R32F:
+			case FORMAT_X32B32G32R32F:
+			case FORMAT_A32B32G32R32F:
+			case FORMAT_R32I:
+			case FORMAT_G32R32I:
+			case FORMAT_A32B32G32R32I:
+			case FORMAT_R32UI:
+			case FORMAT_G32R32UI:
+			case FORMAT_A32B32G32R32UI:
+			case FORMAT_R16I:
+			case FORMAT_G16R16I:
+			case FORMAT_A16B16G16R16I:
+			case FORMAT_R16UI:
+			case FORMAT_G16R16UI:
+			case FORMAT_A16B16G16R16UI:
+			case FORMAT_R8I:
+			case FORMAT_G8R8I:
+			case FORMAT_A8B8G8R8I:
+			case FORMAT_R8UI:
+			case FORMAT_G8R8UI:
+			case FORMAT_A8B8G8R8UI:
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+	}
+
+	// Builds the per-lane execution mask for an instruction. The mask starts from the top of the
+	// enable stack when the instruction is flagged as being inside a branch, or all ones otherwise.
+	// Outside of a while-test it is further restricted by the break, continue and leave masks,
+	// each only when both the shader and the instruction's analysis flags call for it.
+	Int4 PixelProgram::enableMask(const Shader::Instruction *instruction)
+	{
+		Int4 enable;
+
+		if(instruction->analysisBranch)
+		{
+			enable = Int4(enableStack[enableIndex]);
+		}
+		else
+		{
+			enable = Int4(0xFFFFFFFF);
+		}
+
+		if(whileTest)
+		{
+			return enable;
+		}
+
+		const bool applyBreak = shader->containsBreakInstruction() && instruction->analysisBreak;
+		if(applyBreak)
+		{
+			enable &= enableBreak;
+		}
+
+		const bool applyContinue = shader->containsContinueInstruction() && instruction->analysisContinue;
+		if(applyContinue)
+		{
+			enable &= enableContinue;
+		}
+
+		const bool applyLeave = shader->containsLeaveInstruction() && instruction->analysisLeave;
+		if(applyLeave)
+		{
+			enable &= enableLeave;
+		}
+
+		return enable;
+	}
+
+	// Reads a source operand into a Vector4f.
+	//   src    - the operand to read: register type, index, relative addressing, swizzle and modifier.
+	//   offset - added to src.index to form the register index (the M*X* matrix helpers use it to select rows).
+	// The operand's swizzle and modifier are applied to the value read, except for the operand
+	// types that return early below (sampler indices and the dummy types).
+	Vector4f PixelProgram::fetchRegister(const Src &src, unsigned int offset)
+	{
+		Vector4f reg;
+		unsigned int i = src.index + offset;
+
+		switch(src.type)
+		{
+		case Shader::PARAMETER_TEMP:
+			// Three addressing modes: absolute, relative with a single (scalar) address,
+			// and dynamic with a separate address per lane.
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg = r[i];
+			}
+			else if(!src.rel.dynamic)
+			{
+				reg = r[i + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else
+			{
+				reg = r[i + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_INPUT:
+			if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
+			{
+				reg = v[i];
+			}
+			else if(!src.rel.dynamic)
+			{
+				reg = v[i + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else
+			{
+				reg = v[i + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_CONST:
+			reg = readConstant(src, offset);
+			break;
+		case Shader::PARAMETER_TEXTURE:
+			// Texture coordinate registers are read from the input registers starting at v[2].
+			reg = v[2 + i];
+			break;
+		case Shader::PARAMETER_MISCTYPE:
+			if(src.index == Shader::VPosIndex) reg = vPos;
+			if(src.index == Shader::VFaceIndex) reg = vFace;
+			break;
+		case Shader::PARAMETER_SAMPLER:
+			// Only reg.x is set: it holds the sampler index as integer bits. Returned as is,
+			// without applying the swizzle or modifier.
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg.x = As<Float4>(Int4(i));
+			}
+			else if(src.rel.type == Shader::PARAMETER_TEMP)
+			{
+				reg.x = As<Float4>(Int4(i) + As<Int4>(r[src.rel.index].x));
+			}
+			return reg;
+		case Shader::PARAMETER_PREDICATE:   return reg; // Dummy
+		case Shader::PARAMETER_VOID:        return reg; // Dummy
+		case Shader::PARAMETER_FLOAT4LITERAL:
+			// Broadcast each literal component to all lanes.
+			reg.x = Float4(src.value[0]);
+			reg.y = Float4(src.value[1]);
+			reg.z = Float4(src.value[2]);
+			reg.w = Float4(src.value[3]);
+			break;
+		case Shader::PARAMETER_CONSTINT:    return reg; // Dummy
+		case Shader::PARAMETER_CONSTBOOL:   return reg; // Dummy
+		case Shader::PARAMETER_LOOP:        return reg; // Dummy
+		case Shader::PARAMETER_COLOROUT:
+			if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
+			{
+				reg = oC[i];
+			}
+			else if(!src.rel.dynamic)
+			{
+				reg = oC[i + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else
+			{
+				reg = oC[i + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_DEPTHOUT:
+			reg.x = oDepth;
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// Each pair of swizzle bits selects the source component for one output component.
+		const Float4 &x = reg[(src.swizzle >> 0) & 0x3];
+		const Float4 &y = reg[(src.swizzle >> 2) & 0x3];
+		const Float4 &z = reg[(src.swizzle >> 4) & 0x3];
+		const Float4 &w = reg[(src.swizzle >> 6) & 0x3];
+
+		Vector4f mod;
+
+		switch(src.modifier)
+		{
+		case Shader::MODIFIER_NONE:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
+			break;
+		case Shader::MODIFIER_NEGATE:
+			mod.x = -x;
+			mod.y = -y;
+			mod.z = -z;
+			mod.w = -w;
+			break;
+		case Shader::MODIFIER_ABS:
+			mod.x = Abs(x);
+			mod.y = Abs(y);
+			mod.z = Abs(z);
+			mod.w = Abs(w);
+			break;
+		case Shader::MODIFIER_ABS_NEGATE:
+			mod.x = -Abs(x);
+			mod.y = -Abs(y);
+			mod.z = -Abs(z);
+			mod.w = -Abs(w);
+			break;
+		case Shader::MODIFIER_NOT:
+			// Bitwise complement of the raw register bits.
+			mod.x = As<Float4>(As<Int4>(x) ^ Int4(0xFFFFFFFF));
+			mod.y = As<Float4>(As<Int4>(y) ^ Int4(0xFFFFFFFF));
+			mod.z = As<Float4>(As<Int4>(z) ^ Int4(0xFFFFFFFF));
+			mod.w = As<Float4>(As<Int4>(w) ^ Int4(0xFFFFFFFF));
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		return mod;
+	}
+
+	// Returns the address of a uniform. A bufferIndex of -1 selects element 'index' of the
+	// ps.c array stored in DrawData. Any other value loads the pointer stored in
+	// ps.u[bufferIndex] and adds 'index' to it.
+	RValue<Pointer<Byte>> PixelProgram::uniformAddress(int bufferIndex, unsigned int index)
+	{
+		if(bufferIndex != -1)
+		{
+			return *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, ps.u[bufferIndex])) + index;
+		}
+
+		return data + OFFSET(DrawData, ps.c[index]);
+	}
+
+	// Same as the two-argument overload, with an additional run-time offset expressed
+	// in float4-sized elements.
+	RValue<Pointer<Byte>> PixelProgram::uniformAddress(int bufferIndex, unsigned int index, Int& offset)
+	{
+		RValue<Pointer<Byte>> base = uniformAddress(bufferIndex, index);
+
+		return base + offset * sizeof(float4);
+	}
+
+	// Reads a constant (uniform) register operand.
+	//   src    - the constant operand; src.bufferIndex selects the storage (see uniformAddress()).
+	//   offset - added to src.index to form the register index.
+	// Returns the constant with each component broadcast across the four lanes, except in the
+	// dynamically indexed case where every lane reads its own register.
+	// The operand's swizzle and modifier are not applied here.
+	Vector4f PixelProgram::readConstant(const Src &src, unsigned int offset)
+	{
+		Vector4f c;
+		unsigned int i = src.index + offset;
+
+		if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
+		{
+			// Load the float4 once, then replicate each of its components to all lanes.
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i));
+
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+
+			if(shader->containsDefineInstruction())   // Constant may be known at compile time
+			{
+				// Use the literal values of the first DEF instruction whose destination index is i.
+				for(size_t j = 0; j < shader->getLength(); j++)
+				{
+					const Shader::Instruction &instruction = *shader->getInstruction(j);
+
+					if(instruction.opcode == Shader::OPCODE_DEF)
+					{
+						if(instruction.dst.index == i)
+						{
+							c.x = Float4(instruction.src[0].value[0]);
+							c.y = Float4(instruction.src[0].value[1]);
+							c.z = Float4(instruction.src[0].value[2]);
+							c.w = Float4(instruction.src[0].value[3]);
+
+							break;
+						}
+					}
+				}
+			}
+		}
+		else if(!src.rel.dynamic || src.rel.type == Shader::PARAMETER_LOOP)
+		{
+			// A single address offset shared by all lanes.
+			Int a = relativeAddress(src.rel, src.bufferIndex);
+
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i, a));
+
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+		}
+		else
+		{
+			// Dynamic indexing: each lane has its own address, taken from one component
+			// (selected by the relative operand's swizzle) of the address register.
+			int component = src.rel.swizzle & 0x03;
+			Float4 a;
+
+			switch(src.rel.type)
+			{
+			case Shader::PARAMETER_TEMP:     a = r[src.rel.index][component]; break;
+			case Shader::PARAMETER_INPUT:    a = v[src.rel.index][component]; break;
+			case Shader::PARAMETER_OUTPUT:   a = oC[src.rel.index][component]; break;
+			case Shader::PARAMETER_CONST:    a = *Pointer<Float>(uniformAddress(src.bufferIndex, src.rel.index) + component * sizeof(float)); break;
+			case Shader::PARAMETER_MISCTYPE:
+				switch(src.rel.index)
+				{
+				case Shader::VPosIndex:  a = vPos.x;  break;
+				case Shader::VFaceIndex: a = vFace.x; break;
+				default: ASSERT(false);
+				}
+				break;
+			default: ASSERT(false);
+			}
+
+			// The address register's bits are reinterpreted as integers, not converted from float.
+			Int4 index = Int4(i) + As<Int4>(a) * Int4(src.rel.scale);
+
+			// NOTE(review): this pixel-shader path clamps against VERTEX_UNIFORM_VECTORS - confirm
+			// that the pixel constant storage has at least VERTEX_UNIFORM_VECTORS + 1 entries.
+			index = Min(As<UInt4>(index), UInt4(VERTEX_UNIFORM_VECTORS));   // Clamp to constant register range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}
+
+			Int index0 = Extract(index, 0);
+			Int index1 = Extract(index, 1);
+			Int index2 = Extract(index, 2);
+			Int index3 = Extract(index, 3);
+
+			// Load one float4 per lane; after the loads c.x holds lane 0's register, c.y lane 1's, etc.
+			c.x = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index0), 16);
+			c.y = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index1), 16);
+			c.z = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index2), 16);
+			c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index3), 16);
+
+			// Transpose so that c.x holds the x components of all four lanes, and so on.
+			transpose4x4(c.x, c.y, c.z, c.w);
+		}
+
+		return c;
+	}
+
+	// Computes the scalar address offset of a non-dynamic relative operand. For register sources
+	// the offset is read from the first lane of the x component and multiplied by rel.scale.
+	// For the loop register it is the current aL value, which is not scaled.
+	Int PixelProgram::relativeAddress(const Shader::Relative &rel, int bufferIndex)
+	{
+		ASSERT(!rel.dynamic);
+
+		switch(rel.type)
+		{
+		case Shader::PARAMETER_TEMP:
+			return As<Int>(Extract(r[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_INPUT:
+			return As<Int>(Extract(v[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_OUTPUT:
+			return As<Int>(Extract(oC[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_CONST:
+			return *Pointer<Int>(uniformAddress(bufferIndex, rel.index)) * rel.scale;
+		case Shader::PARAMETER_LOOP:
+			return aL[loopDepth];
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return 0;
+	}
+
+	// Computes the per-lane address offsets of a dynamic relative operand. The low two bits of
+	// rel.swizzle pick the component of the address register; its bits are reinterpreted as
+	// integers and multiplied by rel.scale.
+	Int4 PixelProgram::dynamicAddress(const Shader::Relative &rel)
+	{
+		const int selected = rel.swizzle & 0x03;
+		Float4 address;
+
+		switch(rel.type)
+		{
+		case Shader::PARAMETER_TEMP:
+			address = r[rel.index][selected];
+			break;
+		case Shader::PARAMETER_INPUT:
+			address = v[rel.index][selected];
+			break;
+		case Shader::PARAMETER_OUTPUT:
+			address = oC[rel.index][selected];
+			break;
+		case Shader::PARAMETER_MISCTYPE:
+			// Only the x component of the position and face registers can be used.
+			if(rel.index == Shader::VPosIndex)
+			{
+				address = vPos.x;
+			}
+			else if(rel.index == Shader::VFaceIndex)
+			{
+				address = vFace.x;
+			}
+			else ASSERT(false);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		return As<Int4>(address) * Int4(rel.scale);
+	}
+
+	// Approximates x^(1.0/2.2) as 1.14 * sqrt(x) - 0.14 * x, clamped to [0, 1].
+	// The square root is obtained as the reciprocal of the reciprocal square root,
+	// using the partial-precision (_pp) variants of both operations.
+	Float4 PixelProgram::linearToSRGB(const Float4 &x)   // Approximates x^(1.0/2.2)
+	{
+		Float4 root = Rcp_pp(RcpSqrt_pp(x));
+		Float4 approximation = root * Float4(1.14f) - x * Float4(0.14f);
+		Float4 nonNegative = Max(approximation, Float4(0.0f));
+
+		return Min(nonNegative, Float4(1.0f));
+	}
+
+	// Multiplies the 3-component vector src0 by two matrix rows read from consecutive registers
+	// starting at src1. Only dst.x and dst.y are written.
+	void PixelProgram::M3X2(Vector4f &dst, Vector4f &src0, const Src &src1)
+	{
+		Vector4f row[2];
+
+		// All rows are fetched before any result component is written.
+		for(int i = 0; i < 2; i++)
+		{
+			row[i] = fetchRegister(src1, i);
+		}
+
+		for(int i = 0; i < 2; i++)
+		{
+			dst[i] = dot3(src0, row[i]);
+		}
+	}
+
+	// Multiplies the 3-component vector src0 by three matrix rows read from consecutive registers
+	// starting at src1. Only dst.x, dst.y and dst.z are written.
+	void PixelProgram::M3X3(Vector4f &dst, Vector4f &src0, const Src &src1)
+	{
+		Vector4f row[3];
+
+		// All rows are fetched before any result component is written.
+		for(int i = 0; i < 3; i++)
+		{
+			row[i] = fetchRegister(src1, i);
+		}
+
+		for(int i = 0; i < 3; i++)
+		{
+			dst[i] = dot3(src0, row[i]);
+		}
+	}
+
+	// Multiplies the 3-component vector src0 by four matrix rows read from consecutive registers
+	// starting at src1. All four components of dst are written.
+	void PixelProgram::M3X4(Vector4f &dst, Vector4f &src0, const Src &src1)
+	{
+		Vector4f row[4];
+
+		// All rows are fetched before any result component is written.
+		for(int i = 0; i < 4; i++)
+		{
+			row[i] = fetchRegister(src1, i);
+		}
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = dot3(src0, row[i]);
+		}
+	}
+
+	// Multiplies the 4-component vector src0 by three matrix rows read from consecutive registers
+	// starting at src1. Only dst.x, dst.y and dst.z are written.
+	void PixelProgram::M4X3(Vector4f &dst, Vector4f &src0, const Src &src1)
+	{
+		Vector4f row[3];
+
+		// All rows are fetched before any result component is written.
+		for(int i = 0; i < 3; i++)
+		{
+			row[i] = fetchRegister(src1, i);
+		}
+
+		for(int i = 0; i < 3; i++)
+		{
+			dst[i] = dot4(src0, row[i]);
+		}
+	}
+
+	// Multiplies the 4-component vector src0 by four matrix rows read from consecutive registers
+	// starting at src1. All four components of dst are written.
+	void PixelProgram::M4X4(Vector4f &dst, Vector4f &src0, const Src &src1)
+	{
+		Vector4f row[4];
+
+		// All rows are fetched before any result component is written.
+		for(int i = 0; i < 4; i++)
+		{
+			row[i] = fetchRegister(src1, i);
+		}
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = dot4(src0, row[i]);
+		}
+	}
+
+	// Samples the texture selected by src1 at coordinates src0.
+	//   project - divide x, y and z by w before sampling (always sampled with Implicit).
+	//   bias    - when not projecting, sample with the Bias function instead of Implicit.
+	// src0 is also passed for the bias, derivative and offset operands; presumably the sampler
+	// ignores the ones that the selected function doesn't use - confirm.
+	void PixelProgram::TEX(Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias)
+	{
+		if(!project)
+		{
+			dst = sampleTexture(src1, src0, src0.x, src0, src0, src0, bias ? Bias : Implicit);
+
+			return;
+		}
+
+		// Projective lookup. The w component of the projected coordinates is left unset.
+		Vector4f projected;
+		Float4 rcpW = reciprocal(src0.w);
+		projected.x = src0.x * rcpW;
+		projected.y = src0.y * rcpW;
+		projected.z = src0.z * rcpW;
+
+		dst = sampleTexture(src1, projected, src0.x, src0, src0, src0, Implicit);
+	}
+
+	// Samples with the {Implicit, Offset} function. src0 stands in for the bias and derivative
+	// operands, which are presumably unused for this function - confirm.
+	void PixelProgram::TEXOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset)
+	{
+		const SamplerFunction function = {Implicit, Offset};
+
+		dst = sampleTexture(src1, src0, src0.x, src0, src0, offset, function);
+	}
+
+	// Samples with the {Lod, Offset} function; lod is passed in the bias operand slot. src0 stands
+	// in for the derivative operands, which are presumably unused for this function - confirm.
+	void PixelProgram::TEXLODOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &lod)
+	{
+		const SamplerFunction function = {Lod, Offset};
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, offset, function);
+	}
+
+	// Samples with the Bias function. src0 stands in for the derivative and offset operands,
+	// which are presumably unused for this function - confirm.
+	void PixelProgram::TEXBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &bias)
+	{
+		const SamplerFunction function = Bias;
+
+		dst = sampleTexture(src1, src0, bias, src0, src0, src0, function);
+	}
+
+	// Samples with the {Bias, Offset} function. src0 stands in for the derivative operands,
+	// which are presumably unused for this function - confirm.
+	void PixelProgram::TEXOFFSETBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &bias)
+	{
+		const SamplerFunction function = {Bias, Offset};
+
+		dst = sampleTexture(src1, src0, bias, src0, src0, offset, function);
+	}
+
+	// Samples with the Fetch function; lod is passed in the bias operand slot. src0 stands in for
+	// the derivative and offset operands, which are presumably unused for this function - confirm.
+	void PixelProgram::TEXELFETCH(Vector4f &dst, Vector4f &src0, const Src& src1, Float4 &lod)
+	{
+		const SamplerFunction function = Fetch;
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, src0, function);
+	}
+
+	// Samples with the {Fetch, Offset} function; lod is passed in the bias operand slot. src0 stands
+	// in for the derivative operands, which are presumably unused for this function - confirm.
+	void PixelProgram::TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset, Float4 &lod)
+	{
+		const SamplerFunction function = {Fetch, Offset};
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, offset, function);
+	}
+
+	// Samples with the Grad function using the explicit derivatives dsx and dsy. src0 stands in
+	// for the bias and offset operands, which are presumably unused for this function - confirm.
+	void PixelProgram::TEXGRAD(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy)
+	{
+		const SamplerFunction function = Grad;
+
+		dst = sampleTexture(src1, src0, src0.x, dsx, dsy, src0, function);
+	}
+
+	// Samples with the {Grad, Offset} function using the explicit derivatives dsx and dsy. src0
+	// stands in for the bias operand, which is presumably unused for this function - confirm.
+	void PixelProgram::TEXGRADOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy, Vector4f &offset)
+	{
+		const SamplerFunction function = {Grad, Offset};
+
+		dst = sampleTexture(src1, src0, src0.x, dsx, dsy, offset, function);
+	}
+
+	// Samples with the Lod function; lod is passed in the bias operand slot. src0 stands in for
+	// the derivative and offset operands, which are presumably unused for this function - confirm.
+	void PixelProgram::TEXLOD(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &lod)
+	{
+		const SamplerFunction function = Lod;
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, src0, function);
+	}
+
+	// Queries the size of the texture at slot src1.index of DrawData's mipmap array for the
+	// given level of detail. Only src1.index is used; relative addressing is not considered.
+	void PixelProgram::TEXSIZE(Vector4f &dst, Float4 &lod, const Src &src1)
+	{
+		Pointer<Byte> textureData = data + OFFSET(DrawData, mipmap) + src1.index * sizeof(Texture);
+
+		dst = SamplerCore::textureSize(textureData, lod);
+	}
+
+	// Removes coverage from pixels for which any tested component of src is less than zero.
+	//   mask - bit i set means component i of src (x, y, z, w) takes part in the test.
+	// A pixel keeps its coverage only if every tested component is not less than zero.
+	void PixelProgram::TEXKILL(Int cMask[4], Vector4f &src, unsigned char mask)
+	{
+		Int survivors = -1;
+
+		for(int component = 0; component < 4; component++)
+		{
+			if(mask & (1 << component))
+			{
+				survivors &= SignMask(CmpNLT(src[component], Float4(0.0f)));
+			}
+		}
+
+		// FIXME: Dynamic branching affects TEXKILL?
+		//	if(shader->containsDynamicBranching())
+		//	{
+		//		kill = ~SignMask(enableMask());
+		//	}
+
+		for(unsigned int sample = 0; sample < state.multiSample; sample++)
+		{
+			cMask[sample] &= survivors;
+		}
+
+		// FIXME: Branch to end of shader if all killed?
+	}
+
+	// Removes coverage from the pixels executing this instruction. Without dynamic branching all
+	// pixels are discarded; with it, only the pixels enabled for the instruction are.
+	void PixelProgram::DISCARD(Int cMask[4], const Shader::Instruction *instruction)
+	{
+		// Bits of the pixels that keep their coverage.
+		Int survivors = 0;
+
+		if(shader->containsDynamicBranching())
+		{
+			survivors = ~SignMask(enableMask(instruction));
+		}
+
+		for(unsigned int sample = 0; sample < state.multiSample; sample++)
+		{
+			cMask[sample] &= survivors;
+		}
+
+		// FIXME: Branch to end of shader if all killed?
+	}
+
+	// Per-component difference between lanes (1, 1, 3, 3) and lanes (0, 0, 2, 2) of src.
+	// NOTE(review): presumably the four lanes form a 2x2 pixel quad with lanes 0/1 and 2/3
+	// horizontally adjacent - confirm.
+	void PixelProgram::DFDX(Vector4f &dst, Vector4f &src)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = src[component].yyww - src[component].xxzz;
+		}
+	}
+
+	// Per-component difference between lanes (2, 3, 2, 3) and lanes (0, 1, 0, 1) of src.
+	// NOTE(review): presumably the four lanes form a 2x2 pixel quad with lanes 0/2 and 1/3
+	// vertically adjacent - confirm.
+	void PixelProgram::DFDY(Vector4f &dst, Vector4f &src)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = src[component].zwzw - src[component].xyxy;
+		}
+	}
+
+	// Sum of the absolute values of the same lane differences computed by DFDX and DFDY.
+	void PixelProgram::FWIDTH(Vector4f &dst, Vector4f &src)
+	{
+		// abs(dFdx(src)) + abs(dFdy(src));
+		for(int component = 0; component < 4; component++)
+		{
+			dst[component] = Abs(src[component].yyww - src[component].xxzz) + Abs(src[component].zwzw - src[component].xyxy);
+		}
+	}
+
+	// Unconditional break: clears the break-enable bits of every lane that is
+	// currently enabled at the top of the enable stack.
+	void PixelProgram::BREAK()
+	{
+		enableBreak &= ~enableStack[enableIndex];
+	}
+
+	// Conditional break: compares the x components of src0 and src1 with the given comparison
+	// and breaks the lanes for which it holds.
+	void PixelProgram::BREAKC(Vector4f &src0, Vector4f &src1, Control control)
+	{
+		Int4 lanes;
+
+		switch(control)
+		{
+		case Shader::CONTROL_GT:
+			lanes = CmpNLE(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_EQ:
+			lanes = CmpEQ(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_GE:
+			lanes = CmpNLT(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_LT:
+			lanes = CmpLT(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_NE:
+			lanes = CmpNEQ(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_LE:
+			lanes = CmpLE(src0.x, src1.x);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		BREAK(lanes);
+	}
+
+	// Predicated break: breaks the lanes selected by one component of the predicate register p0
+	// (chosen by the low two swizzle bits), inverted when the operand has the NOT modifier.
+	void PixelProgram::BREAKP(const Src &predicateRegister)   // FIXME: Factor out parts common with BREAKC
+	{
+		const int component = predicateRegister.swizzle & 0x3;
+		Int4 lanes = As<Int4>(p0[component]);
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			lanes = ~lanes;
+		}
+
+		BREAK(lanes);
+	}
+
+	// Breaks the lanes selected by condition. The condition is first restricted, in place, to the
+	// lanes enabled at the top of the enable stack, so the caller's variable is modified too.
+	void PixelProgram::BREAK(Int4 &condition)
+	{
+		condition = condition & enableStack[enableIndex];
+
+		enableBreak &= ~condition;
+	}
+
+	// Clears the continue-enable bits of every lane that is currently enabled
+	// at the top of the enable stack.
+	void PixelProgram::CONTINUE()
+	{
+		enableContinue &= ~enableStack[enableIndex];
+	}
+
+	// Sets the whileTest flag. While it is set, enableMask() doesn't apply the break, continue
+	// and leave masks. ENDWHILE() clears the flag again.
+	void PixelProgram::TEST()
+	{
+		whileTest = true;
+	}
+
+	// Emits an unconditional call to the subroutine at labelIndex.
+	//   labelIndex    - index of the called label; its basic block is created on first use.
+	//   callSiteIndex - index of this call among the label's return blocks.
+	// Code emission continues in the call site's return block.
+	void PixelProgram::CALL(int labelIndex, int callSiteIndex)
+	{
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		// With several call sites, RET() needs to know where to return to: push this site's index.
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		Int4 restoreLeave = enableLeave;
+
+		Nucleus::createBr(labelBlock[labelIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		// Emitted in the return block: put back the leave mask saved before the call.
+		enableLeave = restoreLeave;
+	}
+
+	// Conditional call: dispatches on the type of the condition operand. Only constant boolean
+	// and predicate register operands are supported.
+	void PixelProgram::CALLNZ(int labelIndex, int callSiteIndex, const Src &src)
+	{
+		switch(src.type)
+		{
+		case Shader::PARAMETER_CONSTBOOL:
+			CALLNZb(labelIndex, callSiteIndex, src);
+			break;
+		case Shader::PARAMETER_PREDICATE:
+			CALLNZp(labelIndex, callSiteIndex, src);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Emits a call to the subroutine at labelIndex which is taken when the constant boolean
+	// register (ps.b[index] in DrawData) is non-zero, or zero if the operand has the NOT modifier.
+	// The condition is a single Bool, so all lanes take the same path.
+	// Code emission continues in the call site's return block.
+	void PixelProgram::CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister)
+	{
+		Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData, ps.b[boolRegister.index])) != Byte(0));   // FIXME
+
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = !condition;
+		}
+
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		// With several call sites, RET() needs to know where to return to: push this site's index.
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		Int4 restoreLeave = enableLeave;
+
+		// When the condition is false, execution goes straight to the return block.
+		branch(condition, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		// Emitted in the return block: put back the leave mask saved before the call.
+		enableLeave = restoreLeave;
+	}
+
+	// Emits a call to the subroutine at labelIndex, predicated per lane by one component of the
+	// predicate register p0 (inverted if the operand has the NOT modifier). The subroutine is
+	// entered when at least one enabled lane has its predicate set, and runs with the predicate
+	// pushed onto the enable stack. Code emission continues in the call site's return block.
+	void PixelProgram::CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister)
+	{
+		Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]);
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = ~condition;
+		}
+
+		// Only lanes which are currently enabled can take part in the call.
+		condition &= enableStack[enableIndex];
+
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		// With several call sites, RET() needs to know where to return to: push this site's index.
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		// Push the predicate as the enable mask seen by the subroutine.
+		enableIndex++;
+		enableStack[enableIndex] = condition;
+		Int4 restoreLeave = enableLeave;
+
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		// Emitted in the return block: pop the enable mask and put back the saved leave mask.
+		enableIndex--;
+		enableLeave = restoreLeave;
+	}
+
+	// Handles the 'else' of the innermost open if-construct. Ends the 'true' part, starts the
+	// 'false' part, and records a newly created end block in ifFalseBlock[] so that ENDIF()
+	// finishes the construct there.
+	void PixelProgram::ELSE()
+	{
+		// Temporarily step back to the entry of the if-construct being processed.
+		ifDepth--;
+
+		BasicBlock *falseBlock = ifFalseBlock[ifDepth];
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		if(isConditionalIf[ifDepth])
+		{
+			// Lanes enabled in the enclosing scope which did not take the 'true' part.
+			Int4 condition = ~enableStack[enableIndex] & enableStack[enableIndex - 1];
+			Bool notAllFalse = SignMask(condition) != 0;
+
+			// Skip the 'false' part entirely when no lane needs it.
+			branch(notAllFalse, falseBlock, endBlock);
+
+			enableStack[enableIndex] = ~enableStack[enableIndex] & enableStack[enableIndex - 1];
+		}
+		else
+		{
+			// The if was not per-lane conditional (see IFb()): the 'true' part jumps over the 'false' part.
+			Nucleus::createBr(endBlock);
+			Nucleus::setInsertBlock(falseBlock);
+		}
+
+		ifFalseBlock[ifDepth] = endBlock;
+
+		ifDepth++;
+	}
+
+	// Closes the innermost open if-construct: branches to the block recorded in ifFalseBlock[]
+	// and continues code emission there. For per-lane conditional ifs, the enable mask pushed by
+	// IF(Int4&) is popped.
+	void PixelProgram::ENDIF()
+	{
+		ifDepth--;
+
+		BasicBlock *endBlock = ifFalseBlock[ifDepth];
+
+		Nucleus::createBr(endBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		if(isConditionalIf[ifDepth])
+		{
+			enableIndex--;
+		}
+	}
+
+	// Closes the loop opened by LOOP(): advances the loop register aL by the loop's increment,
+	// branches back to the loop's test block, and continues code emission in its end block.
+	// The break mask is reset to all lanes enabled.
+	void PixelProgram::ENDLOOP()
+	{
+		loopRepDepth--;
+
+		aL[loopDepth] = aL[loopDepth] + increment[loopDepth];   // FIXME: +=
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		loopDepth--;
+		enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+	}
+
+	// Closes the loop opened by REP(): branches back to the loop's test block and continues code
+	// emission in its end block. The break mask is reset to all lanes enabled.
+	void PixelProgram::ENDREP()
+	{
+		loopRepDepth--;
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		loopDepth--;
+		enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+	}
+
+	// Closes the loop opened by WHILE(): branches back to the loop's test block and continues code
+	// emission in its end block. Pops the enable mask pushed by WHILE() and clears the whileTest
+	// flag set by TEST().
+	void PixelProgram::ENDWHILE()
+	{
+		loopRepDepth--;
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		enableIndex--;
+		whileTest = false;
+	}
+
+	// Closes the construct opened by SWITCH(): branches to its end block and continues code
+	// emission there.
+	void PixelProgram::ENDSWITCH()
+	{
+		loopRepDepth--;
+
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(endBlock);
+		Nucleus::setInsertBlock(endBlock);
+	}
+
+	// Opens an if-construct. Constant boolean and predicate operands have dedicated handlers;
+	// for any other operand, the bits of its x component are used as the per-lane condition.
+	void PixelProgram::IF(const Src &src)
+	{
+		switch(src.type)
+		{
+		case Shader::PARAMETER_CONSTBOOL:
+			IFb(src);
+			break;
+		case Shader::PARAMETER_PREDICATE:
+			IFp(src);
+			break;
+		default:
+			{
+				Int4 lanes = As<Int4>(fetchRegister(src).x);
+				IF(lanes);
+			}
+			break;
+		}
+	}
+
+	// Opens an if-construct on a constant boolean register (ps.b[index] in DrawData), inverted if
+	// the operand has the NOT modifier. The condition is a single Bool, so all lanes take the same
+	// path and no enable mask is pushed; the construct is recorded as not per-lane conditional.
+	void PixelProgram::IFb(const Src &boolRegister)
+	{
+		ASSERT(ifDepth < 24 + 4);
+
+		Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData, ps.b[boolRegister.index])) != Byte(0));   // FIXME
+
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = !condition;
+		}
+
+		BasicBlock *trueBlock = Nucleus::createBasicBlock();
+		BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+		branch(condition, trueBlock, falseBlock);
+
+		// Remembered for ELSE() and ENDIF().
+		isConditionalIf[ifDepth] = false;
+		ifFalseBlock[ifDepth] = falseBlock;
+
+		ifDepth++;
+	}
+
+	// Opens an if-construct whose per-lane condition is one component of the predicate register
+	// p0 (chosen by the low two swizzle bits), inverted when the operand has the NOT modifier.
+	void PixelProgram::IFp(const Src &predicateRegister)
+	{
+		const int component = predicateRegister.swizzle & 0x3;
+		Int4 lanes = As<Int4>(p0[component]);
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			lanes = ~lanes;
+		}
+
+		IF(lanes);
+	}
+
+	// Opens an if-construct whose per-lane condition is the result of comparing the x components
+	// of src0 and src1 with the given comparison.
+	void PixelProgram::IFC(Vector4f &src0, Vector4f &src1, Control control)
+	{
+		Int4 lanes;
+
+		switch(control)
+		{
+		case Shader::CONTROL_GT:
+			lanes = CmpNLE(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_EQ:
+			lanes = CmpEQ(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_GE:
+			lanes = CmpNLT(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_LT:
+			lanes = CmpLT(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_NE:
+			lanes = CmpNEQ(src0.x, src1.x);
+			break;
+		case Shader::CONTROL_LE:
+			lanes = CmpLE(src0.x, src1.x);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		IF(lanes);
+	}
+
+	// Opens a per-lane conditional if-construct. The condition is restricted, in place, to the
+	// currently enabled lanes and pushed onto the enable stack. The 'true' part is skipped
+	// entirely when no lane has its condition set.
+	// NOTE(review): unlike IFb(), ifDepth is not range-checked here before it is used as an
+	// index - confirm whether the same bound applies.
+	void PixelProgram::IF(Int4 &condition)
+	{
+		condition &= enableStack[enableIndex];
+
+		enableIndex++;
+		enableStack[enableIndex] = condition;
+
+		BasicBlock *trueBlock = Nucleus::createBasicBlock();
+		BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+		Bool notAllFalse = SignMask(condition) != 0;
+
+		branch(notAllFalse, trueBlock, falseBlock);
+
+		// Remembered for ELSE() and ENDIF().
+		isConditionalIf[ifDepth] = true;
+		ifFalseBlock[ifDepth] = falseBlock;
+
+		ifDepth++;
+	}
+
+	// Starts the body of the subroutine at labelIndex: code emission continues in the label's
+	// basic block, which is created here unless a call already did so. The label is recorded as
+	// the current one, which RET() uses to find the return destinations.
+	void PixelProgram::LABEL(int labelIndex)
+	{
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		Nucleus::setInsertBlock(labelBlock[labelIndex]);
+		currentLabel = labelIndex;
+	}
+
+	// Opens a counted loop controlled by an integer constant register (ps.i[index] in DrawData):
+	// element 0 is the iteration count, element 1 the initial value of the loop register aL, and
+	// element 2 the increment added to aL by ENDLOOP(). The loop body is entered while the
+	// remaining iteration count is greater than zero.
+	void PixelProgram::LOOP(const Src &integerRegister)
+	{
+		loopDepth++;
+
+		iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][0]));
+		aL[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][1]));
+		increment[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][2]));
+
+		//	If(increment[loopDepth] == 0)
+		//	{
+		//		increment[loopDepth] = 1;
+		//	}
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		// Remembered for the matching ENDLOOP().
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+
+		branch(iteration[loopDepth] > 0, loopBlock, endBlock);
+		Nucleus::setInsertBlock(loopBlock);
+
+		// Emitted at the start of the loop body: one fewer iteration remains.
+		iteration[loopDepth] = iteration[loopDepth] - 1;   // FIXME: --
+
+		loopRepDepth++;
+	}
+
+	// Opens a counted loop whose iteration count is element 0 of an integer constant register
+	// (ps.i[index] in DrawData). The loop register aL takes the value of the enclosing loop level.
+	// The loop body is entered while the remaining iteration count is greater than zero.
+	void PixelProgram::REP(const Src &integerRegister)
+	{
+		loopDepth++;
+
+		iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData, ps.i[integerRegister.index][0]));
+		aL[loopDepth] = aL[loopDepth - 1];
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		// Remembered for the matching ENDREP().
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+
+		branch(iteration[loopDepth] > 0, loopBlock, endBlock);
+		Nucleus::setInsertBlock(loopBlock);
+
+		// Emitted at the start of the loop body: one fewer iteration remains.
+		iteration[loopDepth] = iteration[loopDepth] - 1;   // FIXME: --
+
+		loopRepDepth++;
+	}
+
+	// Opens a while loop. In the loop's test block, the bits of the x component of the given
+	// register form the per-lane condition. It is restricted to the lanes enabled in the enclosing
+	// scope (and by the leave and break masks when the shader uses those instructions) and becomes
+	// the enable mask of the loop body. The body is entered while any lane's condition is set.
+	void PixelProgram::WHILE(const Src &temporaryRegister)
+	{
+		// Reserve an enable stack entry for the loop body's mask; ENDWHILE() pops it.
+		enableIndex++;
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		// Remembered for the matching ENDWHILE().
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// Snapshot the masks as they are on entry to the loop.
+		Int4 restoreBreak = enableBreak;
+		Int4 restoreContinue = enableContinue;
+
+		// TODO: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+		// Emitted in the test block, so the continue mask is reset before every iteration's test.
+		enableContinue = restoreContinue;
+
+		const Vector4f &src = fetchRegister(temporaryRegister);
+		Int4 condition = As<Int4>(src.x);
+		condition &= enableStack[enableIndex - 1];
+		if(shader->containsLeaveInstruction()) condition &= enableLeave;
+		if(shader->containsBreakInstruction()) condition &= enableBreak;
+		enableStack[enableIndex] = condition;
+
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, loopBlock, endBlock);
+
+		// Emitted in the end block, so the break mask is restored once the loop has exited.
+		Nucleus::setInsertBlock(endBlock);
+		enableBreak = restoreBreak;
+
+		Nucleus::setInsertBlock(loopBlock);
+
+		loopRepDepth++;
+	}
+
+	// Opens a switch construct. It shares the loop/rep block stack, with no test block and a newly
+	// created end block. Code restoring the break mask to its value on entry is emitted into the
+	// end block, after which code emission resumes in the current block.
+	void PixelProgram::SWITCH()
+	{
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		// Remembered for the matching ENDSWITCH().
+		loopRepTestBlock[loopRepDepth] = nullptr;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		Int4 restoreBreak = enableBreak;
+
+		BasicBlock *currentBlock = Nucleus::getInsertBlock();
+
+		// Emitted in the end block, so the break mask is restored once the switch has ended.
+		Nucleus::setInsertBlock(endBlock);
+		enableBreak = restoreBreak;
+
+		Nucleus::setInsertBlock(currentBlock);
+
+		loopRepDepth++;
+	}
+
+	// Emits a return. Outside of any label (the main function), a return block is created and
+	// branched to. Inside a subroutine, control is transferred back to the call site: through a
+	// switch on the call site index popped from the call stack when the label has several call
+	// sites, or directly when it has exactly one. Code following the return within the subroutine
+	// is emitted into a block marked unreachable.
+	void PixelProgram::RET()
+	{
+		if(currentLabel == -1)
+		{
+			returnBlock = Nucleus::createBasicBlock();
+			Nucleus::createBr(returnBlock);
+		}
+		else
+		{
+			BasicBlock *unreachableBlock = Nucleus::createBasicBlock();
+
+			if(callRetBlock[currentLabel].size() > 1)   // Pop the return destination from the call stack
+			{
+				// FIXME: Encapsulate
+				UInt index = callStack[--stackIndex];
+
+				// One switch case per call site; any other index goes to the unreachable block.
+				Value *value = index.loadValue();
+				SwitchCases *switchCases = Nucleus::createSwitch(value, unreachableBlock, (int)callRetBlock[currentLabel].size());
+
+				for(unsigned int i = 0; i < callRetBlock[currentLabel].size(); i++)
+				{
+					Nucleus::addSwitchCase(switchCases, i, callRetBlock[currentLabel][i]);
+				}
+			}
+			else if(callRetBlock[currentLabel].size() == 1)   // Jump directly to the unique return destination
+			{
+				Nucleus::createBr(callRetBlock[currentLabel][0]);
+			}
+			else   // Function isn't called
+			{
+				Nucleus::createBr(unreachableBlock);
+			}
+
+			Nucleus::setInsertBlock(unreachableBlock);
+			Nucleus::createUnreachable();
+		}
+	}
+
+	// Clears the leave-enable bits of every lane that is currently enabled
+	// at the top of the enable stack.
+	void PixelProgram::LEAVE()
+	{
+		enableLeave &= ~enableStack[enableIndex];
+
+		// FIXME: Return from function if all instances left
+		// FIXME: Use enableLeave in other control-flow constructs
+	}
+}
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
new file mode 100644
index 0000000..240938d
--- /dev/null
+++ b/src/Pipeline/PixelProgram.hpp
@@ -0,0 +1,170 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelProgram_hpp
+#define sw_PixelProgram_hpp
+
+#include "PixelRoutine.hpp"
+#include "SamplerCore.hpp"
+
+namespace sw
+{
+	class PixelProgram : public PixelRoutine   // Pixel routine whose shading is generated from a PixelShader
+	{
+	public:
+		PixelProgram(const PixelProcessor::State &state, const PixelShader *shader) :   // 'shader' must be non-null: it is dereferenced below
+			PixelRoutine(state, shader), r(shader->indirectAddressableTemporaries),
+			loopDepth(-1), ifDepth(0), loopRepDepth(0), currentLabel(-1), whileTest(false), returnBlock(nullptr)
+		{
+			for(BasicBlock *&block : labelBlock)
+			{
+				block = nullptr;
+			}
+			// Nesting level 0 starts with all four lanes enabled.
+			enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+			// The break and continue masks are only set up for shaders that contain those instructions.
+			if(shader->containsBreakInstruction())
+			{
+				enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+			}
+
+			if(shader->containsContinueInstruction())
+			{
+				enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+			}
+		}
+
+		virtual ~PixelProgram() {}
+
+	protected:
+		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w);
+		virtual void applyShader(Int cMask[4]);
+		virtual Bool alphaTest(Int cMask[4]);
+		virtual void rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
+
+	private:
+		// Temporary registers
+		RegisterArray<NUM_TEMPORARY_REGISTERS> r;
+
+		// Color outputs
+		Vector4f c[RENDERTARGETS];
+		RegisterArray<RENDERTARGETS, true> oC;
+
+		// Shader variables
+		Vector4f vPos;
+		Vector4f vFace;
+
+		// DX9 specific variables
+		Vector4f p0;
+		Array<Int, 4> aL;
+		Array<Int, 4> increment;
+		Array<Int, 4> iteration;
+
+		Int loopDepth;    // FIXME: Add support for switch
+		Int stackIndex;   // FIXME: Inc/decrement callStack
+		Array<UInt, 16> callStack;   // RET() pops the index of the return destination from here
+
+		// Per pixel based on conditions reached
+		Int enableIndex;
+		Array<Int4, 1 + 24> enableStack;   // Entry 0 is set to all lanes enabled by the constructor
+		Int4 enableBreak;      // Only initialized when the shader contains a break instruction
+		Int4 enableContinue;   // Only initialized when the shader contains a continue instruction
+		Int4 enableLeave;      // LEAVE() clears the currently enabled lanes from this mask
+
+		Vector4f sampleTexture(const Src &sampler, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
+		Vector4f sampleTexture(int samplerIndex, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
+
+		// Raster operations
+		void clampColor(Vector4f oC[RENDERTARGETS]);
+
+		Int4 enableMask(const Shader::Instruction *instruction);
+
+		Vector4f fetchRegister(const Src &src, unsigned int offset = 0);
+		Vector4f readConstant(const Src &src, unsigned int offset = 0);
+		RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index);
+		RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index, Int& offset);
+		Int relativeAddress(const Shader::Relative &rel, int bufferIndex = -1);
+		Int4 dynamicAddress(const Shader::Relative &rel);
+
+		Float4 linearToSRGB(const Float4 &x);
+
+		// Instructions
+		typedef Shader::Control Control;
+
+		void M3X2(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M3X3(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M3X4(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M4X3(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void M4X4(Vector4f &dst, Vector4f &src0, const Src &src1);
+		void TEX(Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias);
+		void TEXLOD(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &lod);
+		void TEXBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Float4 &bias);
+		void TEXSIZE(Vector4f &dst, Float4 &lod, const Src &src1);
+		void TEXKILL(Int cMask[4], Vector4f &src, unsigned char mask);
+		void TEXOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset);
+		void TEXOFFSETBIAS(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &bias);
+		void TEXLODOFFSET(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &offset, Float4 &lod);
+		void TEXELFETCH(Vector4f &dst, Vector4f &src, const Src &, Float4 &lod);
+		void TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src, const Src &, Vector4f &offset, Float4 &lod);
+		void TEXGRAD(Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &dsx, Vector4f &dsy);
+		void TEXGRADOFFSET(Vector4f &dst, Vector4f &src, const Src &, Vector4f &dsx, Vector4f &dsy, Vector4f &offset);
+		void DISCARD(Int cMask[4], const Shader::Instruction *instruction);
+		void DFDX(Vector4f &dst, Vector4f &src);
+		void DFDY(Vector4f &dst, Vector4f &src);
+		void FWIDTH(Vector4f &dst, Vector4f &src);
+		void BREAK();
+		void BREAKC(Vector4f &src0, Vector4f &src1, Control);
+		void BREAKP(const Src &predicateRegister);
+		void BREAK(Int4 &condition);
+		void CONTINUE();
+		void TEST();
+		void CALL(int labelIndex, int callSiteIndex);
+		void CALLNZ(int labelIndex, int callSiteIndex, const Src &src);
+		void CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister);
+		void CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister);
+		void ELSE();
+		void ENDIF();
+		void ENDLOOP();
+		void ENDREP();
+		void ENDWHILE();
+		void ENDSWITCH();
+		void IF(const Src &src);
+		void IFb(const Src &boolRegister);
+		void IFp(const Src &predicateRegister);
+		void IFC(Vector4f &src0, Vector4f &src1, Control);
+		void IF(Int4 &condition);
+		void LABEL(int labelIndex);
+		void LOOP(const Src &integerRegister);
+		void REP(const Src &integerRegister);
+		void WHILE(const Src &temporaryRegister);
+		void SWITCH();
+		void RET();
+		void LEAVE();
+		// Plain C++ bookkeeping used while the routine is being generated (not Reactor variables).
+		int ifDepth;
+		int loopRepDepth;   // Index into loopRepTestBlock/loopRepEndBlock; SWITCH() increments it
+		int currentLabel;   // -1 until changed elsewhere; RET() indexes callRetBlock with it otherwise
+		bool whileTest;
+
+		BasicBlock *ifFalseBlock[24 + 24];
+		BasicBlock *loopRepTestBlock[4];   // SWITCH() stores nullptr here
+		BasicBlock *loopRepEndBlock[4];    // SWITCH() stores its end block here
+		BasicBlock *labelBlock[2048];      // Cleared to nullptr by the constructor
+		std::vector<BasicBlock*> callRetBlock[2048];   // Per label: the return destinations RET() branches to
+		BasicBlock *returnBlock;           // nullptr until RET() creates it while currentLabel is -1
+		bool isConditionalIf[24 + 24];
+	};
+}
+
+#endif
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
new file mode 100644
index 0000000..146e42d
--- /dev/null
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -0,0 +1,2724 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelRoutine.hpp"
+
+#include "SamplerCore.hpp"
+#include "Constants.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Renderer/QuadRasterizer.hpp"
+#include "Renderer/Surface.hpp"
+#include "Renderer/Primitive.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	extern bool complementaryDepthBuffer;   // When set, shader-written depth is used as 1 - oDepth and the depth comparisons are mirrored
+	extern bool postBlendSRGB;
+	extern bool exactColorRounding;
+	extern bool forceClearRegisters;   // Makes the PixelRoutine constructor zero the input registers for every shader model
+
+	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
+		: QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
+	{
+		// No shader, a shader model below 0x0200, or the global override: start from zeroed inputs.
+		const bool zeroInputs = !shader || shader->getShaderModel() < 0x0200 || forceClearRegisters;
+
+		for(int reg = 0; zeroInputs && reg < MAX_FRAGMENT_INPUTS; reg++)
+		{
+			v[reg].x = Float4(0.0f);
+			v[reg].y = Float4(0.0f);
+			v[reg].z = Float4(0.0f);
+			v[reg].w = Float4(0.0f);
+		}
+	}
+
+	PixelRoutine::~PixelRoutine()   // Intentionally empty: no explicit cleanup is performed here
+	{
+	}
+
+	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
+	{
+		#if PERF_PROFILE
+			Long pipeTime = Ticks();
+		#endif
+		// Emits the per-quad pipeline: stencil test, depth test, interpolation, shading, alpha test, then depth/color/stencil writes.
+		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
+		// Depth is only tested before shading when neither depth override nor alpha testing is active.
+		Int zMask[4];   // Depth mask
+		Int sMask[4];   // Stencil mask
+		// Both masks start out as the incoming per-sample mask cMask.
+		for(unsigned int q = 0; q < state.multiSample; q++)
+		{
+			zMask[q] = cMask[q];
+			sMask[q] = cMask[q];
+		}
+
+		for(unsigned int q = 0; q < state.multiSample; q++)
+		{
+			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
+		}
+
+		Float4 f;
+		Float4 rhwCentroid;
+		// Per-lane x coordinate: the quad's x plus the primitive's xQuad offsets.
+		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
+
+		if(interpolateZ())
+		{
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				Float4 x = xxxx;   // Note: shadows the Int &x parameter inside this loop
+
+				if(state.multiSample > 1)
+				{
+					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
+				}
+
+				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
+			}
+		}
+
+		Bool depthPass = false;
+
+		if(earlyDepthTest)
+		{
+			for(unsigned int q = 0; q < state.multiSample; q++)
+			{
+				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
+			}
+		}
+		// With early depth testing, everything up to the stencil write is skipped when no sample passed.
+		If(depthPass || Bool(!earlyDepthTest))
+		{
+			#if PERF_PROFILE
+				Long interpTime = Ticks();
+			#endif
+
+			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
+
+			// Centroid locations
+			Float4 XXXX = Float4(0.0f);
+			Float4 YYYY = Float4(0.0f);
+
+			if(state.centroid)
+			{
+				Float4 WWWW(1.0e-9f);   // Non-zero start value, so the reciprocal below never sees an exact zero sum of zero weights
+
+				for(unsigned int q = 0; q < state.multiSample; q++)
+				{
+					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
+					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
+					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
+				}
+
+				WWWW = Rcp_pp(WWWW);
+				XXXX *= WWWW;
+				YYYY *= WWWW;
+
+				XXXX += xxxx;
+				YYYY += yyyy;
+			}
+
+			if(interpolateW())
+			{
+				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
+				rhw = reciprocal(w, false, false, true);
+
+				if(state.centroid)
+				{
+					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));   // rhwCentroid is not read here: perspective is false
+				}
+			}
+
+			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					if(state.interpolant[interpolant].component & (1 << component))
+					{
+						if(!state.interpolant[interpolant].centroid)
+						{
+							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
+						}
+						else
+						{
+							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
+						}
+					}
+				}
+				// Optional division of the leading components by y, z or w, selected by 'project'.
+				Float4 rcp;
+
+				switch(state.interpolant[interpolant].project)
+				{
+				case 0:
+					break;
+				case 1:
+					rcp = reciprocal(v[interpolant].y);
+					v[interpolant].x = v[interpolant].x * rcp;
+					break;
+				case 2:
+					rcp = reciprocal(v[interpolant].z);
+					v[interpolant].x = v[interpolant].x * rcp;
+					v[interpolant].y = v[interpolant].y * rcp;
+					break;
+				case 3:
+					rcp = reciprocal(v[interpolant].w);
+					v[interpolant].x = v[interpolant].x * rcp;
+					v[interpolant].y = v[interpolant].y * rcp;
+					v[interpolant].z = v[interpolant].z * rcp;
+					break;
+				}
+			}
+
+			if(state.fog.component)
+			{
+				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
+			}
+
+			setBuiltins(x, y, z, w);
+
+			#if PERF_PROFILE
+				cycles[PERF_INTERP] += Ticks() - interpTime;
+			#endif
+
+			Bool alphaPass = true;
+
+			if(colorUsed())
+			{
+				#if PERF_PROFILE
+					Long shaderTime = Ticks();
+				#endif
+
+				applyShader(cMask);
+
+				#if PERF_PROFILE
+					cycles[PERF_SHADER] += Ticks() - shaderTime;
+				#endif
+
+				alphaPass = alphaTest(cMask);
+				// Lanes removed from cMask by a kill or the alpha test must not update depth or stencil either.
+				if((shader && shader->containsKill()) || state.alphaTestActive())
+				{
+					for(unsigned int q = 0; q < state.multiSample; q++)
+					{
+						zMask[q] &= cMask[q];
+						sMask[q] &= cMask[q];
+					}
+				}
+			}
+
+			If(alphaPass)
+			{
+				if(!earlyDepthTest)
+				{
+					for(unsigned int q = 0; q < state.multiSample; q++)
+					{
+						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
+					}
+				}
+
+				#if PERF_PROFILE
+					Long ropTime = Ticks();
+				#endif
+
+				If(depthPass || Bool(earlyDepthTest))
+				{
+					for(unsigned int q = 0; q < state.multiSample; q++)
+					{
+						if(state.multiSampleMask & (1 << q))
+						{
+							writeDepth(zBuffer, q, x, z[q], zMask[q]);
+							// Accumulate the occlusionCount table entry for the lanes that passed both depth and stencil.
+							if(state.occlusionEnabled)
+							{
+								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
+							}
+						}
+					}
+
+					if(colorUsed())
+					{
+						#if PERF_PROFILE
+							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
+						#endif
+
+						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
+					}
+				}
+
+				#if PERF_PROFILE
+					cycles[PERF_ROP] += Ticks() - ropTime;
+				#endif
+			}
+		}
+		// The stencil write is emitted last, outside the depth/alpha conditionals above.
+		for(unsigned int q = 0; q < state.multiSample; q++)
+		{
+			if(state.multiSampleMask & (1 << q))
+			{
+				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
+			}
+		}
+
+		#if PERF_PROFILE
+			cycles[PERF_PIPE] += Ticks() - pipeTime;
+		#endif
+	}
+
+	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
+	{
+		Float4 result = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);   // Constant term of the plane equation
+		if(flat)
+		{
+			return result;   // Flat: neither the gradients nor the perspective factor apply
+		}
+		// Add the x and y terms, A * x + B * y, then optionally scale by rhw.
+		Float4 gradientX = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16);
+		Float4 gradientY = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
+		result += x * gradientX + y * gradientY;
+		if(perspective)
+		{
+			result *= rhw;
+		}
+		return result;
+	}
+
+	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
+	{
+		if(!state.stencilActive)
+		{
+			return;
+		}
+
+		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
+
+		// Address of the stencil values at x, moved to sample slice q when multisampling.
+		Pointer<Byte> address = sBuffer + 2 * x;
+		if(q > 0)
+		{
+			address += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
+		}
+
+		Byte8 cwResult = *Pointer<Byte8>(address);
+		Byte8 ccwResult = cwResult;   // Unmasked copy, only consumed by the two-sided path
+
+		if(!state.noStencilMask)
+		{
+			cwResult &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
+		}
+		stencilTest(cwResult, state.stencilCompareMode, false);
+
+		if(state.twoSidedStencil)
+		{
+			if(!state.noStencilMaskCCW)
+			{
+				ccwResult &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
+			}
+			stencilTest(ccwResult, state.stencilCompareModeCCW, true);
+
+			// Pick the clockwise or counter-clockwise result using the primitive's winding masks.
+			cwResult &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
+			ccwResult &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
+			cwResult |= ccwResult;
+		}
+
+		// Collapse the comparison result into a bit mask and AND it with cMask.
+		sMask = SignMask(cwResult) & cMask;
+	}
+
+	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
+	{
+		Byte8 equal;
+		// Replaces 'value' with the per-byte comparison result against the reference of stencil[CCW] (0 or 1).
+		switch(stencilCompareMode)
+		{
+		case STENCIL_ALWAYS:
+			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+			break;
+		case STENCIL_NEVER:
+			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+			break;
+		case STENCIL_LESS:			// a < b ~ b > a
+			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);   // Bias so the signed compare below orders the bytes as unsigned
+			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));   // presumably the reference is stored with the same bias - TODO confirm
+			break;
+		case STENCIL_EQUAL:
+			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+			break;
+		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
+			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);   // Invert the equality result
+			break;
+		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
+			equal = value;
+			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
+			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+			value |= equal;
+			break;
+		case STENCIL_GREATER:		// a > b
+			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));   // 'equal' is reused as scratch for the reference here
+			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
+			value = equal;
+			break;
+		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
+			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
+			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);   // Invert the "buffer > reference" result
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
+	{
+		if(!state.depthTestActive)
+		{
+			return true;   // zMask is left untouched in this case
+		}
+		// Compares the quad's depth with the depth buffer, stores the resulting mask in zMask and returns whether it is non-zero.
+		Float4 Z = z;
+
+		if(shader && shader->depthOverride())
+		{
+			if(complementaryDepthBuffer)
+			{
+				Z = Float4(1.0f) - oDepth;
+			}
+			else
+			{
+				Z = oDepth;
+			}
+		}
+		// Address of the quad's depth values; 'pitch' is only assigned (and only used) for the non-quad layout.
+		Pointer<Byte> buffer;
+		Int pitch;
+
+		if(!state.quadLayoutDepthBuffer)
+		{
+			buffer = zBuffer + 4 * x;
+			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+		}
+		else
+		{
+			buffer = zBuffer + 8 * x;
+		}
+
+		if(q > 0)
+		{
+			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
+		}
+
+		Float4 zValue;
+		// NOTE(review): the first clause alone is true for every mode but DEPTH_NEVER, so zValue is also loaded for DEPTH_ALWAYS, which never reads it.
+		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
+		{
+			if(!state.quadLayoutDepthBuffer)
+			{
+				// FIXME: Properly optimizes?
+				zValue.xy = *Pointer<Float4>(buffer);
+				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);   // Loading 8 bytes early puts the two floats at buffer + pitch into .zw
+			}
+			else
+			{
+				zValue = *Pointer<Float4>(buffer, 16);
+			}
+		}
+
+		Int4 zTest;
+
+		switch(state.depthCompareMode)
+		{
+		case DEPTH_ALWAYS:
+			// Optimized
+			break;
+		case DEPTH_NEVER:
+			// Optimized
+			break;
+		case DEPTH_EQUAL:
+			zTest = CmpEQ(zValue, Z);
+			break;
+		case DEPTH_NOTEQUAL:
+			zTest = CmpNEQ(zValue, Z);
+			break;
+		case DEPTH_LESS:   // Incoming Z < stored zValue; each ordered comparison is mirrored for a complementary depth buffer
+			if(complementaryDepthBuffer)
+			{
+				zTest = CmpLT(zValue, Z);
+			}
+			else
+			{
+				zTest = CmpNLE(zValue, Z);
+			}
+			break;
+		case DEPTH_GREATEREQUAL:
+			if(complementaryDepthBuffer)
+			{
+				zTest = CmpNLT(zValue, Z);
+			}
+			else
+			{
+				zTest = CmpLE(zValue, Z);
+			}
+			break;
+		case DEPTH_LESSEQUAL:
+			if(complementaryDepthBuffer)
+			{
+				zTest = CmpLE(zValue, Z);
+			}
+			else
+			{
+				zTest = CmpNLT(zValue, Z);
+			}
+			break;
+		case DEPTH_GREATER:
+			if(complementaryDepthBuffer)
+			{
+				zTest = CmpNLE(zValue, Z);
+			}
+			else
+			{
+				zTest = CmpLT(zValue, Z);
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+		// Turn the comparison into a mask; ALWAYS and NEVER are resolved without looking at zTest.
+		switch(state.depthCompareMode)
+		{
+		case DEPTH_ALWAYS:
+			zMask = cMask;
+			break;
+		case DEPTH_NEVER:
+			zMask = 0x0;
+			break;
+		default:
+			zMask = SignMask(zTest) & cMask;
+			break;
+		}
+
+		if(state.stencilActive)
+		{
+			zMask &= sMask;   // Lanes rejected by the stencil test do not pass the depth test either
+		}
+
+		return zMask != 0;
+	}
+
+	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
+	{
+		// Compares 'alpha' with the alpha reference stored in the draw data, according to
+		// state.alphaCompareMode, and stores the resulting mask in aMask.
+		Short4 pass;
+
+		switch(state.alphaCompareMode)
+		{
+		// Constant results need no comparison.
+		case ALPHA_ALWAYS:
+			aMask = 0xF;
+			return;
+		case ALPHA_NEVER:
+			aMask = 0x0;
+			return;
+		// Comparison modes only produce 'pass'; the mask is extracted after the switch.
+		case ALPHA_EQUAL:
+			pass = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+			break;
+		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
+			pass = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
+			break;
+		case ALPHA_LESS:           // a < b ~ b > a
+			pass = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
+			break;
+		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
+			pass = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+			pass |= CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+			break;
+		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
+			pass = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
+			break;
+		case ALPHA_GREATER:        // a > b
+			pass = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
+			break;
+		default:
+			ASSERT(false);
+			return;
+		}
+
+		// Pack the four 16-bit results and extract their sign bits as the mask.
+		aMask = SignMask(PackSigned(pass, Short4(0x0000)));
+	}
+
+	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
+	{
+		// Each of the four samples has its own threshold in the draw data (a2c0..a2c3). A sample's
+		// mask only keeps the lanes whose alpha is not less than that sample's threshold.
+
+		Int4 keep0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
+		cMask[0] &= SignMask(keep0);
+
+		Int4 keep1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
+		cMask[1] &= SignMask(keep1);
+
+		Int4 keep2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
+		cMask[2] &= SignMask(keep2);
+
+		Int4 keep3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
+		cMask[3] &= SignMask(keep3);
+	}
+
+	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
+	{
+		if(!state.fogActive)
+		{
+			return;
+		}
+
+		if(state.pixelFogMode != FOG_NONE)
+		{
+			// Per-pixel fog: compute the factor here and clamp it to [0, 1].
+			pixelFog(fog);
+			fog = Max(Min(fog, Float4(1.0f)), Float4(0.0f));
+		}
+
+		// Per channel: c = (c - fogColor) * fog + fogColor. The w component is not modified.
+		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
+		c0.x *= fog;
+		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
+
+		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
+		c0.y *= fog;
+		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
+
+		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
+		c0.z *= fog;
+		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
+	}
+
+	void PixelRoutine::pixelFog(Float4 &visibility)
+	{
+		// Computes the per-pixel fog factor into 'visibility'. Nothing is written when
+		// per-pixel fog is disabled.
+		if(state.pixelFogMode == FOG_NONE)
+		{
+			return;
+		}
+
+		// Fog coordinate: rhw for w-based fog, otherwise the depth of sample 0 (un-complemented).
+		if(state.wBasedFog)
+		{
+			visibility = rhw;
+		}
+		else if(complementaryDepthBuffer)
+		{
+			visibility = Float4(1.0f) - z[0];
+		}
+		else
+		{
+			visibility = z[0];
+		}
+
+		// Apply the equation selected by the fog mode, using the scale, offset and
+		// density values stored in the draw data.
+		switch(state.pixelFogMode)
+		{
+		case FOG_LINEAR:
+			visibility *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
+			visibility += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
+			break;
+		case FOG_EXP:
+			visibility *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
+			visibility = exponential2(visibility, true);
+			break;
+		case FOG_EXP2:
+			visibility *= visibility;
+			visibility *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
+			visibility = exponential2(visibility, true);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
+	{
+		if(!state.depthWriteEnable)
+		{
+			return;
+		}
+		// Merges the quad's depth with the stored depth, selected by zMask, and writes the result to sample slice q.
+		Float4 Z = z;
+
+		if(shader && shader->depthOverride())
+		{
+			if(complementaryDepthBuffer)
+			{
+				Z = Float4(1.0f) - oDepth;
+			}
+			else
+			{
+				Z = oDepth;
+			}
+		}
+
+		Pointer<Byte> buffer;
+		Int pitch;   // Only assigned (and only used) for the non-quad layout
+
+		if(!state.quadLayoutDepthBuffer)
+		{
+			buffer = zBuffer + 4 * x;
+			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
+		}
+		else
+		{
+			buffer = zBuffer + 8 * x;
+		}
+
+		if(q > 0)
+		{
+			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
+		}
+
+		Float4 zValue;
+		// depthWriteEnable is known to be true here (see the early return), so this reduces to depthCompareMode != DEPTH_NEVER.
+		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
+		{
+			if(!state.quadLayoutDepthBuffer)
+			{
+				// FIXME: Properly optimizes?
+				zValue.xy = *Pointer<Float4>(buffer);
+				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);   // Loading 8 bytes early puts the two floats at buffer + pitch into .zw
+			}
+			else
+			{
+				zValue = *Pointer<Float4>(buffer, 16);
+			}
+		}
+		// Take Z where the table entry for zMask selects it and the stored zValue elsewhere (assuming invMaskD4X complements maskD4X).
+		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
+		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
+		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
+
+		if(!state.quadLayoutDepthBuffer)
+		{
+			// FIXME: Properly optimizes?
+			*Pointer<Float2>(buffer) = Float2(Z.xy);
+			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);   // Second pair of values goes 'pitch' bytes further
+		}
+		else
+		{
+			*Pointer<Float4>(buffer, 16) = Z;
+		}
+	}
+
+	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
+	{
+		if(!state.stencilActive)
+		{
+			return;
+		}
+		// Nothing to emit when every operation, for each face in use, keeps the stored value.
+		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
+		{
+			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
+			{
+				return;
+			}
+		}
+		// Likewise when the state reports stencil writes as masked for each face in use.
+		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
+		{
+			return;
+		}
+
+		Pointer<Byte> buffer = sBuffer + 2 * x;
+
+		if(q > 0)
+		{
+			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
+		}
+
+		Byte8 bufferValue = *Pointer<Byte8>(buffer);
+		// Result for stencil[0], then combined with the stored bits through its write mask.
+		Byte8 newValue;
+		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
+
+		if(!state.noStencilWriteMask)
+		{
+			Byte8 maskedValue = bufferValue;
+			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
+			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
+			newValue |= maskedValue;
+		}
+
+		if(state.twoSidedStencil)
+		{
+			Byte8 newValueCCW;
+
+			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
+
+			if(!state.noStencilWriteMaskCCW)
+			{
+				Byte8 maskedValue = bufferValue;
+				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
+				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
+				newValueCCW |= maskedValue;
+			}
+
+			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));   // Pick the result matching the primitive's winding masks
+			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
+			newValue |= newValueCCW;
+		}
+		// Lanes not selected by cMask keep the stored value (via the maskB4Q/invMaskB4Q table entries).
+		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
+		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
+		newValue |= bufferValue;
+		// Eight bytes were loaded above, but only four are written back.
+		*Pointer<Byte4>(buffer) = Byte4(newValue);
+	}
+
+	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
+	{
+		// newValue starts as the pass result; the other outcomes are only generated and merged in when they differ.
+		const bool zFailDiffers = stencilZFailOperation != stencilPassOperation;
+		const bool outcomesDiffer = stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation;
+		Byte8 fail;
+		Byte8 zFail;
+
+		stencilOperation(newValue, bufferValue, stencilPassOperation, CCW);
+
+		if(zFailDiffers)
+		{
+			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
+		}
+
+		if(!outcomesDiffer)
+		{
+			return;   // All three operations are the same: the pass result applies to every lane
+		}
+
+		stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
+		if(state.depthTestActive && zFailDiffers)   // zMask valid and values not the same
+		{
+			newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
+			zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
+			newValue |= zFail;
+		}
+
+		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
+		fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
+		newValue |= fail;
+	}
+
+	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
+	{
+		switch(operation)   // Writes to 'output' the value that 'operation' produces from bufferValue
+		{
+		case OPERATION_KEEP:
+			output = bufferValue;
+			break;
+		case OPERATION_ZERO:
+			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+			break;
+		case OPERATION_REPLACE:
+			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));   // CCW selects stencil[1] over stencil[0]
+			break;
+		case OPERATION_INCRSAT:
+			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));   // Saturating increment
+			break;
+		case OPERATION_DECRSAT:
+			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));   // Saturating decrement
+			break;
+		case OPERATION_INVERT:
+			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+			break;
+		case OPERATION_INCR:
+			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);   // Non-saturating; presumably wraps around on overflow
+			break;
+		case OPERATION_DECR:
+			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);   // Non-saturating; presumably wraps around on underflow
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
+	{
+		switch(blendFactorActive)   // Fills the x, y and z components of blendFactor; w is never written here
+		{
+		case BLEND_ZERO:
+			// Optimized: blendFactor is left unwritten
+			break;
+		case BLEND_ONE:
+			// Optimized: blendFactor is left unwritten
+			break;
+		case BLEND_SOURCE:
+			blendFactor.x = current.x;
+			blendFactor.y = current.y;
+			blendFactor.z = current.z;
+			break;
+		case BLEND_INVSOURCE:
+			blendFactor.x = Short4(0xFFFFu) - current.x;   // 0xFFFF - value: the complement of a 16-bit factor
+			blendFactor.y = Short4(0xFFFFu) - current.y;
+			blendFactor.z = Short4(0xFFFFu) - current.z;
+			break;
+		case BLEND_DEST:
+			blendFactor.x = pixel.x;
+			blendFactor.y = pixel.y;
+			blendFactor.z = pixel.z;
+			break;
+		case BLEND_INVDEST:
+			blendFactor.x = Short4(0xFFFFu) - pixel.x;
+			blendFactor.y = Short4(0xFFFFu) - pixel.y;
+			blendFactor.z = Short4(0xFFFFu) - pixel.z;
+			break;
+		case BLEND_SOURCEALPHA:
+			blendFactor.x = current.w;
+			blendFactor.y = current.w;
+			blendFactor.z = current.w;
+			break;
+		case BLEND_INVSOURCEALPHA:
+			blendFactor.x = Short4(0xFFFFu) - current.w;
+			blendFactor.y = Short4(0xFFFFu) - current.w;
+			blendFactor.z = Short4(0xFFFFu) - current.w;
+			break;
+		case BLEND_DESTALPHA:
+			blendFactor.x = pixel.w;
+			blendFactor.y = pixel.w;
+			blendFactor.z = pixel.w;
+			break;
+		case BLEND_INVDESTALPHA:
+			blendFactor.x = Short4(0xFFFFu) - pixel.w;
+			blendFactor.y = Short4(0xFFFFu) - pixel.w;
+			blendFactor.z = Short4(0xFFFFu) - pixel.w;
+			break;
+		case BLEND_SRCALPHASAT:
+			blendFactor.x = Short4(0xFFFFu) - pixel.w;
+			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));   // min(0xFFFF - pixel.w, current.w), compared as unsigned
+			blendFactor.y = blendFactor.x;
+			blendFactor.z = blendFactor.x;
+			break;
+		case BLEND_CONSTANT:
+			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
+			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
+			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
+			break;
+		case BLEND_INVCONSTANT:
+			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
+			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
+			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
+			break;
+		case BLEND_CONSTANTALPHA:
+			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));   // Entry 3 is used for all three components
+			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+			break;
+		case BLEND_INVCONSTANTALPHA:
+			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Generates the code that computes the alpha blend factor selected by blendFactorAlphaActive.
+	// Only blendFactor.w is written, and only the .w components of 'current' and 'pixel' are read,
+	// so the colour and alpha variants of a factor give the same result here.
+	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
+	{
+		switch(blendFactorAlphaActive)
+		{
+		case BLEND_ZERO:
+		case BLEND_ONE:
+			// Optimized: no factor is generated for these
+			break;
+		case BLEND_SOURCE:
+		case BLEND_SOURCEALPHA:
+			blendFactor.w = current.w;
+			break;
+		case BLEND_INVSOURCE:
+		case BLEND_INVSOURCEALPHA:
+			blendFactor.w = Short4(0xFFFFu) - current.w;
+			break;
+		case BLEND_DEST:
+		case BLEND_DESTALPHA:
+			blendFactor.w = pixel.w;
+			break;
+		case BLEND_INVDEST:
+		case BLEND_INVDESTALPHA:
+			blendFactor.w = Short4(0xFFFFu) - pixel.w;
+			break;
+		case BLEND_SRCALPHASAT:
+			// The alpha factor for this mode is the constant 0xFFFF
+			blendFactor.w = Short4(0xFFFFu);
+			break;
+		case BLEND_CONSTANT:
+		case BLEND_CONSTANTALPHA:
+			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
+			break;
+		case BLEND_INVCONSTANT:
+		case BLEND_INVCONSTANTALPHA:
+			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Reports whether the format of render target 'index' is classified as sRGB by Surface::isSRGBformat().
+	bool PixelRoutine::isSRGB(int index) const
+	{
+		const auto targetFormat = state.targetFormat[index];
+
+		return Surface::isSRGBformat(targetFormat);
+	}
+
+	// Generates the code that loads the destination pixels of render target 'index' into 'pixel'.
+	// Pixels are fetched from cBuffer at column x on two rows; the second row lies
+	// colorPitchB[index] bytes after the first. Each channel ends up in its own Short4
+	// (x, y, z, w). Channels the format does not store are filled with constants.
+	// For sRGB targets, or when postBlendSRGB and state.writeSRGB are both set, the result
+	// is passed through sRGBtoLinear16_12_16().
+	// NOTE(review): the byte-shuffling comments below assume UnpackLow/UnpackHigh interleave
+	// the elements of their two operands (like x86 punpckl/punpckh) - confirm against Reactor.
+	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
+	{
+		Short4 c01;   // Raw data loaded from the first row
+		Short4 c23;   // Raw data loaded from the second row
+		Pointer<Byte> buffer;
+		Pointer<Byte> buffer2;
+
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R5G6B5:
+			// One 32-bit load per row (two 16-bit pixels each), combined into a single Short4
+			buffer = cBuffer + 2 * x;
+			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
+
+			// Isolate each bit field and shift it up to the top of its 16-bit lane
+			pixel.x = c01 & Short4(0xF800u);
+			pixel.y = (c01 & Short4(0x07E0u)) << 5;
+			pixel.z = (c01 & Short4(0x001Fu)) << 11;
+			pixel.w = Short4(0xFFFFu);
+			break;
+		case FORMAT_A8R8G8B8:
+			// One 8-byte load per row (two 4-byte pixels each)
+			buffer = cBuffer + 4 * x;
+			c01 = *Pointer<Short4>(buffer);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			c23 = *Pointer<Short4>(buffer);
+			// Two rounds of byte interleaving gather the bytes of each channel together
+			pixel.z = c01;
+			pixel.y = c01;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+			pixel.x = pixel.z;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+			pixel.y = pixel.z;
+			pixel.w = pixel.x;
+			// Interleaving a register with itself; presumably this copies every 8-bit value
+			// into both bytes of a 16-bit lane
+			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_A8:
+			// Same loads and interleaving as FORMAT_A8R8G8B8; only the final step differs,
+			// where the intermediates feeding x and z are exchanged
+			buffer = cBuffer + 4 * x;
+			c01 = *Pointer<Short4>(buffer);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			c23 = *Pointer<Short4>(buffer);
+			pixel.z = c01;
+			pixel.y = c01;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+			pixel.x = pixel.z;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+			pixel.y = pixel.z;
+			pixel.w = pixel.x;
+			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+			break;
+		case FORMAT_A8:
+			// One 16-bit load per row (two 1-byte pixels each), placed in elements 0 and 1
+			buffer = cBuffer + 1 * x;
+			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
+			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+			// Alpha-only format: the colour channels read as zero
+			pixel.x = Short4(0x0000);
+			pixel.y = Short4(0x0000);
+			pixel.z = Short4(0x0000);
+			break;
+		case FORMAT_R8:
+			// Same loading scheme as FORMAT_A8, but the data goes to x
+			buffer = cBuffer + 1 * x;
+			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
+			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+			pixel.y = Short4(0x0000);
+			pixel.z = Short4(0x0000);
+			pixel.w = Short4(0xFFFFu);
+			break;
+		case FORMAT_X8R8G8B8:
+			// Same as FORMAT_A8R8G8B8 except that w is not unpacked but forced to 0xFFFF
+			buffer = cBuffer + 4 * x;
+			c01 = *Pointer<Short4>(buffer);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			c23 = *Pointer<Short4>(buffer);
+			pixel.z = c01;
+			pixel.y = c01;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+			pixel.x = pixel.z;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+			pixel.y = pixel.z;
+			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+			pixel.w = Short4(0xFFFFu);
+			break;
+		case FORMAT_G8R8:
+			// One 32-bit load per row (two 2-byte pixels each), placed in the two halves of c01
+			buffer = cBuffer + 2 * x;
+			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
+			// x: low byte of each 16-bit element copied into both bytes
+			pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
+			// y: high byte of each 16-bit element copied into both bytes
+			pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
+			pixel.z = Short4(0x0000u);
+			pixel.w = Short4(0xFFFFu);
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_SRGB8_X8:
+			// Same as FORMAT_A8B8G8R8 except that w is forced to 0xFFFF at the end
+			buffer = cBuffer + 4 * x;
+			c01 = *Pointer<Short4>(buffer);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			c23 = *Pointer<Short4>(buffer);
+			pixel.z = c01;
+			pixel.y = c01;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+			pixel.x = pixel.z;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+			pixel.y = pixel.z;
+			pixel.w = pixel.x;
+			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
+			pixel.w = Short4(0xFFFFu);
+			break;
+		case FORMAT_A8G8R8B8Q:
+			// Not supported: only UNIMPLEMENTED() is invoked, the old code is kept for reference
+			UNIMPLEMENTED();
+		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+			break;
+		case FORMAT_X8G8R8B8Q:
+			// Not supported: only UNIMPLEMENTED() is invoked, the old code is kept for reference
+			UNIMPLEMENTED();
+		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+		//	pixel.w = Short4(0xFFFFu);
+			break;
+		case FORMAT_A16B16G16R16:
+			// Two 8-byte loads per row (one 8-byte pixel each); transpose4x4() then rearranges
+			// the four per-pixel registers into per-channel registers
+			buffer = cBuffer;
+			pixel.x = *Pointer<Short4>(buffer + 8 * x);
+			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			pixel.z = *Pointer<Short4>(buffer + 8 * x);
+			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
+			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+			break;
+		case FORMAT_G16R16:
+			// One 8-byte load per row (two 4-byte pixels each), separated into x and y by
+			// 16-bit interleaving
+			buffer = cBuffer;
+			pixel.x = *Pointer<Short4>(buffer + 4 * x);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+			pixel.y = *Pointer<Short4>(buffer + 4 * x);
+			pixel.z = pixel.x;
+			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
+			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
+			pixel.y = pixel.z;
+			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
+			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
+			// NOTE(review): z is set to 0xFFFF here whereas FORMAT_G8R8 uses 0x0000 - confirm this is intended
+			pixel.z = Short4(0xFFFFu);
+			pixel.w = Short4(0xFFFFu);
+			break;
+		default:
+			// Any format not listed above is unexpected here
+			ASSERT(false);
+		}
+
+		// Same condition as the linearToSRGB16_12_16() call at the start of writeColor()
+		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
+		{
+			sRGBtoLinear16_12_16(pixel);
+		}
+	}
+
+	// Generates the code that blends 'current' with the pixels already stored in render
+	// target 'index'. Nothing is generated when state.alphaBlendActive is false.
+	// The destination is fetched with readPixel(). RGB (x, y, z) and alpha (w) are blended
+	// in two separate stages, each with its own factors and operation, and the result is
+	// left in 'current'.
+	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
+	{
+		if(!state.alphaBlendActive)
+		{
+			return;
+		}
+
+		Vector4s pixel;   // Destination color
+		readPixel(index, cBuffer, x, pixel);
+
+		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
+		Vector4s sourceFactor;
+		Vector4s destFactor;
+
+		// Both RGB factors are derived before either color is scaled below,
+		// since a factor may itself depend on 'current' or 'pixel'
+		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
+		blendFactor(destFactor, current, pixel, state.destBlendFactor);
+
+		// blendFactor() writes no factor for BLEND_ONE / BLEND_ZERO, so no multiply is generated for them.
+		// NOTE(review): a BLEND_ZERO term is not zeroed here; presumably state.blendOperation has
+		// already been reduced (e.g. to BLENDOP_SOURCE / BLENDOP_DEST / BLENDOP_NULL) where the
+		// state is built - confirm.
+		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
+		{
+			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
+			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
+			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
+		}
+
+		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
+		{
+			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
+			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
+			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
+		}
+
+		// Combine the two RGB terms; all arithmetic treats the channels as unsigned 16-bit values
+		switch(state.blendOperation)
+		{
+		case BLENDOP_ADD:
+			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_SUB:
+			// source - destination
+			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_INVSUB:
+			// destination - source
+			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
+			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
+			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
+			break;
+		case BLENDOP_MIN:
+			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_MAX:
+			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
+			break;
+		case BLENDOP_SOURCE:
+			// No operation
+			break;
+		case BLENDOP_DEST:
+			// Result is the (possibly scaled) destination term only
+			current.x = pixel.x;
+			current.y = pixel.y;
+			current.z = pixel.z;
+			break;
+		case BLENDOP_NULL:
+			// Result is zero
+			current.x = Short4(0x0000);
+			current.y = Short4(0x0000);
+			current.z = Short4(0x0000);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// Alpha stage. blendFactorAlpha() only reads the .w components, which the RGB stage
+		// above has not modified, and only writes the .w component of the factor vectors.
+		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
+		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
+
+		// As for RGB, no multiply is generated for BLEND_ONE / BLEND_ZERO
+		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
+		{
+			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
+		}
+
+		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
+		{
+			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
+		}
+
+		// Combine the two alpha terms, mirroring the RGB switch above
+		switch(state.blendOperationAlpha)
+		{
+		case BLENDOP_ADD:
+			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_SUB:
+			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_INVSUB:
+			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
+			break;
+		case BLENDOP_MIN:
+			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_MAX:
+			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
+			break;
+		case BLENDOP_SOURCE:
+			// No operation
+			break;
+		case BLENDOP_DEST:
+			current.w = pixel.w;
+			break;
+		case BLENDOP_NULL:
+			current.w = Short4(0x0000);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Generates the code that applies state.logicalOperation between 'current' (s) and the
+	// pixels read from render target 'index' (d), leaving the result in current.x/y/z.
+	// current.w is not touched. Nothing is generated for LOGICALOP_COPY.
+	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
+	{
+		if(state.logicalOperation == LOGICALOP_COPY)
+		{
+			return;   // Result is s itself
+		}
+
+		Vector4s destination;
+		readPixel(index, cBuffer, x, destination);
+
+		switch(state.logicalOperation)
+		{
+		case LOGICALOP_CLEAR:           // 0
+			current.x = UShort4(0);
+			current.y = UShort4(0);
+			current.z = UShort4(0);
+			break;
+		case LOGICALOP_SET:             // all bits set
+			current.x = UShort4(0xFFFFu);
+			current.y = UShort4(0xFFFFu);
+			current.z = UShort4(0xFFFFu);
+			break;
+		case LOGICALOP_COPY_INVERTED:   // ~s
+			current.x = ~current.x;
+			current.y = ~current.y;
+			current.z = ~current.z;
+			break;
+		case LOGICALOP_NOOP:            // d
+			current.x = destination.x;
+			current.y = destination.y;
+			current.z = destination.z;
+			break;
+		case LOGICALOP_INVERT:          // ~d
+			current.x = ~destination.x;
+			current.y = ~destination.y;
+			current.z = ~destination.z;
+			break;
+		case LOGICALOP_AND:             // d & s
+			current.x = destination.x & current.x;
+			current.y = destination.y & current.y;
+			current.z = destination.z & current.z;
+			break;
+		case LOGICALOP_NAND:            // ~(d & s)
+			current.x = ~(destination.x & current.x);
+			current.y = ~(destination.y & current.y);
+			current.z = ~(destination.z & current.z);
+			break;
+		case LOGICALOP_OR:              // d | s
+			current.x = destination.x | current.x;
+			current.y = destination.y | current.y;
+			current.z = destination.z | current.z;
+			break;
+		case LOGICALOP_NOR:             // ~(d | s)
+			current.x = ~(destination.x | current.x);
+			current.y = ~(destination.y | current.y);
+			current.z = ~(destination.z | current.z);
+			break;
+		case LOGICALOP_XOR:             // d ^ s
+			current.x = destination.x ^ current.x;
+			current.y = destination.y ^ current.y;
+			current.z = destination.z ^ current.z;
+			break;
+		case LOGICALOP_EQUIV:           // ~(d ^ s)
+			current.x = ~(destination.x ^ current.x);
+			current.y = ~(destination.y ^ current.y);
+			current.z = ~(destination.z ^ current.z);
+			break;
+		case LOGICALOP_AND_REVERSE:     // ~d & s
+			current.x = ~destination.x & current.x;
+			current.y = ~destination.y & current.y;
+			current.z = ~destination.z & current.z;
+			break;
+		case LOGICALOP_AND_INVERTED:    // d & ~s
+			current.x = destination.x & ~current.x;
+			current.y = destination.y & ~current.y;
+			current.z = destination.z & ~current.z;
+			break;
+		case LOGICALOP_OR_REVERSE:      // ~d | s
+			current.x = ~destination.x | current.x;
+			current.y = ~destination.y | current.y;
+			current.z = ~destination.z | current.z;
+			break;
+		case LOGICALOP_OR_INVERTED:     // d | ~s
+			current.x = destination.x | ~current.x;
+			current.y = destination.y | ~current.y;
+			current.z = destination.z | ~current.z;
+			break;
+		case LOGICALOP_COPY:            // Optimized out by the early return above
+		default:
+			ASSERT(false);
+			break;
+		}
+	}
+
+	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
+	{
+		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
+		{
+			linearToSRGB16_12_16(current);
+		}
+
+		if(exactColorRounding)
+		{
+			switch(state.targetFormat[index])
+			{
+			case FORMAT_R5G6B5:
+				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
+				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
+				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
+				break;
+			case FORMAT_X8G8R8B8Q:
+			case FORMAT_A8G8R8B8Q:
+			case FORMAT_X8R8G8B8:
+			case FORMAT_X8B8G8R8:
+			case FORMAT_A8R8G8B8:
+			case FORMAT_A8B8G8R8:
+			case FORMAT_SRGB8_X8:
+			case FORMAT_SRGB8_A8:
+			case FORMAT_G8R8:
+			case FORMAT_R8:
+				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
+				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
+				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
+				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
+				break;
+			default:
+				break;
+			}
+		}
+
+		int rgbaWriteMask = state.colorWriteActive(index);
+		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
+
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R5G6B5:
+			{
+				current.x = current.x & Short4(0xF800u);
+				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+				current.z = As<UShort4>(current.z) >> 11;
+
+				current.x = current.x | current.y | current.z;
+			}
+			break;
+		case FORMAT_X8G8R8B8Q:
+			UNIMPLEMENTED();
+		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+			break;
+		case FORMAT_A8G8R8B8Q:
+			UNIMPLEMENTED();
+		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
+			break;
+		case FORMAT_X8R8G8B8:
+		case FORMAT_A8R8G8B8:
+			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
+			{
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.z, current.x));
+				current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			else
+			{
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.z, current.x));
+				current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
+			{
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.x, current.z));
+				current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			else
+			{
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+				current.z = As<Short4>(PackUnsigned(current.x, current.z));
+				current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
+			}
+			break;
+		case FORMAT_G8R8:
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.x = As<Short4>(PackUnsigned(current.x, current.x));
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
+			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
+			break;
+		case FORMAT_R8:
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.x = As<Short4>(PackUnsigned(current.x, current.x));
+			break;
+		case FORMAT_A8:
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+			current.w = As<Short4>(PackUnsigned(current.w, current.w));
+			break;
+		case FORMAT_G16R16:
+			current.z = current.x;
+			current.x = As<Short4>(UnpackLow(current.x, current.y));
+			current.z = As<Short4>(UnpackHigh(current.z, current.y));
+			current.y = current.z;
+			break;
+		case FORMAT_A16B16G16R16:
+			transpose4x4(current.x, current.y, current.z, current.w);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		Short4 c01 = current.z;
+		Short4 c23 = current.y;
+
+		Int xMask;   // Combination of all masks
+
+		if(state.depthTestActive)
+		{
+			xMask = zMask;
+		}
+		else
+		{
+			xMask = cMask;
+		}
+
+		if(state.stencilActive)
+		{
+			xMask &= sMask;
+		}
+
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R5G6B5:
+			{
+				Pointer<Byte> buffer = cBuffer + 2 * x;
+				Int value = *Pointer<Int>(buffer);
+
+				Int c01 = Extract(As<Int2>(current.x), 0);
+
+				if((bgraWriteMask & 0x00000007) != 0x00000007)
+				{
+					Int masked = value;
+					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
+					c01 |= masked;
+				}
+
+				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
+				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
+				c01 |= value;
+				*Pointer<Int>(buffer) = c01;
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = *Pointer<Int>(buffer);
+
+				Int c23 = Extract(As<Int2>(current.x), 1);
+
+				if((bgraWriteMask & 0x00000007) != 0x00000007)
+				{
+					Int masked = value;
+					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
+					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
+					c23 |= masked;
+				}
+
+				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
+				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
+				c23 |= value;
+				*Pointer<Int>(buffer) = c23;
+			}
+			break;
+		case FORMAT_A8G8R8B8Q:
+		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
+			UNIMPLEMENTED();
+		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
+
+		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
+		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
+		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+		//	{
+		//		Short4 masked = value;
+		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+		//		c01 |= masked;
+		//	}
+
+		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+		//	c01 |= value;
+		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
+
+		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
+
+		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
+		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
+		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+		//	{
+		//		Short4 masked = value;
+		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+		//		c23 |= masked;
+		//	}
+
+		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+		//	c23 |= value;
+		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
+			break;
+		case FORMAT_A8R8G8B8:
+		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
+			{
+				Pointer<Byte> buffer = cBuffer + x * 4;
+				Short4 value = *Pointer<Short4>(buffer);
+
+				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
+				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
+					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+				{
+					Short4 masked = value;
+					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+					c01 |= masked;
+				}
+
+				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+				c01 |= value;
+				*Pointer<Short4>(buffer) = c01;
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = *Pointer<Short4>(buffer);
+
+				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
+				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
+					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
+				{
+					Short4 masked = value;
+					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
+					c23 |= masked;
+				}
+
+				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+				c23 |= value;
+				*Pointer<Short4>(buffer) = c23;
+			}
+			break;
+		case FORMAT_A8B8G8R8:
+		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+			{
+				Pointer<Byte> buffer = cBuffer + x * 4;
+				Short4 value = *Pointer<Short4>(buffer);
+
+				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
+				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
+				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
+
+				if(masked)
+				{
+					Short4 masked = value;
+					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+					c01 |= masked;
+				}
+
+				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+				c01 |= value;
+				*Pointer<Short4>(buffer) = c01;
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = *Pointer<Short4>(buffer);
+
+				if(masked)
+				{
+					Short4 masked = value;
+					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
+					c23 |= masked;
+				}
+
+				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+				c23 |= value;
+				*Pointer<Short4>(buffer) = c23;
+			}
+			break;
+		case FORMAT_G8R8:
+			if((rgbaWriteMask & 0x00000003) != 0x0)
+			{
+				Pointer<Byte> buffer = cBuffer + 2 * x;
+				Int2 value;
+				value = Insert(value, *Pointer<Int>(buffer), 0);
+				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
+
+				Int2 packedCol = As<Int2>(current.x);
+
+				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+				if((rgbaWriteMask & 0x3) != 0x3)
+				{
+					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
+					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+					mergedMask &= rgbaMask;
+				}
+
+				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
+
+				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
+			}
+			break;
+		case FORMAT_R8:
+			if(rgbaWriteMask & 0x00000001)
+			{
+				Pointer<Byte> buffer = cBuffer + 1 * x;
+				Short4 value;
+				value = Insert(value, *Pointer<Short>(buffer), 0);
+				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
+
+				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
+				current.x |= value;
+
+				*Pointer<Short>(buffer) = Extract(current.x, 0);
+				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
+			}
+			break;
+		case FORMAT_A8:
+			if(rgbaWriteMask & 0x00000008)
+			{
+				Pointer<Byte> buffer = cBuffer + 1 * x;
+				Short4 value;
+				value = Insert(value, *Pointer<Short>(buffer), 0);
+				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
+
+				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
+				current.w |= value;
+
+				*Pointer<Short>(buffer) = Extract(current.w, 0);
+				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
+			}
+			break;
+		case FORMAT_G16R16:
+			{
+				Pointer<Byte> buffer = cBuffer + 4 * x;
+
+				Short4 value = *Pointer<Short4>(buffer);
+
+				if((rgbaWriteMask & 0x00000003) != 0x00000003)
+				{
+					Short4 masked = value;
+					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+					current.x |= masked;
+				}
+
+				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
+				current.x |= value;
+				*Pointer<Short4>(buffer) = current.x;
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+				value = *Pointer<Short4>(buffer);
+
+				if((rgbaWriteMask & 0x00000003) != 0x00000003)
+				{
+					Short4 masked = value;
+					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
+					current.y |= masked;
+				}
+
+				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
+				current.y |= value;
+				*Pointer<Short4>(buffer) = current.y;
+			}
+			break;
+		case FORMAT_A16B16G16R16:
+			{
+				Pointer<Byte> buffer = cBuffer + 8 * x;
+
+				{
+					Short4 value = *Pointer<Short4>(buffer);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.x |= masked;
+					}
+
+					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
+					current.x |= value;
+					*Pointer<Short4>(buffer) = current.x;
+				}
+
+				{
+					Short4 value = *Pointer<Short4>(buffer + 8);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.y |= masked;
+					}
+
+					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
+					current.y |= value;
+					*Pointer<Short4>(buffer + 8) = current.y;
+				}
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+				{
+					Short4 value = *Pointer<Short4>(buffer);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.z |= masked;
+					}
+
+					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
+					current.z |= value;
+					*Pointer<Short4>(buffer) = current.z;
+				}
+
+				{
+					Short4 value = *Pointer<Short4>(buffer + 8);
+
+					if(rgbaWriteMask != 0x0000000F)
+					{
+						Short4 masked = value;
+						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
+						current.w |= masked;
+					}
+
+					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
+					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
+					current.w |= value;
+					*Pointer<Short4>(buffer + 8) = current.w;
+				}
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Selects the RGB blend factor for one side (source or destination) of the
+	// blend equation and stores it in blendFactor.x/y/z. blendFactor.w is never
+	// written here; see blendFactorAlpha() for the alpha factor.
+	//
+	//   blendFactor       - receives the factor (left untouched for ZERO/ONE)
+	//   oC                - source (shader output) color
+	//   pixel             - destination color read from the render target
+	//   blendFactorActive - which factor to produce
+	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
+	{
+		switch(blendFactorActive)
+		{
+		case BLEND_ZERO:
+		case BLEND_ONE:
+			// Optimized: no factor is produced for these two; alphaBlend()
+			// does not multiply by the factor when it is ZERO or ONE.
+			break;
+		case BLEND_SOURCE:
+			blendFactor.x = oC.x;
+			blendFactor.y = oC.y;
+			blendFactor.z = oC.z;
+			break;
+		case BLEND_INVSOURCE:
+			{
+				const Float4 one(1.0f);
+				blendFactor.x = one - oC.x;
+				blendFactor.y = one - oC.y;
+				blendFactor.z = one - oC.z;
+			}
+			break;
+		case BLEND_DEST:
+			blendFactor.x = pixel.x;
+			blendFactor.y = pixel.y;
+			blendFactor.z = pixel.z;
+			break;
+		case BLEND_INVDEST:
+			{
+				const Float4 one(1.0f);
+				blendFactor.x = one - pixel.x;
+				blendFactor.y = one - pixel.y;
+				blendFactor.z = one - pixel.z;
+			}
+			break;
+		case BLEND_SOURCEALPHA:
+			// Same scalar factor replicated to all three color channels.
+			blendFactor.x = blendFactor.y = blendFactor.z = oC.w;
+			break;
+		case BLEND_INVSOURCEALPHA:
+			blendFactor.x = blendFactor.y = blendFactor.z = Float4(1.0f) - oC.w;
+			break;
+		case BLEND_DESTALPHA:
+			blendFactor.x = blendFactor.y = blendFactor.z = pixel.w;
+			break;
+		case BLEND_INVDESTALPHA:
+			blendFactor.x = blendFactor.y = blendFactor.z = Float4(1.0f) - pixel.w;
+			break;
+		case BLEND_SRCALPHASAT:
+			// min(source alpha, 1 - destination alpha), replicated to RGB.
+			blendFactor.x = Min(Float4(1.0f) - pixel.w, oC.w);
+			blendFactor.y = blendFactor.x;
+			blendFactor.z = blendFactor.x;
+			break;
+		case BLEND_CONSTANT:
+			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
+			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
+			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
+			break;
+		case BLEND_INVCONSTANT:
+			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
+			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
+			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Selects the alpha blend factor for one side (source or destination) of
+	// the blend equation and stores it in blendFactor.w. Only the w components
+	// of oC (source) and pixel (destination) are read, so the color and alpha
+	// variants of each factor coincide here. blendFactor.w is left untouched
+	// for BLEND_ZERO and BLEND_ONE.
+	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
+	{
+		switch(blendFactorAlphaActive)
+		{
+		case BLEND_ZERO:
+		case BLEND_ONE:
+			// Optimized: alphaBlend() does not multiply by these factors.
+			break;
+		case BLEND_SOURCE:
+		case BLEND_SOURCEALPHA:
+			blendFactor.w = oC.w;
+			break;
+		case BLEND_INVSOURCE:
+		case BLEND_INVSOURCEALPHA:
+			blendFactor.w = Float4(1.0f) - oC.w;
+			break;
+		case BLEND_DEST:
+		case BLEND_DESTALPHA:
+			blendFactor.w = pixel.w;
+			break;
+		case BLEND_INVDEST:
+		case BLEND_INVDESTALPHA:
+			blendFactor.w = Float4(1.0f) - pixel.w;
+			break;
+		case BLEND_SRCALPHASAT:
+			blendFactor.w = Float4(1.0f);
+			break;
+		case BLEND_CONSTANT:
+			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
+			break;
+		case BLEND_INVCONSTANT:
+			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Blends the shader output oC with the current contents of color buffer
+	// 'index' for 32-bit-per-channel render targets.
+	//
+	//   index   - render target index (selects state.targetFormat[index] and
+	//             DrawData::colorPitchB[index])
+	//   cBuffer - pointer to the first of the two rows that are read
+	//   oC      - source color on entry, one Float4 per channel; replaced by
+	//             the blended color on return
+	//   x       - pixel offset within the row
+	//
+	// Four destination pixels are read: two adjacent pixels at x in the row at
+	// cBuffer and two in the row one colorPitchB further. Returns without
+	// touching oC when state.alphaBlendActive is false.
+	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
+	{
+		if(!state.alphaBlendActive)
+		{
+			return;
+		}
+
+		Pointer<Byte> buffer;
+		Vector4f pixel;
+
+		// Value substituted for channels the target format does not store.
+		Float4 one;
+		if(Surface::isFloatFormat(state.targetFormat[index]))
+		{
+			one = Float4(1.0f);
+		}
+		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
+		{
+			// Integer targets: all-ones (unsigned) or the largest positive
+			// value (signed), kept as raw bits inside a Float4.
+			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
+		}
+
+		// Read the destination pixels into 'pixel', one Float4 per channel.
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_R32F:
+			buffer = cBuffer;
+			// FIXME: movlps
+			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
+			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			// FIXME: movhps
+			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
+			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
+			pixel.y = pixel.z = pixel.w = one;
+			break;
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_G32R32F:
+			buffer = cBuffer;
+			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
+			// De-interleave the two rows of (R, G) pairs into one R vector
+			// (pixel.x) and one G vector (pixel.y).
+			pixel.z = pixel.x;
+			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
+			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
+			pixel.y = pixel.z;
+			pixel.z = pixel.w = one;
+			break;
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+			buffer = cBuffer;
+			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
+			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
+			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+			// One pixel per register -> one channel per register.
+			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+			if(state.targetFormat[index] == FORMAT_X32B32G32R32F ||
+			   state.targetFormat[index] == FORMAT_X32B32G32R32F_UNSIGNED)
+			{
+				// The X channel carries no alpha; treat it as opaque.
+				pixel.w = Float4(1.0f);
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
+		{
+			// sRGBtoLinear() takes its argument by const reference and
+			// returns the converted value, so the result has to be assigned
+			// back; calling it as a bare statement leaves 'pixel' unchanged.
+			pixel.x = sRGBtoLinear(pixel.x);
+			pixel.y = sRGBtoLinear(pixel.y);
+			pixel.z = sRGBtoLinear(pixel.z);
+		}
+
+		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
+		Vector4f sourceFactor;
+		Vector4f destFactor;
+
+		// Both factors are computed before either operand is modified.
+		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
+		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
+
+		// blendFactor() produces nothing for ONE and ZERO, so skip the multiply.
+		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
+		{
+			oC.x *= sourceFactor.x;
+			oC.y *= sourceFactor.y;
+			oC.z *= sourceFactor.z;
+		}
+
+		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
+		{
+			pixel.x *= destFactor.x;
+			pixel.y *= destFactor.y;
+			pixel.z *= destFactor.z;
+		}
+
+		// Combine the weighted RGB terms.
+		switch(state.blendOperation)
+		{
+		case BLENDOP_ADD:
+			oC.x += pixel.x;
+			oC.y += pixel.y;
+			oC.z += pixel.z;
+			break;
+		case BLENDOP_SUB:
+			oC.x -= pixel.x;
+			oC.y -= pixel.y;
+			oC.z -= pixel.z;
+			break;
+		case BLENDOP_INVSUB:
+			oC.x = pixel.x - oC.x;
+			oC.y = pixel.y - oC.y;
+			oC.z = pixel.z - oC.z;
+			break;
+		case BLENDOP_MIN:
+			oC.x = Min(oC.x, pixel.x);
+			oC.y = Min(oC.y, pixel.y);
+			oC.z = Min(oC.z, pixel.z);
+			break;
+		case BLENDOP_MAX:
+			oC.x = Max(oC.x, pixel.x);
+			oC.y = Max(oC.y, pixel.y);
+			oC.z = Max(oC.z, pixel.z);
+			break;
+		case BLENDOP_SOURCE:
+			// No operation
+			break;
+		case BLENDOP_DEST:
+			oC.x = pixel.x;
+			oC.y = pixel.y;
+			oC.z = pixel.z;
+			break;
+		case BLENDOP_NULL:
+			oC.x = Float4(0.0f);
+			oC.y = Float4(0.0f);
+			oC.z = Float4(0.0f);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// Alpha channel: blendFactorAlpha() only reads the w components,
+		// which the RGB stage above has not modified.
+		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
+		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
+
+		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
+		{
+			oC.w *= sourceFactor.w;
+		}
+
+		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
+		{
+			pixel.w *= destFactor.w;
+		}
+
+		switch(state.blendOperationAlpha)
+		{
+		case BLENDOP_ADD:
+			oC.w += pixel.w;
+			break;
+		case BLENDOP_SUB:
+			oC.w -= pixel.w;
+			break;
+		case BLENDOP_INVSUB:
+			pixel.w -= oC.w;
+			oC.w = pixel.w;
+			break;
+		case BLENDOP_MIN:
+			oC.w = Min(oC.w, pixel.w);
+			break;
+		case BLENDOP_MAX:
+			oC.w = Max(oC.w, pixel.w);
+			break;
+		case BLENDOP_SOURCE:
+			// No operation
+			break;
+		case BLENDOP_DEST:
+			oC.w = pixel.w;
+			break;
+		case BLENDOP_NULL:
+			oC.w = Float4(0.0f);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Writes the shader output oC to color buffer 'index' for integer and
+	// 32-bit float render target formats.
+	//
+	//   index   - render target index (selects state.targetFormat[index],
+	//             state.colorWriteActive(index) and DrawData::colorPitchB[index])
+	//   cBuffer - pointer to the first of the two rows that are written
+	//   x       - pixel offset within the row
+	//   oC      - color to write, one Float4 per channel on entry; it is
+	//             rearranged and masked in place, so its contents are not
+	//             meaningful to the caller afterwards
+	//   sMask, zMask, cMask - stencil, depth and coverage masks; combined into
+	//             a single index into the per-pixel mask tables in 'constants'
+	//
+	// Every case is a read-modify-write: the existing destination is loaded,
+	// merged with the new color under the pixel mask (and the channel write
+	// mask where it is not "all channels"), and stored back. Two rows are
+	// touched: the one at cBuffer and the one colorPitchB[index] bytes further.
+	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
+	{
+		// Step 1: rearrange oC from one register per channel into the layout
+		// in which it is stored by the second switch below.
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R32F:
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+			// Single channel: oC.x is used as is.
+			break;
+		case FORMAT_G32R32F:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+			// Two channels: combine oC.x and oC.y with UnpackLow/UnpackHigh.
+			// Afterwards oC.x is what gets stored to the first row and oC.y
+			// to the second row (see the G32R32 case below).
+			oC.z = oC.x;
+			oC.x = UnpackLow(oC.x, oC.y);
+			oC.z = UnpackHigh(oC.z, oC.y);
+			oC.y = oC.z;
+			break;
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+			// Four channels: after the transpose oC.x/y are stored to the
+			// first row and oC.z/w to the second row.
+			transpose4x4(oC.x, oC.y, oC.z, oC.w);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		// Channel write mask; the code below tests bit 0 for R, bits 0-1 for
+		// RG and bits 0-3 for RGBA formats.
+		int rgbaWriteMask = state.colorWriteActive(index);
+
+		Int xMask;   // Combination of all masks
+
+		if(state.depthTestActive)
+		{
+			xMask = zMask;
+		}
+		else
+		{
+			xMask = cMask;
+		}
+
+		if(state.stencilActive)
+		{
+			xMask &= sMask;
+		}
+
+		Pointer<Byte> buffer;
+		Float4 value;   // Existing destination contents
+
+		// Step 2: masked read-modify-write, per format.
+		switch(state.targetFormat[index])
+		{
+		case FORMAT_R32F:
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+			if(rgbaWriteMask & 0x00000001)
+			{
+				buffer = cBuffer + 4 * x;
+
+				// FIXME: movlps
+				value.x = *Pointer<Float>(buffer + 0);
+				value.y = *Pointer<Float>(buffer + 4);
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+				// FIXME: movhps
+				value.z = *Pointer<Float>(buffer + 0);
+				value.w = *Pointer<Float>(buffer + 4);
+
+				// Keep the new color where the mask is set, the old value elsewhere.
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
+				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+				// buffer still points at the second row here, so it is written first.
+				// FIXME: movhps
+				*Pointer<Float>(buffer + 0) = oC.x.z;
+				*Pointer<Float>(buffer + 4) = oC.x.w;
+
+				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+				// FIXME: movlps
+				*Pointer<Float>(buffer + 0) = oC.x.x;
+				*Pointer<Float>(buffer + 4) = oC.x.y;
+			}
+			break;
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+			if(rgbaWriteMask & 0x00000001)
+			{
+				buffer = cBuffer + 2 * x;
+
+				// Load two 16-bit pixels (one Int) from each row and widen
+				// them to 32 bits per pixel so the D4X masks can be used.
+				UShort4 xyzw;
+				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
+				value = As<Float4>(Int4(xyzw));
+
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
+				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+
+				// Narrow each merged component back to 16 bits; second row
+				// first because buffer currently points there.
+				if(state.targetFormat[index] == FORMAT_R16I)
+				{
+					Float component = oC.x.z;
+					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+					component = oC.x.w;
+					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
+
+					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+					component = oC.x.x;
+					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
+					component = oC.x.y;
+					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
+				}
+				else // FORMAT_R16UI
+				{
+					Float component = oC.x.z;
+					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+					component = oC.x.w;
+					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
+
+					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+					component = oC.x.x;
+					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
+					component = oC.x.y;
+					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
+				}
+			}
+			break;
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+			if(rgbaWriteMask & 0x00000001)
+			{
+				buffer = cBuffer + x;
+
+				UInt xyzw, packedCol;
+
+				// Existing pixels: first row in the low 16 bits, second row
+				// in the high 16 bits.
+				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
+
+				// Narrow the new color to bytes using the signed or unsigned pack.
+				Short4 tmpCol = Short4(As<Int4>(oC.x));
+				if(state.targetFormat[index] == FORMAT_R8I)
+				{
+					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
+				}
+				else
+				{
+					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
+				}
+				packedCol = Extract(As<Int2>(tmpCol), 0);
+
+				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
+				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
+
+				// High half goes to the second row, low half to the first.
+				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
+				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				*Pointer<UShort>(buffer) = UShort(packedCol);
+			}
+			break;
+		case FORMAT_G32R32F:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+			buffer = cBuffer + 8 * x;
+
+			// First row: oC.x.
+			value = *Pointer<Float4>(buffer);
+
+			if((rgbaWriteMask & 0x00000003) != 0x00000003)
+			{
+				// Channel mask: take write-enabled channels from oC and the
+				// others from the destination.
+				Float4 masked = value;
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
+			}
+
+			// Pixel mask.
+			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
+			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+			*Pointer<Float4>(buffer) = oC.x;
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+			// Second row: oC.y, same procedure with the Q23 pixel masks.
+			value = *Pointer<Float4>(buffer);
+
+			if((rgbaWriteMask & 0x00000003) != 0x00000003)
+			{
+				Float4 masked;
+
+				masked = value;
+				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
+				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
+			}
+
+			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
+			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
+			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
+			*Pointer<Float4>(buffer) = oC.y;
+			break;
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+			if((rgbaWriteMask & 0x00000003) != 0x0)
+			{
+				buffer = cBuffer + 4 * x;
+
+				// rgbaMask is only assigned (and only read) when not all of
+				// R and G are write-enabled; it is reused for the second row.
+				UInt2 rgbaMask;
+				UShort4 packedCol = UShort4(As<Int4>(oC.x));
+				UShort4 value = *Pointer<UShort4>(buffer);
+				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+				if((rgbaWriteMask & 0x3) != 0x3)
+				{
+					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
+					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				packedCol = UShort4(As<Int4>(oC.y));
+				value = *Pointer<UShort4>(buffer);
+				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+				if((rgbaWriteMask & 0x3) != 0x3)
+				{
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
+			}
+			break;
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+			if((rgbaWriteMask & 0x00000003) != 0x0)
+			{
+				buffer = cBuffer + 2 * x;
+
+				Int2 xyzw, packedCol;
+
+				// Existing pixels: first row in element 0, second row in element 1.
+				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
+
+				if(state.targetFormat[index] == FORMAT_G8R8I)
+				{
+					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+				}
+				else
+				{
+					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+				}
+
+				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+				if((rgbaWriteMask & 0x3) != 0x3)
+				{
+					// NOTE(review): the byte-mask table is indexed with
+					// 5 * (rgbaWriteMask & 0x3); presumably this maps the
+					// 2-bit R/G mask onto a repeating per-byte pattern -
+					// confirm against the Constants table definition.
+					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
+					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+					mergedMask &= rgbaMask;
+				}
+
+				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
+
+				// Second row first; buffer currently points there.
+				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
+				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+			}
+			break;
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+			buffer = cBuffer + 16 * x;
+
+			// One 16-byte pixel per block: oC.x and oC.y on the first row,
+			// oC.z and oC.w on the second. Each block applies the channel
+			// mask (if partial) and then the pixel mask for that pixel.
+			{
+				value = *Pointer<Float4>(buffer, 16);
+
+				if(rgbaWriteMask != 0x0000000F)
+				{
+					Float4 masked = value;
+					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
+				}
+
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
+				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+				*Pointer<Float4>(buffer, 16) = oC.x;
+			}
+
+			{
+				value = *Pointer<Float4>(buffer + 16, 16);
+
+				if(rgbaWriteMask != 0x0000000F)
+				{
+					Float4 masked = value;
+					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
+				}
+
+				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
+				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
+				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
+				*Pointer<Float4>(buffer + 16, 16) = oC.y;
+			}
+
+			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+
+			{
+				value = *Pointer<Float4>(buffer, 16);
+
+				if(rgbaWriteMask != 0x0000000F)
+				{
+					Float4 masked = value;
+					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
+				}
+
+				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
+				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
+				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
+				*Pointer<Float4>(buffer, 16) = oC.z;
+			}
+
+			{
+				value = *Pointer<Float4>(buffer + 16, 16);
+
+				if(rgbaWriteMask != 0x0000000F)
+				{
+					Float4 masked = value;
+					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
+					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
+				}
+
+				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
+				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
+				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
+				*Pointer<Float4>(buffer + 16, 16) = oC.w;
+			}
+			break;
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+			if((rgbaWriteMask & 0x0000000F) != 0x0)
+			{
+				buffer = cBuffer + 8 * x;
+
+				// Two 8-byte pixels per row are handled as one 16-byte
+				// vector. rgbaMask is only assigned (and only read) when the
+				// channel mask is partial; it is reused for the second row.
+				UInt4 rgbaMask;
+				UShort8 value = *Pointer<UShort8>(buffer);
+				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
+				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
+				if((rgbaWriteMask & 0xF) != 0xF)
+				{
+					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
+					rgbaMask = UInt4(tmpMask, tmpMask);
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				value = *Pointer<UShort8>(buffer);
+				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
+				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
+				if((rgbaWriteMask & 0xF) != 0xF)
+				{
+					mergedMask &= rgbaMask;
+				}
+				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
+			}
+			break;
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+			if((rgbaWriteMask & 0x0000000F) != 0x0)
+			{
+				UInt2 value, packedCol, mergedMask;
+
+				buffer = cBuffer + 4 * x;
+
+				// First row: pack oC.x and oC.y down to bytes (two pixels).
+				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
+				{
+					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+				}
+				else
+				{
+					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+				}
+				// NOTE(review): this 8-byte load passes an alignment of 16
+				// while the matching store below passes none - confirm the
+				// alignment argument is intended.
+				value = *Pointer<UInt2>(buffer, 16);
+				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+				if(rgbaWriteMask != 0xF)
+				{
+					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
+				}
+				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+
+				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+
+				// Second row: oC.z and oC.w with the D23 pixel mask.
+				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
+				{
+					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+				}
+				else
+				{
+					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+				}
+				value = *Pointer<UInt2>(buffer, 16);
+				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+				if(rgbaWriteMask != 0xF)
+				{
+					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
+				}
+				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Scales a floating-point color vector by 0xFFFF and converts it to four
+	// unsigned 16-bit values. 'saturate' is forwarded unchanged to the
+	// UShort4 conversion.
+	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
+	{
+		Float4 scaled = cf * Float4(0xFFFF);
+
+		return UShort4(scaled, saturate);
+	}
+
+	// Converts the x, y and z channels of c through the sRGBtoLinear12_16
+	// lookup table: each unsigned 16-bit lane is reduced to its top 12 bits,
+	// which index a table of 16-bit entries. c.w is not touched.
+	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
+	{
+		Pointer<Byte> table = constants + OFFSET(Constants,sRGBtoLinear12_16);
+		Short4 *const channels[3] = { &c.x, &c.y, &c.z };
+
+		// 16-bit -> 12-bit table index (unsigned shift).
+		for(int i = 0; i < 3; i++)
+		{
+			Short4 &channel = *channels[i];
+			channel = As<UShort4>(channel) >> 4;
+		}
+
+		// These are build-time loops: they emit one lookup per lane, channel
+		// by channel. Table entries are 2 bytes wide, hence the '2 *'.
+		for(int i = 0; i < 3; i++)
+		{
+			Short4 &channel = *channels[i];
+
+			for(int lane = 0; lane < 4; lane++)
+			{
+				channel = Insert(channel, *Pointer<Short>(table + 2 * Int(Extract(channel, lane))), lane);
+			}
+		}
+	}
+
+	// Reduces the x, y and z channels of c from 16 to 12 bits (unsigned shift
+	// right by 4) and then runs them through linearToSRGB12_16(). c.w is not
+	// shifted here.
+	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
+	{
+		Short4 *const channels[3] = { &c.x, &c.y, &c.z };
+
+		for(int i = 0; i < 3; i++)
+		{
+			Short4 &channel = *channels[i];
+			channel = As<UShort4>(channel) >> 4;
+		}
+
+		linearToSRGB12_16(c);
+	}
+
+	// Replaces every lane of c.x, c.y and c.z with the entry of the
+	// linearToSRGB12_16 lookup table that the lane's current value indexes.
+	// The lanes are used as indices as they are, with no shift or clamp.
+	// c.w is not touched.
+	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
+	{
+		Pointer<Byte> table = constants + OFFSET(Constants,linearToSRGB12_16);
+		Short4 *const channels[3] = { &c.x, &c.y, &c.z };
+
+		// These are build-time loops: they emit one lookup per lane, channel
+		// by channel. Table entries are 2 bytes wide, hence the '2 *'.
+		for(int i = 0; i < 3; i++)
+		{
+			Short4 &channel = *channels[i];
+
+			for(int lane = 0; lane < 4; lane++)
+			{
+				channel = Insert(channel, *Pointer<Short>(table + 2 * Int(Extract(channel, lane))), lane);
+			}
+		}
+	}
+
+	// Returns an approximation of x^2.2, computed as 0.73 * x^2 + 0.27 * x^3
+	// and clamped to [0, 1]. The argument itself is not modified; callers must
+	// use the returned value.
+	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
+	{
+		Float4 squared = x * x;
+		Float4 cubed = squared * x;
+		Float4 blended = squared * Float4(0.73f) + cubed * Float4(0.27f);
+
+		Float4 clampedBelow = Max(blended, Float4(0.0f));
+
+		return Min(clampedBelow, Float4(1.0f));
+	}
+
+	// Returns true when any of the following holds: a color write mask is
+	// set, the alpha test is active, or the shader contains a kill
+	// instruction. The checks are made in that order and stop at the first
+	// one that holds.
+	bool PixelRoutine::colorUsed()
+	{
+		if(state.colorWriteMask)
+		{
+			return true;
+		}
+
+		if(state.alphaTestActive())
+		{
+			return true;
+		}
+
+		return state.shaderContainsKill;
+	}
+}
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
new file mode 100644
index 0000000..1cd076e
--- /dev/null
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -0,0 +1,93 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelRoutine_hpp
+#define sw_PixelRoutine_hpp
+
+#include "Renderer/QuadRasterizer.hpp"
+
+namespace sw
+{
+	class PixelShader;
+	class SamplerCore;
+
+	class PixelRoutine : public sw::QuadRasterizer, public ShaderCore   // Abstract: derived classes implement the pure virtual stages declared below
+	{
+	public:
+		PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader);
+
+		virtual ~PixelRoutine();
+
+	protected:
+		Float4 z[4]; // Multisampled z
+		Float4 w;    // Used as is
+		Float4 rhw;  // Reciprocal w
+
+		RegisterArray<MAX_FRAGMENT_INPUTS> v;   // Varying registers
+
+		// Depth output
+		Float4 oDepth;
+
+		typedef Shader::SourceParameter Src;
+		typedef Shader::DestinationParameter Dst;
+
+		virtual void setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w) = 0;   // The four pure virtuals are the hooks a derived class must provide
+		virtual void applyShader(Int cMask[4]) = 0;
+		virtual Bool alphaTest(Int cMask[4]) = 0;
+		virtual void rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]) = 0;
+
+		virtual void quad(Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y);
+
+		void alphaTest(Int &aMask, Short4 &alpha);   // Non-virtual overload; distinct from the pure virtual alphaTest(Int cMask[4])
+		void alphaToCoverage(Int cMask[4], Float4 &alpha);
+		void fogBlend(Vector4f &c0, Float4 &fog);
+		void pixelFog(Float4 &visibility);
+
+		// Raster operations
+		void alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x);   // Vector4s overload
+		void logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x);
+		void writeColor(int index, Pointer<Byte> &cBuffer, Int &i, Vector4s &current, Int &sMask, Int &zMask, Int &cMask);   // Vector4s overload
+		void alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x);   // Vector4f overload
+		void writeColor(int index, Pointer<Byte> &cBuffer, Int &i, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask);   // Vector4f overload
+
+		bool isSRGB(int index) const;
+		UShort4 convertFixed16(Float4 &cf, bool saturate = true);
+		void linearToSRGB12_16(Vector4s &c);   // Table lookup applied to x, y and z; w is not touched
+
+	private:
+		Float4 interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
+		void stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask);
+		void stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW);
+		void stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask);
+		void stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW);
+		Bool depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask);
+
+		// Raster operations
+		void blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive);   // Vector4s overload
+		void blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive);
+		void readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel);
+		void blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive);   // Vector4f overload
+		void blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive);
+		void writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask);
+		void writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask);
+
+		void sRGBtoLinear16_12_16(Vector4s &c);
+		void linearToSRGB16_12_16(Vector4s &c);   // Shifts x, y and z right by 4, then calls linearToSRGB12_16
+		Float4 sRGBtoLinear(const Float4 &x);   // Approximates x^2.2, limited to [0, 1]
+
+		bool colorUsed();   // Non-zero color write mask, active alpha test, or shader containing a kill
+	};
+}
+
+#endif   // sw_PixelRoutine_hpp
diff --git a/src/Pipeline/PixelShader.cpp b/src/Pipeline/PixelShader.cpp
new file mode 100644
index 0000000..d24e7c2
--- /dev/null
+++ b/src/Pipeline/PixelShader.cpp
@@ -0,0 +1,746 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "PixelShader.hpp"
+
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+	PixelShader::PixelShader(const PixelShader *ps) : Shader()   // Creates an empty shader, or a freshly analyzed copy of *ps when ps is non-null
+	{
+		shaderModel = 0x0300;
+		vPosDeclared = false;
+		vFaceDeclared = false;
+		centroid = false;
+		zOverride = kill = false;   // analyze() only runs on the copy path; without this the getters would read indeterminate values
+
+		if(ps)   // Make a copy
+		{
+			for(size_t i = 0; i < ps->getLength(); i++)
+			{
+				append(new sw::Shader::Instruction(*ps->getInstruction(i)));   // Each instruction is duplicated, not shared
+			}
+
+			memcpy(input, ps->input, sizeof(input));
+			vPosDeclared = ps->vPosDeclared;
+			vFaceDeclared = ps->vFaceDeclared;
+			usedSamplers = ps->usedSamplers;
+			optimize();
+			analyze();   // Recomputes zOverride, kill and centroid from the copied instructions
+		}
+	}
+
+	PixelShader::PixelShader(const unsigned long *token) : Shader()   // Builds the shader from a token stream
+	{
+		parse(token);
+
+		vPosDeclared = false;
+		vFaceDeclared = false;
+		centroid = false;
+
+		optimize();
+		analyze();   // Sets zOverride and kill, and may set the three flags cleared above
+	}
+
+	// PixelShader performs no cleanup of its own; the destructor is still
+	// defined out of line, next to the constructors.
+	PixelShader::~PixelShader() = default;
+
+	// Validates a pixel shader token stream. Returns 0 when the stream is null, is not a pixel
+	// shader, has a major version above 3 or uses an unsupported opcode; otherwise the instruction count.
+	int PixelShader::validate(const unsigned long *const token)
+	{
+		if(!token)
+		{
+			return 0;
+		}
+
+		const unsigned long header = token[0];
+		const unsigned short version = (unsigned short)(header & 0x0000FFFF);
+		const unsigned char majorVersion = (unsigned char)((header & 0x0000FF00) >> 8);
+		const ShaderType shaderType = (ShaderType)((header & 0xFFFF0000) >> 16);
+
+		if(majorVersion > 3 || shaderType != SHADER_PIXEL)
+		{
+			return 0;
+		}
+
+		int instructionCount = 1;
+		int i = 0;
+
+		while(token[i] != 0x0000FFFF)   // Scan until the terminating token
+		{
+			const unsigned long current = token[i];
+
+			if((current & 0x0000FFFF) == 0x0000FFFE)   // Comment token
+			{
+				i += (int)((current & 0x7FFF0000) >> 16);   // Skip as many tokens as the length field says
+			}
+			else
+			{
+				const Shader::Opcode opcode = (Shader::Opcode)(current & 0x0000FFFF);
+
+				if(opcode == Shader::OPCODE_RESERVED0 || opcode == Shader::OPCODE_MOVA)
+				{
+					return 0;   // Unsupported operation
+				}
+				instructionCount++;
+				i += size(current, version);   // Skip the tokens belonging to this instruction
+			}
+
+			i++;
+		}
+
+		return instructionCount;
+	}
+
+	bool PixelShader::depthOverride() const   // Reports the flag computed by analyzeZOverride()
+	{
+		return zOverride;
+	}
+
+	bool PixelShader::containsKill() const   // Reports the flag computed by analyzeKill()
+	{
+		return kill;
+	}
+
+	bool PixelShader::containsCentroid() const   // True once analyzeInterpolants() has seen a DCL whose destination has the centroid flag
+	{
+		return centroid;
+	}
+
+	bool PixelShader::usesDiffuse(int component) const   // Queries input[0], which holds COLOR 0 under the default mapping for shader models below 3.0
+	{
+		return input[0][component].active();
+	}
+
+	bool PixelShader::usesSpecular(int component) const   // Queries input[1], which holds COLOR 1 under the default mapping for shader models below 3.0
+	{
+		return input[1][component].active();
+	}
+
+	bool PixelShader::usesTexture(int coordinate, int component) const   // Texture coordinate sets follow the two color inputs, hence the offset of 2
+	{
+		return input[2 + coordinate][component].active();
+	}
+
+	void PixelShader::setInput(int inputIdx, int nbComponents, const sw::Shader::Semantic& semantic)   // Assigns semantic to the first nbComponents components of input[inputIdx]
+	{
+		for(int i = 0; i < nbComponents; ++i)   // Neither inputIdx nor nbComponents is range-checked here
+		{
+			input[inputIdx][i] = semantic;
+		}
+	}
+
+	const sw::Shader::Semantic& PixelShader::getInput(int inputIdx, int component) const   // Returns a reference into the input table; indices are not range-checked
+	{
+		return input[inputIdx][component];
+	}
+
+	void PixelShader::analyze()   // Runs all analysis passes in a fixed order
+	{
+		analyzeZOverride();   // Sets zOverride
+		analyzeKill();   // Sets kill
+		analyzeInterpolants();   // Updates input, vPosDeclared, vFaceDeclared and centroid
+		analyzeDirtyConstants();   // This and the following passes are not declared in PixelShader; presumably inherited from Shader
+		analyzeDynamicBranching();
+		analyzeSamplers();
+		analyzeCallSites();
+		analyzeIndirectAddressing();
+	}
+
+	// Sets zOverride when an instruction is TEXM3X2DEPTH or TEXDEPTH, or has a PARAMETER_DEPTHOUT destination.
+	void PixelShader::analyzeZOverride()
+	{
+		for(const auto &inst : instruction)
+		{
+			const bool depthOpcode = inst->opcode == Shader::OPCODE_TEXM3X2DEPTH || inst->opcode == Shader::OPCODE_TEXDEPTH;
+
+			if(depthOpcode || inst->dst.type == Shader::PARAMETER_DEPTHOUT)
+			{
+				zOverride = true;
+				return;   // A single match settles the answer
+			}
+		}
+
+		zOverride = false;   // No instruction matched
+	}
+
+	// Sets kill when the shader contains a TEXKILL or DISCARD instruction.
+	void PixelShader::analyzeKill()
+	{
+		for(const auto &inst : instruction)
+		{
+			const auto opcode = inst->opcode;
+			if(opcode == Shader::OPCODE_TEXKILL || opcode == Shader::OPCODE_DISCARD)
+			{
+				kill = true;
+				return;   // A single match settles the answer
+			}
+		}
+
+		kill = false;   // No instruction matched
+	}
+
+	void PixelShader::analyzeInterpolants()   // Below shader model 3.0: finds which input components are read; from 3.0: maps declared inputs. Then records centroid flags.
+	{
+		if(shaderModel < 0x0300)
+		{
+			// Set default mapping; disable unused interpolants below
+			input[0][0] = Semantic(Shader::USAGE_COLOR, 0);
+			input[0][1] = Semantic(Shader::USAGE_COLOR, 0);
+			input[0][2] = Semantic(Shader::USAGE_COLOR, 0);
+			input[0][3] = Semantic(Shader::USAGE_COLOR, 0);
+
+			input[1][0] = Semantic(Shader::USAGE_COLOR, 1);
+			input[1][1] = Semantic(Shader::USAGE_COLOR, 1);
+			input[1][2] = Semantic(Shader::USAGE_COLOR, 1);
+			input[1][3] = Semantic(Shader::USAGE_COLOR, 1);
+
+			for(int i = 0; i < 8; i++)   // Eight texture coordinate sets follow the two colors
+			{
+				input[2 + i][0] = Semantic(Shader::USAGE_TEXCOORD, i);
+				input[2 + i][1] = Semantic(Shader::USAGE_TEXCOORD, i);
+				input[2 + i][2] = Semantic(Shader::USAGE_TEXCOORD, i);
+				input[2 + i][3] = Semantic(Shader::USAGE_TEXCOORD, i);
+			}
+
+			Shader::SamplerType samplerType[16];   // Sampler type per sampler index, filled from instructions with a sampler destination
+
+			for(int i = 0; i < 16; i++)
+			{
+				samplerType[i] = Shader::SAMPLER_UNKNOWN;
+			}
+
+			for(const auto &inst : instruction)
+			{
+				if(inst->dst.type == Shader::PARAMETER_SAMPLER)
+				{
+					int sampler = inst->dst.index;   // Not range-checked against the 16 entries
+
+					samplerType[sampler] = inst->samplerType;
+				}
+			}
+
+			bool interpolant[MAX_FRAGMENT_INPUTS][4] = {{false}};   // Interpolants in use
+
+			for(const auto &inst : instruction)
+			{
+				if(inst->dst.type == Shader::PARAMETER_TEXTURE)   // Usage implied by a texture register destination
+				{
+					int index = inst->dst.index + 2;   // Texture registers are stored after the two color inputs
+
+					switch(inst->opcode)
+					{
+					case Shader::OPCODE_TEX:
+					case Shader::OPCODE_TEXBEM:
+					case Shader::OPCODE_TEXBEML:
+					case Shader::OPCODE_TEXCOORD:
+					case Shader::OPCODE_TEXDP3:
+					case Shader::OPCODE_TEXDP3TEX:
+					case Shader::OPCODE_TEXM3X2DEPTH:
+					case Shader::OPCODE_TEXM3X2PAD:
+					case Shader::OPCODE_TEXM3X2TEX:
+					case Shader::OPCODE_TEXM3X3:
+					case Shader::OPCODE_TEXM3X3PAD:
+					case Shader::OPCODE_TEXM3X3TEX:
+						interpolant[index][0] = true;
+						interpolant[index][1] = true;
+						interpolant[index][2] = true;
+						break;
+					case Shader::OPCODE_TEXKILL:
+						if(majorVersion < 2)
+						{
+							interpolant[index][0] = true;
+							interpolant[index][1] = true;
+							interpolant[index][2] = true;
+						}
+						else   // From major version 2 the fourth component is marked as well
+						{
+							interpolant[index][0] = true;
+							interpolant[index][1] = true;
+							interpolant[index][2] = true;
+							interpolant[index][3] = true;
+						}
+						break;
+					case Shader::OPCODE_TEXM3X3VSPEC:
+						interpolant[index][0] = true;
+						interpolant[index][1] = true;
+						interpolant[index][2] = true;
+						interpolant[index - 2][3] = true;   // Fourth component of this register and of the two preceding ones
+						interpolant[index - 1][3] = true;
+						interpolant[index - 0][3] = true;
+						break;
+					case Shader::OPCODE_DCL:
+						break;   // Ignore
+					default:   // Arithmetic instruction
+						if(shaderModel >= 0x0104)
+						{
+							ASSERT(false);
+						}
+					}
+				}
+
+				for(int argument = 0; argument < 4; argument++)   // Usage implied by each source argument
+				{
+					if(inst->src[argument].type == Shader::PARAMETER_INPUT ||
+					   inst->src[argument].type == Shader::PARAMETER_TEXTURE)
+					{
+						int index = inst->src[argument].index;
+						int swizzle = inst->src[argument].swizzle;
+						int mask = inst->dst.mask;
+
+						if(inst->src[argument].type == Shader::PARAMETER_TEXTURE)
+						{
+							index += 2;   // Same offset as for texture destinations
+						}
+
+						switch(inst->opcode)
+						{
+						case Shader::OPCODE_TEX:
+						case Shader::OPCODE_TEXLDD:
+						case Shader::OPCODE_TEXLDL:
+						case Shader::OPCODE_TEXLOD:
+						case Shader::OPCODE_TEXBIAS:
+						case Shader::OPCODE_TEXOFFSET:
+						case Shader::OPCODE_TEXOFFSETBIAS:
+						case Shader::OPCODE_TEXLODOFFSET:
+						case Shader::OPCODE_TEXELFETCH:
+						case Shader::OPCODE_TEXELFETCHOFFSET:
+						case Shader::OPCODE_TEXGRAD:
+						case Shader::OPCODE_TEXGRADOFFSET:
+							{
+								int sampler = inst->src[1].index;   // Not range-checked against the 16 entries
+
+								switch(samplerType[sampler])   // The sampler type decides how many coordinates are needed
+								{
+								case Shader::SAMPLER_UNKNOWN:
+									if(shaderModel == 0x0104)
+									{
+										if((inst->src[0].swizzle & 0x30) == 0x20)   // .xyz
+										{
+											interpolant[index][0] = true;
+											interpolant[index][1] = true;
+											interpolant[index][2] = true;
+										}
+										else   // .xyw
+										{
+											interpolant[index][0] = true;
+											interpolant[index][1] = true;
+											interpolant[index][3] = true;
+										}
+									}
+									else
+									{
+										ASSERT(false);
+									}
+									break;
+								case Shader::SAMPLER_1D:
+									interpolant[index][0] = true;
+									break;
+								case Shader::SAMPLER_2D:
+									interpolant[index][0] = true;
+									interpolant[index][1] = true;
+									break;
+								case Shader::SAMPLER_CUBE:
+									interpolant[index][0] = true;
+									interpolant[index][1] = true;
+									interpolant[index][2] = true;
+									break;
+								case Shader::SAMPLER_VOLUME:
+									interpolant[index][0] = true;
+									interpolant[index][1] = true;
+									interpolant[index][2] = true;
+									break;
+								default:
+									ASSERT(false);
+								}
+
+								if(inst->bias)   // Bias and projection both mark the fourth component
+								{
+									interpolant[index][3] = true;
+								}
+
+								if(inst->project)
+								{
+									interpolant[index][3] = true;
+								}
+
+								if(shaderModel == 0x0104 && inst->opcode == Shader::OPCODE_TEX)
+								{
+									if(inst->src[0].modifier == Shader::MODIFIER_DZ)
+									{
+										interpolant[index][2] = true;
+									}
+
+									if(inst->src[0].modifier == Shader::MODIFIER_DW)
+									{
+										interpolant[index][3] = true;
+									}
+								}
+							}
+							break;
+						case Shader::OPCODE_M3X2:   // Matrix opcodes: the second argument also marks the registers following index, one per destination mask bit
+							if(mask & 0x1)
+							{
+								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+								interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+								interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+							}
+
+							if(argument == 1)
+							{
+								if(mask & 0x2)
+								{
+									interpolant[index + 1][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+									interpolant[index + 1][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+									interpolant[index + 1][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+									interpolant[index + 1][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+								}
+							}
+							break;
+						case Shader::OPCODE_M3X3:
+							if(mask & 0x1)
+							{
+								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+								interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+								interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+							}
+
+							if(argument == 1)
+							{
+								if(mask & 0x2)
+								{
+									interpolant[index + 1][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+									interpolant[index + 1][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+									interpolant[index + 1][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+									interpolant[index + 1][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+								}
+
+								if(mask & 0x4)
+								{
+									interpolant[index + 2][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+									interpolant[index + 2][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+									interpolant[index + 2][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+									interpolant[index + 2][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+								}
+							}
+							break;
+						case Shader::OPCODE_M3X4:
+							if(mask & 0x1)
+							{
+								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+								interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+								interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+							}
+
+							if(argument == 1)
+							{
+								if(mask & 0x2)
+								{
+									interpolant[index + 1][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+									interpolant[index + 1][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+									interpolant[index + 1][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+									interpolant[index + 1][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+								}
+
+								if(mask & 0x4)
+								{
+									interpolant[index + 2][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+									interpolant[index + 2][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+									interpolant[index + 2][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+									interpolant[index + 2][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+								}
+
+								if(mask & 0x8)
+								{
+									interpolant[index + 3][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+									interpolant[index + 3][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+									interpolant[index + 3][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+									interpolant[index + 3][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+								}
+							}
+							break;
+						case Shader::OPCODE_M4X3:   // The M4 variants use the unmasked swizzle test
+							if(mask & 0x1)
+							{
+								interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+								interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+								interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+								interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+							}
+
+							if(argument == 1)
+							{
+								if(mask & 0x2)
+								{
+									interpolant[index + 1][0] |= swizzleContainsComponent(swizzle, 0);
+									interpolant[index + 1][1] |= swizzleContainsComponent(swizzle, 1);
+									interpolant[index + 1][2] |= swizzleContainsComponent(swizzle, 2);
+									interpolant[index + 1][3] |= swizzleContainsComponent(swizzle, 3);
+								}
+
+								if(mask & 0x4)
+								{
+									interpolant[index + 2][0] |= swizzleContainsComponent(swizzle, 0);
+									interpolant[index + 2][1] |= swizzleContainsComponent(swizzle, 1);
+									interpolant[index + 2][2] |= swizzleContainsComponent(swizzle, 2);
+									interpolant[index + 2][3] |= swizzleContainsComponent(swizzle, 3);
+								}
+							}
+							break;
+						case Shader::OPCODE_M4X4:
+							if(mask & 0x1)
+							{
+								interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+								interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+								interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+								interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+							}
+
+							if(argument == 1)
+							{
+								if(mask & 0x2)
+								{
+									interpolant[index + 1][0] |= swizzleContainsComponent(swizzle, 0);
+									interpolant[index + 1][1] |= swizzleContainsComponent(swizzle, 1);
+									interpolant[index + 1][2] |= swizzleContainsComponent(swizzle, 2);
+									interpolant[index + 1][3] |= swizzleContainsComponent(swizzle, 3);
+								}
+
+								if(mask & 0x4)
+								{
+									interpolant[index + 2][0] |= swizzleContainsComponent(swizzle, 0);
+									interpolant[index + 2][1] |= swizzleContainsComponent(swizzle, 1);
+									interpolant[index + 2][2] |= swizzleContainsComponent(swizzle, 2);
+									interpolant[index + 2][3] |= swizzleContainsComponent(swizzle, 3);
+								}
+
+								if(mask & 0x8)
+								{
+									interpolant[index + 3][0] |= swizzleContainsComponent(swizzle, 0);
+									interpolant[index + 3][1] |= swizzleContainsComponent(swizzle, 1);
+									interpolant[index + 3][2] |= swizzleContainsComponent(swizzle, 2);
+									interpolant[index + 3][3] |= swizzleContainsComponent(swizzle, 3);
+								}
+							}
+							break;
+						case Shader::OPCODE_CRS:   // Each destination component uses a different swizzle mask (0x6, 0x5, 0x3)
+							if(mask & 0x1)
+							{
+								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x6);
+								interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x6);
+								interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x6);
+								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x6);
+							}
+
+							if(mask & 0x2)
+							{
+								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x5);
+								interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x5);
+								interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x5);
+								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x5);
+							}
+
+							if(mask & 0x4)
+							{
+								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x3);
+								interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x3);
+								interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x3);
+								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x3);
+							}
+							break;
+						case Shader::OPCODE_DP2ADD:
+							if(argument == 0 || argument == 1)
+							{
+								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x3);
+								interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x3);
+								interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x3);
+								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x3);
+							}
+							else   // argument == 2
+							{
+								interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+								interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+								interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+								interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+							}
+							break;
+						case Shader::OPCODE_DP3:
+							interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
+							interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
+							interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
+							interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
+							break;
+						case Shader::OPCODE_DP4:
+							interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+							interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+							interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+							interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+							break;
+						case Shader::OPCODE_SINCOS:
+						case Shader::OPCODE_EXP2X:
+						case Shader::OPCODE_LOG2X:
+						case Shader::OPCODE_POWX:
+						case Shader::OPCODE_RCPX:
+						case Shader::OPCODE_RSQX:
+							interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
+							interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
+							interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
+							interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
+							break;
+						case Shader::OPCODE_NRM3:
+							interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7 | mask);
+							interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7 | mask);
+							interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7 | mask);
+							interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7 | mask);
+							break;
+						case Shader::OPCODE_MOV:   // For these opcodes the destination write mask is used as the swizzle mask
+						case Shader::OPCODE_ADD:
+						case Shader::OPCODE_SUB:
+						case Shader::OPCODE_MUL:
+						case Shader::OPCODE_MAD:
+						case Shader::OPCODE_ABS:
+						case Shader::OPCODE_CMP0:
+						case Shader::OPCODE_CND:
+						case Shader::OPCODE_FRC:
+						case Shader::OPCODE_LRP:
+						case Shader::OPCODE_MAX:
+						case Shader::OPCODE_MIN:
+						case Shader::OPCODE_CMP:
+						case Shader::OPCODE_BREAKC:
+						case Shader::OPCODE_DFDX:
+						case Shader::OPCODE_DFDY:
+							interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, mask);
+							interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, mask);
+							interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, mask);
+							interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, mask);
+							break;
+						case Shader::OPCODE_TEXCOORD:
+							interpolant[index][0] = true;
+							interpolant[index][1] = true;
+							interpolant[index][2] = true;
+							interpolant[index][3] = true;
+							break;
+						case Shader::OPCODE_TEXDP3:
+						case Shader::OPCODE_TEXDP3TEX:
+						case Shader::OPCODE_TEXM3X2PAD:
+						case Shader::OPCODE_TEXM3X3PAD:
+						case Shader::OPCODE_TEXM3X2TEX:
+						case Shader::OPCODE_TEXM3X3SPEC:
+						case Shader::OPCODE_TEXM3X3VSPEC:
+						case Shader::OPCODE_TEXBEM:
+						case Shader::OPCODE_TEXBEML:
+						case Shader::OPCODE_TEXM3X2DEPTH:
+						case Shader::OPCODE_TEXM3X3:
+						case Shader::OPCODE_TEXM3X3TEX:
+							interpolant[index][0] = true;
+							interpolant[index][1] = true;
+							interpolant[index][2] = true;
+							break;
+						case Shader::OPCODE_TEXREG2AR:
+						case Shader::OPCODE_TEXREG2GB:
+						case Shader::OPCODE_TEXREG2RGB:
+							break;   // Nothing is marked for these opcodes
+						default:
+						//	ASSERT(false);   // Refine component usage
+							interpolant[index][0] = true;
+							interpolant[index][1] = true;
+							interpolant[index][2] = true;
+							interpolant[index][3] = true;
+						}
+					}
+				}
+			}
+
+			for(int index = 0; index < MAX_FRAGMENT_INPUTS; index++)   // Reset the semantic of every component that was not marked above
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					if(!interpolant[index][component])
+					{
+						input[index][component] = Semantic();
+					}
+				}
+			}
+		}
+		else   // Shader Model 3.0 input declaration; v# indexable
+		{
+			for(const auto &inst : instruction)
+			{
+				if(inst->opcode == Shader::OPCODE_DCL)
+				{
+					if(inst->dst.type == Shader::PARAMETER_INPUT)
+					{
+						unsigned char usage = inst->usage;
+						unsigned char index = inst->usageIndex;
+						unsigned char mask = inst->dst.mask;
+						unsigned char reg = inst->dst.index;
+
+						if(mask & 0x01) input[reg][0] = Semantic(usage, index);   // One write mask bit per component
+						if(mask & 0x02) input[reg][1] = Semantic(usage, index);
+						if(mask & 0x04) input[reg][2] = Semantic(usage, index);
+						if(mask & 0x08) input[reg][3] = Semantic(usage, index);
+					}
+					else if(inst->dst.type == Shader::PARAMETER_MISCTYPE)
+					{
+						unsigned char index = inst->dst.index;
+
+						if(index == Shader::VPosIndex)
+						{
+							vPosDeclared = true;
+						}
+						else if(index == Shader::VFaceIndex)
+						{
+							vFaceDeclared = true;
+						}
+						else ASSERT(false);
+					}
+				}
+			}
+		}
+
+		if(shaderModel >= 0x0200)   // Centroid flags are taken from DCL instructions
+		{
+			for(const auto &inst : instruction)
+			{
+				if(inst->opcode == Shader::OPCODE_DCL)
+				{
+					bool centroid = inst->dst.centroid;   // Local; shadows the member of the same name
+					unsigned char reg = inst->dst.index;
+
+					switch(inst->dst.type)
+					{
+					case Shader::PARAMETER_INPUT:
+						input[reg][0].centroid = centroid;   // Only component 0 receives the flag
+						break;
+					case Shader::PARAMETER_TEXTURE:
+						input[2 + reg][0].centroid = centroid;
+						break;
+					default:
+						break;
+					}
+
+					this->centroid = this->centroid || centroid;   // Sticky: never cleared by this pass
+				}
+			}
+		}
+	}
+}
diff --git a/src/Pipeline/PixelShader.hpp b/src/Pipeline/PixelShader.hpp
new file mode 100644
index 0000000..a06aaaa
--- /dev/null
+++ b/src/Pipeline/PixelShader.hpp
@@ -0,0 +1,63 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_PixelShader_hpp
+#define sw_PixelShader_hpp
+
+#include "Shader.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+	class PixelShader : public Shader   // Shader specialization holding the per-component input semantics and the results of the pixel shader analysis passes
+	{
+	public:
+		explicit PixelShader(const PixelShader *ps = nullptr);   // Empty shader, or a copy of *ps when ps is non-null
+		explicit PixelShader(const unsigned long *token);   // Builds the shader from a token stream
+
+		virtual ~PixelShader();
+
+		static int validate(const unsigned long *const token);   // Returns number of instructions if valid
+		bool depthOverride() const;   // Value of zOverride
+		bool containsKill() const;   // Value of kill
+		bool containsCentroid() const;   // Value of centroid
+		bool usesDiffuse(int component) const;   // Queries input[0]
+		bool usesSpecular(int component) const;   // Queries input[1]
+		bool usesTexture(int coordinate, int component) const;   // Queries input[2 + coordinate]
+
+		void setInput(int inputIdx, int nbComponents, const Semantic& semantic);   // Sets the first nbComponents components of one input
+		const Semantic& getInput(int inputIdx, int component) const;
+
+		void declareVPos() { vPosDeclared = true; }
+		void declareVFace() { vFaceDeclared = true; }
+		bool isVPosDeclared() const { return vPosDeclared; }
+		bool isVFaceDeclared() const { return vFaceDeclared; }
+
+	private:
+		void analyze();   // Runs all analysis passes
+		void analyzeZOverride();   // Computes zOverride
+		void analyzeKill();   // Computes kill
+		void analyzeInterpolants();   // Computes the input mapping and the centroid flag
+
+		Semantic input[MAX_FRAGMENT_INPUTS][4];   // One semantic per input register component
+
+		bool vPosDeclared;
+		bool vFaceDeclared;
+		bool zOverride;   // Set by analyzeZOverride()
+		bool kill;   // Set by analyzeKill()
+		bool centroid;   // Set by analyzeInterpolants()
+	};
+}
+
+#endif   // sw_PixelShader_hpp
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
new file mode 100644
index 0000000..8a2aa39
--- /dev/null
+++ b/src/Pipeline/SamplerCore.cpp
@@ -0,0 +1,3035 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SamplerCore.hpp"
+
+#include "Constants.hpp"
+#include "Common/Debug.hpp"
+
+namespace
+{
+	void applySwizzle(sw::SwizzleType swizzle, sw::Short4& s, const sw::Vector4s& c)
+	{
+		// Writes into s the channel of c selected by swizzle, or a constant for ZERO/ONE.
+		// 0x1000 is the "one" constant; NOTE(review): presumably 1.0 in 4.12 fixed point.
+		// Any other swizzle value trips the assertion and leaves s untouched.
+		if(swizzle == sw::SWIZZLE_RED)        { s = c.x; }
+		else if(swizzle == sw::SWIZZLE_GREEN) { s = c.y; }
+		else if(swizzle == sw::SWIZZLE_BLUE)  { s = c.z; }
+		else if(swizzle == sw::SWIZZLE_ALPHA) { s = c.w; }
+		else if(swizzle == sw::SWIZZLE_ZERO)  { s = sw::Short4(0x0000); }
+		else if(swizzle == sw::SWIZZLE_ONE)   { s = sw::Short4(0x1000); }
+		else                                  { ASSERT(false); }
+	}
+
+	void applySwizzle(sw::SwizzleType swizzle, sw::Float4& f, const sw::Vector4f& c)
+	{
+		// Floating-point counterpart of the Short4 overload: writes into f the
+		// channel of c selected by swizzle, or an all-lanes 0.0f / 1.0f constant.
+		// Any other swizzle value trips the assertion and leaves f untouched.
+		if(swizzle == sw::SWIZZLE_RED)        { f = c.x; }
+		else if(swizzle == sw::SWIZZLE_GREEN) { f = c.y; }
+		else if(swizzle == sw::SWIZZLE_BLUE)  { f = c.z; }
+		else if(swizzle == sw::SWIZZLE_ALPHA) { f = c.w; }
+		else if(swizzle == sw::SWIZZLE_ZERO)  { f = sw::Float4(0.0f, 0.0f, 0.0f, 0.0f); }
+		else if(swizzle == sw::SWIZZLE_ONE)   { f = sw::Float4(1.0f, 1.0f, 1.0f, 1.0f); }
+		else                                  { ASSERT(false); }
+	}
+}
+
+namespace sw
+{
+	extern bool colorsDefaultToZero;
+
+	SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler::State &state) : constants(constants), state(state)
+	{   // Only initializes the members 'constants' and 'state' from the arguments of the same name; no other work is done.
+	}
+
+	Vector4s SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy)
+	{   // Convenience overload: forwards to the full overload with function = Implicit and fixed12 = true; dsx is also passed in the offset slot.
+		return sampleTexture(texture, u, v, w, q, q, dsx, dsy, (dsx), Implicit, true);   // NOTE(review): q is passed in the bias slot and the 'bias' parameter is never read here - confirm this is intentional.
+	}
+
+	Vector4s SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function, bool fixed12)
+	{   // Integer sampling path: returns four Short4 channels. When fixed12 is set the result is rescaled, missing channels are filled in and the swizzle is applied.
+		Vector4s c;
+
+		#if PERF_PROFILE
+			AddAtomic(Pointer<Long>(&profiler.texOperations), 4);
+
+			if(state.compressedFormat)
+			{
+				AddAtomic(Pointer<Long>(&profiler.compressedTex), 4);
+			}
+		#endif
+
+		if(state.textureType == TEXTURE_NULL)   // No texture: RGB = 0, alpha = "one" in the requested representation.
+		{
+			c.x = Short4(0x0000);
+			c.y = Short4(0x0000);
+			c.z = Short4(0x0000);
+
+			if(fixed12)   // FIXME: Convert to fixed12 at higher level, when required
+			{
+				c.w = Short4(0x1000);
+			}
+			else
+			{
+				c.w = Short4(0xFFFFu);   // FIXME
+			}
+		}
+		else
+		{
+			Float4 uuuu = u;   // Local copies of the coordinates; these are what the LOD and filter helpers receive by reference.
+			Float4 vvvv = v;
+			Float4 wwww = w;
+			Float4 qqqq = q;
+
+			Int face[4];
+			Float lod;
+			Float anisotropy;
+			Float4 uDelta;
+			Float4 vDelta;
+			// LOD computation is chosen by texture type: 2D-like, cube (after face selection) or 3D.
+			if(state.textureType != TEXTURE_3D)
+			{
+				if(state.textureType != TEXTURE_CUBE)
+				{
+					computeLod(texture, lod, anisotropy, uDelta, vDelta, uuuu, vvvv, bias.x, dsx, dsy, function);
+				}
+				else
+				{
+					Float4 M;
+					cubeFace(face, uuuu, vvvv, u, v, w, M);
+					computeLodCube(texture, lod, u, v, w, bias.x, dsx, dsy, M, function);
+				}
+			}
+			else
+			{
+				computeLod3D(texture, lod, uuuu, vvvv, wwww, bias.x, dsx, dsy, function);
+			}
+			// Non-float textures are filtered in integer form; float textures are filtered as floats and then converted by convertFixed12().
+			if(!hasFloatTexture())
+			{
+				c = sampleFilter(texture, uuuu, vvvv, wwww, offset, lod, anisotropy, uDelta, vDelta, face, function);
+			}
+			else
+			{
+				Vector4f cf = sampleFloatFilter(texture, uuuu, vvvv, wwww, qqqq, offset, lod, anisotropy, uDelta, vDelta, face, function);
+
+				convertFixed12(c, cf);
+			}
+
+			if(fixed12)   // Everything below (rescale, default channels, swizzle) only happens for fixed12 output.
+			{
+				if(!hasFloatTexture())   // Float textures were already converted above.
+				{
+					if(state.textureFormat == FORMAT_R5G6B5)
+					{
+						c.x = MulHigh(As<UShort4>(c.x), UShort4(0x10000000 / 0xF800));   // Scale so the channel maximum (0xF800 / 0xFC00) maps to about 0x1000.
+						c.y = MulHigh(As<UShort4>(c.y), UShort4(0x10000000 / 0xFC00));
+						c.z = MulHigh(As<UShort4>(c.z), UShort4(0x10000000 / 0xF800));
+					}
+					else
+					{
+						for(int component = 0; component < textureComponentCount(); component++)
+						{
+							if(hasUnsignedTextureComponent(component))
+							{
+								c[component] = As<UShort4>(c[component]) >> 4;   // Unsigned 16-bit value shifted down by 4 bits.
+							}
+							else
+							{
+								c[component] = c[component] >> 3;   // Signed value shifted down by 3 bits.
+							}
+						}
+					}
+				}
+
+				if(state.textureFilter != FILTER_GATHER)   // Fill in / rearrange channels the format does not provide (skipped for gather).
+				{
+					int componentCount = textureComponentCount();
+					short defaultColorValue = colorsDefaultToZero ? 0x0000 : 0x1000;
+
+					switch(state.textureFormat)
+					{
+					case FORMAT_R8_SNORM:
+					case FORMAT_G8R8_SNORM:
+					case FORMAT_X8B8G8R8_SNORM:
+					case FORMAT_A8B8G8R8_SNORM:
+					case FORMAT_R8:
+					case FORMAT_R5G6B5:
+					case FORMAT_G8R8:
+					case FORMAT_R8I:
+					case FORMAT_R8UI:
+					case FORMAT_G8R8I:
+					case FORMAT_G8R8UI:
+					case FORMAT_X8B8G8R8I:
+					case FORMAT_X8B8G8R8UI:
+					case FORMAT_A8B8G8R8I:
+					case FORMAT_A8B8G8R8UI:
+					case FORMAT_R16I:
+					case FORMAT_R16UI:
+					case FORMAT_G16R16:
+					case FORMAT_G16R16I:
+					case FORMAT_G16R16UI:
+					case FORMAT_X16B16G16R16I:
+					case FORMAT_X16B16G16R16UI:
+					case FORMAT_A16B16G16R16:
+					case FORMAT_A16B16G16R16I:
+					case FORMAT_A16B16G16R16UI:
+					case FORMAT_R32I:
+					case FORMAT_R32UI:
+					case FORMAT_G32R32I:
+					case FORMAT_G32R32UI:
+					case FORMAT_X32B32G32R32I:
+					case FORMAT_X32B32G32R32UI:
+					case FORMAT_A32B32G32R32I:
+					case FORMAT_A32B32G32R32UI:
+					case FORMAT_X8R8G8B8:
+					case FORMAT_X8B8G8R8:
+					case FORMAT_A8R8G8B8:
+					case FORMAT_A8B8G8R8:
+					case FORMAT_SRGB8_X8:
+					case FORMAT_SRGB8_A8:
+					case FORMAT_V8U8:
+					case FORMAT_Q8W8V8U8:
+					case FORMAT_X8L8V8U8:
+					case FORMAT_V16U16:
+					case FORMAT_A16W16V16U16:
+					case FORMAT_Q16W16V16U16:
+					case FORMAT_YV12_BT601:
+					case FORMAT_YV12_BT709:
+					case FORMAT_YV12_JFIF:
+						if(componentCount < 2) c.y = Short4(defaultColorValue);   // Missing green/blue take the configurable default; missing alpha is always 0x1000.
+						if(componentCount < 3) c.z = Short4(defaultColorValue);
+						if(componentCount < 4) c.w = Short4(0x1000);
+						break;
+					case FORMAT_A8:   // Alpha-only: the sampled value moves to w, RGB become 0.
+						c.w = c.x;
+						c.x = Short4(0x0000);
+						c.y = Short4(0x0000);
+						c.z = Short4(0x0000);
+						break;
+					case FORMAT_L8:   // Luminance: replicate x into y and z, alpha = 0x1000.
+					case FORMAT_L16:
+						c.y = c.x;
+						c.z = c.x;
+						c.w = Short4(0x1000);
+						break;
+					case FORMAT_A8L8:   // Luminance + alpha: y holds alpha, x is replicated into y and z.
+						c.w = c.y;
+						c.y = c.x;
+						c.z = c.x;
+						break;
+					case FORMAT_R32F:
+						c.y = Short4(defaultColorValue);   // No break: falls through so z and w are filled in as well.
+					case FORMAT_G32R32F:
+						c.z = Short4(defaultColorValue);   // No break: falls through to set w.
+					case FORMAT_X32B32G32R32F:
+					case FORMAT_X32B32G32R32F_UNSIGNED:
+						c.w = Short4(0x1000);   // No break: falls through to the plain break below.
+					case FORMAT_A32B32G32R32F:
+						break;
+					case FORMAT_D32F_LOCKABLE:   // Depth formats: x is replicated into all other channels.
+					case FORMAT_D32FS8_TEXTURE:
+					case FORMAT_D32F_SHADOW:
+					case FORMAT_D32FS8_SHADOW:
+						c.y = c.x;
+						c.z = c.x;
+						c.w = c.x;
+						break;
+					default:
+						ASSERT(false);
+					}
+				}
+				// Apply the channel swizzle only when it differs from the identity mapping.
+				if((state.swizzleR != SWIZZLE_RED) ||
+				   (state.swizzleG != SWIZZLE_GREEN) ||
+				   (state.swizzleB != SWIZZLE_BLUE) ||
+				   (state.swizzleA != SWIZZLE_ALPHA))
+				{
+					const Vector4s col(c);   // Snapshot, so each output channel reads the pre-swizzle values.
+					applySwizzle(state.swizzleR, c.x, col);
+					applySwizzle(state.swizzleG, c.y, col);
+					applySwizzle(state.swizzleB, c.z, col);
+					applySwizzle(state.swizzleA, c.w, col);
+				}
+			}
+		}
+
+		return c;
+	}
+
+	Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+	{   // Floating-point sampling entry point: returns four Float4 channels, with missing channels filled in and the swizzle applied.
+		Vector4f c;
+
+		#if PERF_PROFILE
+			AddAtomic(Pointer<Long>(&profiler.texOperations), 4);
+
+			if(state.compressedFormat)
+			{
+				AddAtomic(Pointer<Long>(&profiler.compressedTex), 4);
+			}
+		#endif
+
+		if(state.textureType == TEXTURE_NULL)   // No texture: result is (0, 0, 0, 1).
+		{
+			c.x = Float4(0.0f);
+			c.y = Float4(0.0f);
+			c.z = Float4(0.0f);
+			c.w = Float4(1.0f);
+		}
+		else
+		{
+			// FIXME: YUV is not supported by the floating point path
+			bool forceFloatFiltering = state.highPrecisionFiltering && !hasYuvFormat() && (state.textureFilter != FILTER_POINT);
+			bool seamlessCube = (state.addressingModeU == ADDRESSING_SEAMLESS);
+			bool rectangleTexture = (state.textureType == TEXTURE_RECTANGLE);
+			if(hasFloatTexture() || hasUnnormalizedIntegerTexture() || forceFloatFiltering || seamlessCube || rectangleTexture)   // FIXME: Mostly identical to integer sampling
+			{
+				Float4 uuuu = u;   // Local copies of the coordinates; these are what the LOD and filter helpers receive by reference.
+				Float4 vvvv = v;
+				Float4 wwww = w;
+				Float4 qqqq = q;
+
+				Int face[4];
+				Float lod;
+				Float anisotropy;
+				Float4 uDelta;
+				Float4 vDelta;
+				// LOD computation is chosen by texture type: 2D-like, cube (after face selection) or 3D.
+				if(state.textureType != TEXTURE_3D)
+				{
+					if(state.textureType != TEXTURE_CUBE)
+					{
+						computeLod(texture, lod, anisotropy, uDelta, vDelta, uuuu, vvvv, bias.x, dsx, dsy, function);
+					}
+					else
+					{
+						Float4 M;
+						cubeFace(face, uuuu, vvvv, u, v, w, M);
+						computeLodCube(texture, lod, u, v, w, bias.x, dsx, dsy, M, function);
+					}
+				}
+				else
+				{
+					computeLod3D(texture, lod, uuuu, vvvv, wwww, bias.x, dsx, dsy, function);
+				}
+
+				c = sampleFloatFilter(texture, uuuu, vvvv, wwww, qqqq, offset, lod, anisotropy, uDelta, vDelta, face, function);
+
+				if(!hasFloatTexture() && !hasUnnormalizedIntegerTexture())   // Normalized integer data that went through the float filter still needs scaling.
+				{
+					if(has16bitTextureFormat())
+					{
+						switch(state.textureFormat)
+						{
+						case FORMAT_R5G6B5:
+							c.x *= Float4(1.0f / 0xF800);   // Divide by the per-channel maximum (0xF800 for 5-bit, 0xFC00 for 6-bit).
+							c.y *= Float4(1.0f / 0xFC00);
+							c.z *= Float4(1.0f / 0xF800);
+							break;
+						default:
+							ASSERT(false);
+						}
+					}
+					else
+					{
+						for(int component = 0; component < textureComponentCount(); component++)
+						{
+							c[component] *= Float4(hasUnsignedTextureComponent(component) ? 1.0f / 0xFFFF : 1.0f / 0x7FFF);   // Unsigned: divide by 0xFFFF; signed: divide by 0x7FFF.
+						}
+					}
+				}
+			}
+			else   // Otherwise reuse the integer path (fixed12 = false) and convert its result to float.
+			{
+				Vector4s cs = sampleTexture(texture, u, v, w, q, bias, dsx, dsy, offset, function, false);
+
+				if(state.textureFormat ==  FORMAT_R5G6B5)
+				{
+					c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
+					c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
+					c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
+				}
+				else
+				{
+					for(int component = 0; component < textureComponentCount(); component++)
+					{
+						if(hasUnsignedTextureComponent(component))
+						{
+							convertUnsigned16(c[component], cs[component]);
+						}
+						else
+						{
+							convertSigned15(c[component], cs[component]);
+						}
+					}
+				}
+			}
+
+			int componentCount = textureComponentCount();
+			float defaultColorValue = colorsDefaultToZero ? 0.0f : 1.0f;
+
+			if(state.textureFilter != FILTER_GATHER)   // Fill in / rearrange channels the format does not provide (skipped for gather).
+			{
+				switch(state.textureFormat)
+				{
+				case FORMAT_R8I:
+				case FORMAT_R8UI:
+				case FORMAT_R16I:
+				case FORMAT_R16UI:
+				case FORMAT_R32I:
+				case FORMAT_R32UI:
+					c.y = As<Float4>(UInt4(0));   // Integer bit pattern 0 stored in the float lanes. No break: falls through.
+				case FORMAT_G8R8I:
+				case FORMAT_G8R8UI:
+				case FORMAT_G16R16I:
+				case FORMAT_G16R16UI:
+				case FORMAT_G32R32I:
+				case FORMAT_G32R32UI:
+					c.z = As<Float4>(UInt4(0));   // No break: falls through.
+				case FORMAT_X8B8G8R8I:
+				case FORMAT_X8B8G8R8UI:
+				case FORMAT_X16B16G16R16I:
+				case FORMAT_X16B16G16R16UI:
+				case FORMAT_X32B32G32R32I:
+				case FORMAT_X32B32G32R32UI:
+					c.w = As<Float4>(UInt4(1));   // Integer bit pattern 1 (not the float 1.0f). No break: falls through.
+				case FORMAT_A8B8G8R8I:
+				case FORMAT_A8B8G8R8UI:
+				case FORMAT_A16B16G16R16I:
+				case FORMAT_A16B16G16R16UI:
+				case FORMAT_A32B32G32R32I:
+				case FORMAT_A32B32G32R32UI:
+					break;
+				case FORMAT_R8_SNORM:
+				case FORMAT_G8R8_SNORM:
+				case FORMAT_X8B8G8R8_SNORM:
+				case FORMAT_A8B8G8R8_SNORM:
+				case FORMAT_R8:
+				case FORMAT_R5G6B5:
+				case FORMAT_G8R8:
+				case FORMAT_G16R16:
+				case FORMAT_A16B16G16R16:
+				case FORMAT_X8R8G8B8:
+				case FORMAT_X8B8G8R8:
+				case FORMAT_A8R8G8B8:
+				case FORMAT_A8B8G8R8:
+				case FORMAT_SRGB8_X8:
+				case FORMAT_SRGB8_A8:
+				case FORMAT_V8U8:
+				case FORMAT_Q8W8V8U8:
+				case FORMAT_X8L8V8U8:
+				case FORMAT_V16U16:
+				case FORMAT_A16W16V16U16:
+				case FORMAT_Q16W16V16U16:
+				case FORMAT_YV12_BT601:
+				case FORMAT_YV12_BT709:
+				case FORMAT_YV12_JFIF:
+					if(componentCount < 2) c.y = Float4(defaultColorValue);   // Missing green/blue take the configurable default; missing alpha is always 1.0f.
+					if(componentCount < 3) c.z = Float4(defaultColorValue);
+					if(componentCount < 4) c.w = Float4(1.0f);
+					break;
+				case FORMAT_A8:   // Alpha-only: the sampled value moves to w, RGB become 0.
+					c.w = c.x;
+					c.x = Float4(0.0f);
+					c.y = Float4(0.0f);
+					c.z = Float4(0.0f);
+					break;
+				case FORMAT_L8:   // Luminance: replicate x into y and z, alpha = 1.0f.
+				case FORMAT_L16:
+					c.y = c.x;
+					c.z = c.x;
+					c.w = Float4(1.0f);
+					break;
+				case FORMAT_A8L8:   // Luminance + alpha: y holds alpha, x is replicated into y and z.
+					c.w = c.y;
+					c.y = c.x;
+					c.z = c.x;
+					break;
+				case FORMAT_R32F:
+					c.y = Float4(defaultColorValue);   // No break: falls through so z and w are filled in as well.
+				case FORMAT_G32R32F:
+					c.z = Float4(defaultColorValue);   // No break: falls through to set w.
+				case FORMAT_X32B32G32R32F:
+				case FORMAT_X32B32G32R32F_UNSIGNED:
+					c.w = Float4(1.0f);   // No break: falls through to the plain break below.
+				case FORMAT_A32B32G32R32F:
+					break;
+				case FORMAT_D32F_LOCKABLE:   // Depth formats yield (x, 0, 0, 1). NOTE(review): the Vector4s overload replicates x into y/z/w instead - confirm the difference is intended.
+				case FORMAT_D32FS8_TEXTURE:
+				case FORMAT_D32F_SHADOW:
+				case FORMAT_D32FS8_SHADOW:
+					c.y = Float4(0.0f);
+					c.z = Float4(0.0f);
+					c.w = Float4(1.0f);
+					break;
+				default:
+					ASSERT(false);
+				}
+			}
+			// Apply the channel swizzle only when it differs from the identity mapping.
+			if((state.swizzleR != SWIZZLE_RED) ||
+			   (state.swizzleG != SWIZZLE_GREEN) ||
+			   (state.swizzleB != SWIZZLE_BLUE) ||
+			   (state.swizzleA != SWIZZLE_ALPHA))
+			{
+				const Vector4f col(c);   // Snapshot, so each output channel reads the pre-swizzle values.
+				applySwizzle(state.swizzleR, c.x, col);
+				applySwizzle(state.swizzleG, c.y, col);
+				applySwizzle(state.swizzleB, c.z, col);
+				applySwizzle(state.swizzleA, c.w, col);
+			}
+		}
+
+		return c;
+	}
+
+	Vector4f SamplerCore::textureSize(Pointer<Byte> &texture, Float4 &lod)
+	{   // Returns, per lane, the width/height/depth of the mipmap selected by that lane of lod. The values are integers bit-cast into the float lanes; size.w is not written.
+		Vector4f size;
+		Int baseLevel = *Pointer<Int>(texture + OFFSET(Texture, baseLevel));   // Same for every lane, so it is loaded once, ahead of the loop.
+		for(int i = 0; i < 4; ++i)
+		{
+			// Lane i of lod is reinterpreted as an integer level and offset by baseLevel to index the Mipmap array.
+			Pointer<Byte> mipmap = texture + OFFSET(Texture, mipmap) + (As<Int>(Extract(lod, i)) + baseLevel) * sizeof(Mipmap);
+			size.x = Insert(size.x, As<Float>(Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, width)))), i);
+			size.y = Insert(size.y, As<Float>(Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, height)))), i);
+			size.z = Insert(size.z, As<Float>(Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, depth)))), i);
+		}
+
+		return size;
+	}
+
+	void SamplerCore::border(Short4 &mask, Float4 &coordinates)
+	{   // Builds a per-lane 16-bit mask from the comparison |coordinate - 0.5| < 0.5, i.e. set for coordinates strictly inside (0, 1).
+		Int4 border = As<Int4>(CmpLT(Abs(coordinates - Float4(0.5f)), Float4(0.5f)));
+		mask = As<Short4>(Int2(As<Int4>(PackSigned(border, border))));   // Narrow the four 32-bit lane masks to four 16-bit lanes.
+	}
+
+	void SamplerCore::border(Int4 &mask, Float4 &coordinates)
+	{   // 32-bit variant: mask is the per-lane result of |coordinate - 0.5| < 0.5, i.e. set for coordinates strictly inside (0, 1).
+		mask = As<Int4>(CmpLT(Abs(coordinates - Float4(0.5f)), Float4(0.5f)));
+	}
+
+	Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
+	{   // Returns uvw displaced by 'count' times the Short4 step stored at mipmap + halfOffset (count is one of -1, 0, +1, 2).
+		Short4 offset = *Pointer<Short4>(mipmap + halfOffset);
+		// Mixed min/mag filters: the step is masked to zero on one side of lod == 0, which makes all taps coincide there.
+		if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+		{
+			offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f)));   // Keep the step only where lod > 0.
+		}
+		else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
+		{
+			offset &= Short4(CmpLE(Float4(lod), Float4(0.0f)));   // Keep the step only where lod <= 0.
+		}
+
+		if(wrap)   // Plain 16-bit add/subtract.
+		{
+			switch(count)
+			{
+			case -1: return uvw - offset;
+			case  0: return uvw;
+			case +1: return uvw + offset;
+			case  2: return uvw + offset + offset;
+			}
+		}
+		else   // Clamp or mirror
+		{
+			switch(count)   // Unsigned saturating add/subtract, so the coordinate stays within the 16-bit range.
+			{
+			case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
+			case  0: return uvw;
+			case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
+			case  2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
+			}
+		}
+		// Any other count value: no displacement.
+		return uvw;
+	}
+
+	Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function)
+	{   // Integer filtering front end: samples the first mip level, optionally blends in the second one (MIPMAP_LINEAR) and finally applies border colors. Note that lod is modified in place.
+		Vector4s c = sampleAniso(texture, u, v, w, offset, lod, anisotropy, uDelta, vDelta, face, false, function);
+
+		if(function == Fetch)   // Texel fetch: no mip blending and no border handling.
+		{
+			return c;
+		}
+
+		if(state.mipmapFilter == MIPMAP_LINEAR)
+		{
+			Vector4s cc = sampleAniso(texture, u, v, w, offset, lod, anisotropy, uDelta, vDelta, face, true, function);
+
+			lod *= Float(1 << 16);   // Scale to a 16-bit weight; NOTE(review): presumably lod holds the fractional blend factor at this point - confirm in computeLod/selectMipmap.
+
+			UShort4 utri = UShort4(Float4(lod));   // FIXME: Optimize
+			Short4 stri = utri >> 1;   // FIXME: Optimize
+			// Weight the second-level sample by utri (signed components use the halved weight stri).
+			if(hasUnsignedTextureComponent(0)) cc.x = MulHigh(As<UShort4>(cc.x), utri); else cc.x = MulHigh(cc.x, stri);
+			if(hasUnsignedTextureComponent(1)) cc.y = MulHigh(As<UShort4>(cc.y), utri); else cc.y = MulHigh(cc.y, stri);
+			if(hasUnsignedTextureComponent(2)) cc.z = MulHigh(As<UShort4>(cc.z), utri); else cc.z = MulHigh(cc.z, stri);
+			if(hasUnsignedTextureComponent(3)) cc.w = MulHigh(As<UShort4>(cc.w), utri); else cc.w = MulHigh(cc.w, stri);
+			// Complementary weights for the first-level sample.
+			utri = ~utri;
+			stri = Short4(0x7FFF) - stri;
+
+			if(hasUnsignedTextureComponent(0)) c.x = MulHigh(As<UShort4>(c.x), utri); else c.x = MulHigh(c.x, stri);
+			if(hasUnsignedTextureComponent(1)) c.y = MulHigh(As<UShort4>(c.y), utri); else c.y = MulHigh(c.y, stri);
+			if(hasUnsignedTextureComponent(2)) c.z = MulHigh(As<UShort4>(c.z), utri); else c.z = MulHigh(c.z, stri);
+			if(hasUnsignedTextureComponent(3)) c.w = MulHigh(As<UShort4>(c.w), utri); else c.w = MulHigh(c.w, stri);
+
+			c.x += cc.x;
+			c.y += cc.y;
+			c.z += cc.z;
+			c.w += cc.w;
+			// Signed components were blended with halved weights, so double the sum.
+			if(!hasUnsignedTextureComponent(0)) c.x += c.x;
+			if(!hasUnsignedTextureComponent(1)) c.y += c.y;
+			if(!hasUnsignedTextureComponent(2)) c.z += c.z;
+			if(!hasUnsignedTextureComponent(3)) c.w += c.w;
+		}
+
+		Short4 borderMask;   // Only assigned and read when at least one axis uses ADDRESSING_BORDER.
+
+		if(state.addressingModeU == ADDRESSING_BORDER)
+		{
+			Short4 u0;
+
+			border(u0, u);
+
+			borderMask = u0;
+		}
+
+		if(state.addressingModeV == ADDRESSING_BORDER)
+		{
+			Short4 v0;
+
+			border(v0, v);
+
+			if(state.addressingModeU == ADDRESSING_BORDER)
+			{
+				borderMask &= v0;
+			}
+			else
+			{
+				borderMask = v0;
+			}
+		}
+
+		if(state.addressingModeW == ADDRESSING_BORDER && state.textureType == TEXTURE_3D)
+		{
+			Short4 s0;
+
+			border(s0, w);
+
+			if(state.addressingModeU == ADDRESSING_BORDER ||
+			   state.addressingModeV == ADDRESSING_BORDER)
+			{
+				borderMask &= s0;
+			}
+			else
+			{
+				borderMask = s0;
+			}
+		}
+
+		if(state.addressingModeU == ADDRESSING_BORDER ||
+		   state.addressingModeV == ADDRESSING_BORDER ||
+		   (state.addressingModeW == ADDRESSING_BORDER && state.textureType == TEXTURE_3D))
+		{
+			// Lanes whose mask is clear take the texture's border color; signed components use the stored color shifted right by one.
+
+			c.x = (borderMask & c.x) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[0])) >> (hasUnsignedTextureComponent(0) ? 0 : 1)));
+			c.y = (borderMask & c.y) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[1])) >> (hasUnsignedTextureComponent(1) ? 0 : 1)));
+			c.z = (borderMask & c.z) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[2])) >> (hasUnsignedTextureComponent(2) ? 0 : 1)));
+			c.w = (borderMask & c.w) | (~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[3])) >> (hasUnsignedTextureComponent(3) ? 0 : 1)));
+		}
+
+		return c;
+	}
+
+	Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function)
+	{   // Takes one sampleQuad() tap, or - for FILTER_ANISOTROPIC with a function other than Lod/Fetch - a weighted sum of several taps along (uDelta, vDelta).
+		Vector4s c;
+
+		if(state.textureFilter != FILTER_ANISOTROPIC || function == Lod || function == Fetch)
+		{
+			c = sampleQuad(texture, u, v, w, offset, lod, face, secondLOD, function);
+		}
+		else
+		{
+			Int a = RoundInt(anisotropy);   // Number of taps; also the index into the constant tables below.
+
+			Vector4s cSum;
+
+			cSum.x = Short4(0);
+			cSum.y = Short4(0);
+			cSum.z = Short4(0);
+			cSum.w = Short4(0);
+			// Per-tap-count constants: A scales the step, B positions the first tap, cw is the per-tap color weight.
+			Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a);
+			Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a);
+			UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants,cWeight) + 8 * a);
+			Short4 sw = Short4(cw >> 1);   // Halved weight for signed components. Note: this local shadows the enclosing namespace name 'sw'.
+
+			Float4 du = uDelta;
+			Float4 dv = vDelta;
+
+			Float4 u0 = u + B * du;   // First tap position.
+			Float4 v0 = v + B * dv;
+
+			du *= A;   // Step between consecutive taps.
+			dv *= A;
+
+			Int i = 0;
+
+			Do
+			{
+				c = sampleQuad(texture, u0, v0, w, offset, lod, face, secondLOD, function);
+
+				u0 += du;
+				v0 += dv;
+				// Accumulate the weighted tap.
+				if(hasUnsignedTextureComponent(0)) cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw)); else cSum.x += MulHigh(c.x, sw);
+				if(hasUnsignedTextureComponent(1)) cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw)); else cSum.y += MulHigh(c.y, sw);
+				if(hasUnsignedTextureComponent(2)) cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw)); else cSum.z += MulHigh(c.z, sw);
+				if(hasUnsignedTextureComponent(3)) cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw)); else cSum.w += MulHigh(c.w, sw);
+
+				i++;
+			}
+			Until(i >= a)
+			// Signed components were accumulated with halved weights, so double them (with saturation).
+			if(hasUnsignedTextureComponent(0)) c.x = cSum.x; else c.x = AddSat(cSum.x, cSum.x);
+			if(hasUnsignedTextureComponent(1)) c.y = cSum.y; else c.y = AddSat(cSum.y, cSum.y);
+			if(hasUnsignedTextureComponent(2)) c.z = cSum.z; else c.z = AddSat(cSum.z, cSum.z);
+			if(hasUnsignedTextureComponent(3)) c.w = cSum.w; else c.w = AddSat(cSum.w, cSum.w);
+		}
+
+		return c;
+	}
+
+	Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+	{
+		// Dispatch on texture type: 3D textures go to sample3D() (which takes no
+		// face array), everything else goes to sampleQuad2D().
+		if(state.textureType == TEXTURE_3D)
+		{
+			return sample3D(texture, u, v, w, offset, lod, secondLOD, function);
+		}
+
+		return sampleQuad2D(texture, u, v, w, offset, lod, face, secondLOD, function);
+	}
+
+	Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+	{   // Samples one mip level of a non-3D texture: a single texel for point filtering or fetch, otherwise four texels that are either blended bilinearly or returned individually (gather).
+		Vector4s c;
+
+		int componentCount = textureComponentCount();
+		bool gather = state.textureFilter == FILTER_GATHER;
+
+		Pointer<Byte> mipmap;
+		Pointer<Byte> buffer[4];
+
+		selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+		bool texelFetch = (function == Fetch);
+		// For fetch the coordinate bits are reinterpreted as integers; otherwise address() applies the addressing mode.
+		Short4 uuuu = texelFetch ? Short4(As<Int4>(u)) : address(u, state.addressingModeU, mipmap);
+		Short4 vvvv = texelFetch ? Short4(As<Int4>(v)) : address(v, state.addressingModeV, mipmap);
+		Short4 wwww = texelFetch ? Short4(As<Int4>(w)) : address(w, state.addressingModeW, mipmap);
+
+		if(state.textureFilter == FILTER_POINT || texelFetch)
+		{
+			c = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, buffer, function);
+		}
+		else
+		{
+			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 0 : -1, lod);   // Tap coordinates: -1/+1 steps for filtering, 0/+2 steps for gather.
+			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 0 : -1, lod);
+			Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, gather ? 2 : +1, lod);
+			Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, gather ? 2 : +1, lod);
+			// c0 = (u0, v0), c1 = (u1, v0), c2 = (u0, v1), c3 = (u1, v1).
+			Vector4s c0 = sampleTexel(uuuu0, vvvv0, wwww, offset, mipmap, buffer, function);
+			Vector4s c1 = sampleTexel(uuuu1, vvvv0, wwww, offset, mipmap, buffer, function);
+			Vector4s c2 = sampleTexel(uuuu0, vvvv1, wwww, offset, mipmap, buffer, function);
+			Vector4s c3 = sampleTexel(uuuu1, vvvv1, wwww, offset, mipmap, buffer, function);
+
+			if(!gather)   // Blend
+			{
+				// Fractions
+				UShort4 f0u = As<UShort4>(uuuu0) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,width));   // NOTE(review): presumably the 16-bit product keeps only the sub-texel fraction - confirm UShort4 multiply semantics.
+				UShort4 f0v = As<UShort4>(vvvv0) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,height));
+
+				UShort4 f1u = ~f0u;   // Complementary fractions.
+				UShort4 f1v = ~f0v;
+
+				UShort4 f0u0v = MulHigh(f0u, f0v);
+				UShort4 f1u0v = MulHigh(f1u, f0v);
+				UShort4 f0u1v = MulHigh(f0u, f1v);
+				UShort4 f1u1v = MulHigh(f1u, f1v);
+
+				// Signed fractions
+				Short4 f1u1vs;
+				Short4 f0u1vs;
+				Short4 f1u0vs;
+				Short4 f0u0vs;
+
+				if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
+				{
+					f1u1vs = f1u1v >> 1;   // Halved so the weights fit a signed 16-bit multiply; compensated by the AddSat doubling below.
+					f0u1vs = f0u1v >> 1;
+					f1u0vs = f1u0v >> 1;
+					f0u0vs = f0u0v >> 1;
+				}
+
+				// Bilinear interpolation
+				if(componentCount >= 1)
+				{
+					if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))   // 16-bit unsigned: interpolate along u for both rows, then along v.
+					{
+						c0.x = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0u) + MulHigh(As<UShort4>(c1.x), f0u);
+						c2.x = As<UShort4>(c2.x) - MulHigh(As<UShort4>(c2.x), f0u) + MulHigh(As<UShort4>(c3.x), f0u);
+						c.x  = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0v) + MulHigh(As<UShort4>(c2.x), f0v);
+					}
+					else   // Otherwise: weight each of the four taps and sum them.
+					{
+						if(hasUnsignedTextureComponent(0))
+						{
+							c0.x = MulHigh(As<UShort4>(c0.x), f1u1v);
+							c1.x = MulHigh(As<UShort4>(c1.x), f0u1v);
+							c2.x = MulHigh(As<UShort4>(c2.x), f1u0v);
+							c3.x = MulHigh(As<UShort4>(c3.x), f0u0v);
+						}
+						else
+						{
+							c0.x = MulHigh(c0.x, f1u1vs);
+							c1.x = MulHigh(c1.x, f0u1vs);
+							c2.x = MulHigh(c2.x, f1u0vs);
+							c3.x = MulHigh(c3.x, f0u0vs);
+						}
+
+						c.x = (c0.x + c1.x) + (c2.x + c3.x);
+						if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x);   // Correct for signed fractions
+					}
+				}
+
+				if(componentCount >= 2)   // Same scheme for the y component.
+				{
+					if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
+					{
+						c0.y = As<UShort4>(c0.y) - MulHigh(As<UShort4>(c0.y), f0u) + MulHigh(As<UShort4>(c1.y), f0u);
+						c2.y = As<UShort4>(c2.y) - MulHigh(As<UShort4>(c2.y), f0u) + MulHigh(As<UShort4>(c3.y), f0u);
+						c.y  = As<UShort4>(c0.y) - MulHigh(As<UShort4>(c0.y), f0v) + MulHigh(As<UShort4>(c2.y), f0v);
+					}
+					else
+					{
+						if(hasUnsignedTextureComponent(1))
+						{
+							c0.y = MulHigh(As<UShort4>(c0.y), f1u1v);
+							c1.y = MulHigh(As<UShort4>(c1.y), f0u1v);
+							c2.y = MulHigh(As<UShort4>(c2.y), f1u0v);
+							c3.y = MulHigh(As<UShort4>(c3.y), f0u0v);
+						}
+						else
+						{
+							c0.y = MulHigh(c0.y, f1u1vs);
+							c1.y = MulHigh(c1.y, f0u1vs);
+							c2.y = MulHigh(c2.y, f1u0vs);
+							c3.y = MulHigh(c3.y, f0u0vs);
+						}
+
+						c.y = (c0.y + c1.y) + (c2.y + c3.y);
+						if(!hasUnsignedTextureComponent(1)) c.y = AddSat(c.y, c.y);   // Correct for signed fractions
+					}
+				}
+
+				if(componentCount >= 3)   // Same scheme for the z component.
+				{
+					if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
+					{
+						c0.z = As<UShort4>(c0.z) - MulHigh(As<UShort4>(c0.z), f0u) + MulHigh(As<UShort4>(c1.z), f0u);
+						c2.z = As<UShort4>(c2.z) - MulHigh(As<UShort4>(c2.z), f0u) + MulHigh(As<UShort4>(c3.z), f0u);
+						c.z  = As<UShort4>(c0.z) - MulHigh(As<UShort4>(c0.z), f0v) + MulHigh(As<UShort4>(c2.z), f0v);
+					}
+					else
+					{
+						if(hasUnsignedTextureComponent(2))
+						{
+							c0.z = MulHigh(As<UShort4>(c0.z), f1u1v);
+							c1.z = MulHigh(As<UShort4>(c1.z), f0u1v);
+							c2.z = MulHigh(As<UShort4>(c2.z), f1u0v);
+							c3.z = MulHigh(As<UShort4>(c3.z), f0u0v);
+						}
+						else
+						{
+							c0.z = MulHigh(c0.z, f1u1vs);
+							c1.z = MulHigh(c1.z, f0u1vs);
+							c2.z = MulHigh(c2.z, f1u0vs);
+							c3.z = MulHigh(c3.z, f0u0vs);
+						}
+
+						c.z = (c0.z + c1.z) + (c2.z + c3.z);
+						if(!hasUnsignedTextureComponent(2)) c.z = AddSat(c.z, c.z);   // Correct for signed fractions
+					}
+				}
+
+				if(componentCount >= 4)   // Same scheme for the w component.
+				{
+					if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
+					{
+						c0.w = As<UShort4>(c0.w) - MulHigh(As<UShort4>(c0.w), f0u) + MulHigh(As<UShort4>(c1.w), f0u);
+						c2.w = As<UShort4>(c2.w) - MulHigh(As<UShort4>(c2.w), f0u) + MulHigh(As<UShort4>(c3.w), f0u);
+						c.w  = As<UShort4>(c0.w) - MulHigh(As<UShort4>(c0.w), f0v) + MulHigh(As<UShort4>(c2.w), f0v);
+					}
+					else
+					{
+						if(hasUnsignedTextureComponent(3))
+						{
+							c0.w = MulHigh(As<UShort4>(c0.w), f1u1v);
+							c1.w = MulHigh(As<UShort4>(c1.w), f0u1v);
+							c2.w = MulHigh(As<UShort4>(c2.w), f1u0v);
+							c3.w = MulHigh(As<UShort4>(c3.w), f0u0v);
+						}
+						else
+						{
+							c0.w = MulHigh(c0.w, f1u1vs);
+							c1.w = MulHigh(c1.w, f0u1vs);
+							c2.w = MulHigh(c2.w, f1u0vs);
+							c3.w = MulHigh(c3.w, f0u0vs);
+						}
+
+						c.w = (c0.w + c1.w) + (c2.w + c3.w);
+						if(!hasUnsignedTextureComponent(3)) c.w = AddSat(c.w, c.w);   // Correct for signed fractions
+					}
+				}
+			}
+			else   // Gather: return the x component of each of the four taps, in the order c1, c2, c3, c0.
+			{
+				c.x = c1.x;
+				c.y = c2.x;
+				c.z = c3.x;
+				c.w = c0.x;
+			}
+		}
+
+		return c;
+	}
+
+	Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function)
+	{   // Samples one mip level of a 3D texture: a single texel for point filtering or fetch, otherwise a weighted sum of the eight surrounding texels.
+		Vector4s c_;
+
+		int componentCount = textureComponentCount();
+
+		Pointer<Byte> mipmap;
+		Pointer<Byte> buffer[4];
+		Int face[4];   // Passed to selectMipmap() without being initialized in this function.
+
+		selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+		bool texelFetch = (function == Fetch);
+		// For fetch the coordinate bits are reinterpreted as integers; otherwise address() applies the addressing mode.
+		Short4 uuuu = texelFetch ? Short4(As<Int4>(u_)) : address(u_, state.addressingModeU, mipmap);
+		Short4 vvvv = texelFetch ? Short4(As<Int4>(v_)) : address(v_, state.addressingModeV, mipmap);
+		Short4 wwww = texelFetch ? Short4(As<Int4>(w_)) : address(w_, state.addressingModeW, mipmap);
+
+		if(state.textureFilter == FILTER_POINT || texelFetch)
+		{
+			c_ = sampleTexel(uuuu, vvvv, wwww, offset, mipmap, buffer, function);
+		}
+		else
+		{
+			Vector4s c[2][2][2];
+
+			Short4 u[2][2][2];
+			Short4 v[2][2][2];
+			Short4 s[2][2][2];
+			// Coordinates of the 2x2x2 taps: index 0 is a -1 step, index 1 a +1 step along the respective axis.
+			for(int i = 0; i < 2; i++)
+			{
+				for(int j = 0; j < 2; j++)
+				{
+					for(int k = 0; k < 2; k++)
+					{
+						u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod);
+						v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
+						s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap,wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
+					}
+				}
+			}
+
+			// Fractions
+			UShort4 f0u = As<UShort4>(u[0][0][0]) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,width));   // NOTE(review): presumably the 16-bit product keeps only the sub-texel fraction - confirm UShort4 multiply semantics.
+			UShort4 f0v = As<UShort4>(v[0][0][0]) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,height));
+			UShort4 f0s = As<UShort4>(s[0][0][0]) * *Pointer<UShort4>(mipmap + OFFSET(Mipmap,depth));
+
+			UShort4 f1u = ~f0u;   // Complementary fractions.
+			UShort4 f1v = ~f0v;
+			UShort4 f1s = ~f0s;
+
+			UShort4 f[2][2][2];
+			Short4 fs[2][2][2];
+			// First the u*v products (identical for both values of the third index) ...
+			f[1][1][1] = MulHigh(f1u, f1v);
+			f[0][1][1] = MulHigh(f0u, f1v);
+			f[1][0][1] = MulHigh(f1u, f0v);
+			f[0][0][1] = MulHigh(f0u, f0v);
+			f[1][1][0] = MulHigh(f1u, f1v);
+			f[0][1][0] = MulHigh(f0u, f1v);
+			f[1][0][0] = MulHigh(f1u, f0v);
+			f[0][0][0] = MulHigh(f0u, f0v);
+			// ... then multiplied by the depth fraction: f1s where the third index is 1, f0s where it is 0.
+			f[1][1][1] = MulHigh(f[1][1][1], f1s);
+			f[0][1][1] = MulHigh(f[0][1][1], f1s);
+			f[1][0][1] = MulHigh(f[1][0][1], f1s);
+			f[0][0][1] = MulHigh(f[0][0][1], f1s);
+			f[1][1][0] = MulHigh(f[1][1][0], f0s);
+			f[0][1][0] = MulHigh(f[0][1][0], f0s);
+			f[1][0][0] = MulHigh(f[1][0][0], f0s);
+			f[0][0][0] = MulHigh(f[0][0][0], f0s);
+
+			// Signed fractions
+			if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
+			{
+				fs[0][0][0] = f[0][0][0] >> 1;   // Halved weights for signed components; compensated by the AddSat doubling at the end.
+				fs[0][0][1] = f[0][0][1] >> 1;
+				fs[0][1][0] = f[0][1][0] >> 1;
+				fs[0][1][1] = f[0][1][1] >> 1;
+				fs[1][0][0] = f[1][0][0] >> 1;
+				fs[1][0][1] = f[1][0][1] >> 1;
+				fs[1][1][0] = f[1][1][0] >> 1;
+				fs[1][1][1] = f[1][1][1] >> 1;
+			}
+			// Fetch each tap, weight it with the opposite-corner fraction f[1-i][1-j][1-k], and accumulate into c[0][0][0].
+			for(int i = 0; i < 2; i++)
+			{
+				for(int j = 0; j < 2; j++)
+				{
+					for(int k = 0; k < 2; k++)
+					{
+						c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], offset, mipmap, buffer, function);
+
+						if(componentCount >= 1) { if(hasUnsignedTextureComponent(0)) c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]); else c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]); }
+						if(componentCount >= 2) { if(hasUnsignedTextureComponent(1)) c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]); else c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]); }
+						if(componentCount >= 3) { if(hasUnsignedTextureComponent(2)) c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]); else c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]); }
+						if(componentCount >= 4) { if(hasUnsignedTextureComponent(3)) c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]); else c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]); }
+
+						if(i != 0 || j != 0 || k != 0)   // The first tap is the accumulator itself.
+						{
+							if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
+							if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
+							if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
+							if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
+						}
+					}
+				}
+			}
+			// Only the components the format provides are copied out; the remaining channels of c_ are left unassigned here.
+			if(componentCount >= 1) c_.x = c[0][0][0].x;
+			if(componentCount >= 2) c_.y = c[0][0][0].y;
+			if(componentCount >= 3) c_.z = c[0][0][0].z;
+			if(componentCount >= 4) c_.w = c[0][0][0].w;
+
+			// Correct for signed fractions
+			if(componentCount >= 1) if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
+			if(componentCount >= 2) if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
+			if(componentCount >= 3) if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
+			if(componentCount >= 4) if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
+		}
+
+		return c_;
+	}
+
+	// Filters a floating-point texture sample across mipmap levels and applies the border color.
+	//
+	// One sample is taken through sampleFloatAniso(); Fetch returns it as-is. With
+	// MIPMAP_LINEAR a second sample is taken with secondLOD set and the two are blended by
+	// the fractional part of 'lod'. For each axis that uses ADDRESSING_BORDER (W only for
+	// TEXTURE_3D), border() supplies a per-lane mask; lanes with the mask set keep the
+	// filtered color and the remaining lanes are replaced by Texture::borderColorF.
+	Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function)
+	{
+		Vector4f c = sampleFloatAniso(texture, u, v, w, q, offset, lod, anisotropy, uDelta, vDelta, face, false, function);
+
+		if(function == Fetch)
+		{
+			return c;
+		}
+
+		if(state.mipmapFilter == MIPMAP_LINEAR)
+		{
+			Vector4f cc = sampleFloatAniso(texture, u, v, w, q, offset, lod, anisotropy, uDelta, vDelta, face, true, function);
+
+			Float4 lod4 = Float4(Frac(lod));
+
+			// Linear interpolation between the two mipmap levels.
+			c.x = (cc.x - c.x) * lod4 + c.x;
+			c.y = (cc.y - c.y) * lod4 + c.y;
+			c.z = (cc.z - c.z) * lod4 + c.z;
+			c.w = (cc.w - c.w) * lod4 + c.w;
+		}
+
+		// These are known while the routine is being generated, so only the required
+		// border handling ends up in the generated code.
+		const bool borderU = (state.addressingModeU == ADDRESSING_BORDER);
+		const bool borderV = (state.addressingModeV == ADDRESSING_BORDER);
+		const bool borderW = (state.addressingModeW == ADDRESSING_BORDER && state.textureType == TEXTURE_3D);
+
+		Int4 borderMask;
+
+		if(borderU)
+		{
+			Int4 u0;
+
+			border(u0, u);
+
+			borderMask = u0;
+		}
+
+		if(borderV)
+		{
+			Int4 v0;
+
+			border(v0, v);
+
+			// Combine with the U mask when there is one, otherwise start the mask here.
+			if(borderU)
+			{
+				borderMask &= v0;
+			}
+			else
+			{
+				borderMask = v0;
+			}
+		}
+
+		if(borderW)
+		{
+			Int4 s0;
+
+			border(s0, w);
+
+			if(borderU || borderV)
+			{
+				borderMask &= s0;
+			}
+			else
+			{
+				borderMask = s0;
+			}
+		}
+
+		if(borderU || borderV || borderW)
+		{
+			// Bitwise select: sampled color where the mask is set, border color elsewhere.
+			c.x = As<Float4>((borderMask & As<Int4>(c.x)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[0]))));
+			c.y = As<Float4>((borderMask & As<Int4>(c.y)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[1]))));
+			c.z = As<Float4>((borderMask & As<Int4>(c.z)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[2]))));
+			c.w = As<Float4>((borderMask & As<Int4>(c.w)) | (~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[3]))));
+		}
+
+		return c;
+	}
+
+	// Takes either a single sample or a weighted series of samples along the anisotropy axis.
+	//
+	// Unless the filter is FILTER_ANISOTROPIC, and always for Lod and Fetch, this forwards to
+	// sampleFloat(). Otherwise RoundInt(anisotropy) samples are taken, starting at an offset
+	// read from Constants::uvStart and stepping by (uDelta, vDelta) scaled with the weight
+	// read from Constants::uvWeight; each sample is accumulated with that same weight.
+	Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function)
+	{
+		const bool anisotropic = (state.textureFilter == FILTER_ANISOTROPIC) && (function != Lod) && (function != Fetch);
+
+		if(!anisotropic)
+		{
+			return sampleFloat(texture, u, v, w, q, offset, lod, face, secondLOD, function);
+		}
+
+		Int taps = RoundInt(anisotropy);
+
+		// Per-tap weight and starting position, looked up by tap count (16 bytes per entry).
+		Float4 weight = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * taps);
+		Float4 start = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * taps);
+
+		Float4 u0 = u + start * uDelta;
+		Float4 v0 = v + start * vDelta;
+
+		Float4 du = uDelta * weight;
+		Float4 dv = vDelta * weight;
+
+		Vector4f sum;
+
+		sum.x = Float4(0.0f);
+		sum.y = Float4(0.0f);
+		sum.z = Float4(0.0f);
+		sum.w = Float4(0.0f);
+
+		Vector4f tap;
+		Int i = 0;
+
+		Do
+		{
+			tap = sampleFloat(texture, u0, v0, w, q, offset, lod, face, secondLOD, function);
+
+			sum.x += tap.x * weight;
+			sum.y += tap.y * weight;
+			sum.z += tap.z * weight;
+			sum.w += tap.w * weight;
+
+			// Advance to the next sample position.
+			u0 += du;
+			v0 += dv;
+
+			i++;
+		}
+		Until(i >= taps)
+
+		return sum;
+	}
+
+	// Dispatches to the 3D sampler for TEXTURE_3D and to the 2D sampler for every other
+	// texture type. The 3D path takes neither q nor the face array.
+	Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+	{
+		if(state.textureType == TEXTURE_3D)
+		{
+			return sampleFloat3D(texture, u, v, w, offset, lod, secondLOD, function);
+		}
+
+		return sampleFloat2D(texture, u, v, w, q, offset, lod, face, secondLOD, function);
+	}
+
+	// Samples a non-3D texture in floating point.
+	//
+	// FILTER_POINT and Fetch read a single texel per lane. Otherwise the four neighbouring
+	// texels are read and either blended bilinearly with the fractions produced by address(),
+	// or, for FILTER_GATHER, their first components are returned in the order
+	// (x1,y0), (x0,y1), (x1,y1), (x0,y0). Only the first textureComponentCount() components
+	// of the result are written by the blend.
+	Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
+	{
+		Vector4f c;
+
+		int componentCount = textureComponentCount();
+		bool gather = state.textureFilter == FILTER_GATHER;
+
+		Pointer<Byte> mipmap;
+		Pointer<Byte> buffer[4];
+
+		selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+		Int4 x0, x1, y0, y1, z0;
+		Float4 fu, fv;
+		Float4 fw;   // Scratch output for the third coordinate; never used for blending.
+		Int4 filter = computeFilterOffset(lod);
+		address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
+		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+		// Only one slice index is needed, so z0 is passed for both outputs. The fraction
+		// goes to its own variable so that this call can never overwrite fv, which the
+		// bilinear blend below still needs.
+		address(w, z0, z0, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
+
+		// Turn row and slice numbers into element offsets.
+		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+		y0 *= pitchP;
+		if(hasThirdCoordinate())
+		{
+			Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+			z0 *= sliceP;
+		}
+
+		if(state.textureFilter == FILTER_POINT || (function == Fetch))
+		{
+			c = sampleTexel(x0, y0, z0, q, mipmap, buffer, function);
+		}
+		else
+		{
+			y1 *= pitchP;
+
+			Vector4f c0 = sampleTexel(x0, y0, z0, q, mipmap, buffer, function);
+			Vector4f c1 = sampleTexel(x1, y0, z0, q, mipmap, buffer, function);
+			Vector4f c2 = sampleTexel(x0, y1, z0, q, mipmap, buffer, function);
+			Vector4f c3 = sampleTexel(x1, y1, z0, q, mipmap, buffer, function);
+
+			if(!gather)   // Blend
+			{
+				// Horizontal blend of the first row.
+				if(componentCount >= 1) c0.x = c0.x + fu * (c1.x - c0.x);
+				if(componentCount >= 2) c0.y = c0.y + fu * (c1.y - c0.y);
+				if(componentCount >= 3) c0.z = c0.z + fu * (c1.z - c0.z);
+				if(componentCount >= 4) c0.w = c0.w + fu * (c1.w - c0.w);
+
+				// Horizontal blend of the second row.
+				if(componentCount >= 1) c2.x = c2.x + fu * (c3.x - c2.x);
+				if(componentCount >= 2) c2.y = c2.y + fu * (c3.y - c2.y);
+				if(componentCount >= 3) c2.z = c2.z + fu * (c3.z - c2.z);
+				if(componentCount >= 4) c2.w = c2.w + fu * (c3.w - c2.w);
+
+				// Vertical blend between the two rows.
+				if(componentCount >= 1) c.x = c0.x + fv * (c2.x - c0.x);
+				if(componentCount >= 2) c.y = c0.y + fv * (c2.y - c0.y);
+				if(componentCount >= 3) c.z = c0.z + fv * (c2.z - c0.z);
+				if(componentCount >= 4) c.w = c0.w + fv * (c2.w - c0.w);
+			}
+			else
+			{
+				c.x = c1.x;
+				c.y = c2.x;
+				c.z = c3.x;
+				c.w = c0.x;
+			}
+		}
+
+		return c;
+	}
+
+	// Samples a 3D texture in floating point.
+	//
+	// FILTER_POINT and Fetch read one texel per lane. Any other filter reads the eight
+	// texels surrounding the sample position and blends them trilinearly: along U with fu,
+	// then along V with fv, and finally between the two slices with fw. Only the first
+	// textureComponentCount() components of the result are written by the blend.
+	Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function)
+	{
+		Vector4f c;
+
+		int componentCount = textureComponentCount();
+
+		Pointer<Byte> mipmap;
+		Pointer<Byte> buffer[4];
+		Int face[4];   // Only passed along to selectMipmap(); not assigned here.
+
+		selectMipmap(texture, buffer, mipmap, lod, face, secondLOD);
+
+		Int4 x0, x1, y0, y1, z0, z1;
+		Float4 fu, fv, fw;
+		Int4 filter = computeFilterOffset(lod);
+		address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
+		address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
+		address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);
+
+		// Turn row and slice numbers into element offsets.
+		Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
+		Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
+		y0 *= pitchP;
+		z0 *= sliceP;
+
+		if(state.textureFilter == FILTER_POINT || (function == Fetch))
+		{
+			c = sampleTexel(x0, y0, z0, w, mipmap, buffer, function);
+			return c;
+		}
+
+		y1 *= pitchP;
+		z1 *= sliceP;
+
+		// First slice: c0..c3, second slice: c4..c7.
+		Vector4f c0 = sampleTexel(x0, y0, z0, w, mipmap, buffer, function);
+		Vector4f c1 = sampleTexel(x1, y0, z0, w, mipmap, buffer, function);
+		Vector4f c2 = sampleTexel(x0, y1, z0, w, mipmap, buffer, function);
+		Vector4f c3 = sampleTexel(x1, y1, z0, w, mipmap, buffer, function);
+		Vector4f c4 = sampleTexel(x0, y0, z1, w, mipmap, buffer, function);
+		Vector4f c5 = sampleTexel(x1, y0, z1, w, mipmap, buffer, function);
+		Vector4f c6 = sampleTexel(x0, y1, z1, w, mipmap, buffer, function);
+		Vector4f c7 = sampleTexel(x1, y1, z1, w, mipmap, buffer, function);
+
+		// The components are independent of each other, so each one is blended
+		// start to finish: first slice, second slice, then between the slices.
+		if(componentCount >= 1)
+		{
+			c0.x = c0.x + fu * (c1.x - c0.x);
+			c2.x = c2.x + fu * (c3.x - c2.x);
+			c0.x = c0.x + fv * (c2.x - c0.x);
+
+			c4.x = c4.x + fu * (c5.x - c4.x);
+			c6.x = c6.x + fu * (c7.x - c6.x);
+			c4.x = c4.x + fv * (c6.x - c4.x);
+
+			c.x = c0.x + fw * (c4.x - c0.x);
+		}
+
+		if(componentCount >= 2)
+		{
+			c0.y = c0.y + fu * (c1.y - c0.y);
+			c2.y = c2.y + fu * (c3.y - c2.y);
+			c0.y = c0.y + fv * (c2.y - c0.y);
+
+			c4.y = c4.y + fu * (c5.y - c4.y);
+			c6.y = c6.y + fu * (c7.y - c6.y);
+			c4.y = c4.y + fv * (c6.y - c4.y);
+
+			c.y = c0.y + fw * (c4.y - c0.y);
+		}
+
+		if(componentCount >= 3)
+		{
+			c0.z = c0.z + fu * (c1.z - c0.z);
+			c2.z = c2.z + fu * (c3.z - c2.z);
+			c0.z = c0.z + fv * (c2.z - c0.z);
+
+			c4.z = c4.z + fu * (c5.z - c4.z);
+			c6.z = c6.z + fu * (c7.z - c6.z);
+			c4.z = c4.z + fv * (c6.z - c4.z);
+
+			c.z = c0.z + fw * (c4.z - c0.z);
+		}
+
+		if(componentCount >= 4)
+		{
+			c0.w = c0.w + fu * (c1.w - c0.w);
+			c2.w = c2.w + fu * (c3.w - c2.w);
+			c0.w = c0.w + fv * (c2.w - c0.w);
+
+			c4.w = c4.w + fu * (c5.w - c4.w);
+			c6.w = c6.w + fu * (c7.w - c6.w);
+			c4.w = c4.w + fv * (c6.w - c4.w);
+
+			c.w = c0.w + fw * (c4.w - c0.w);
+		}
+
+		return c;
+	}
+
+	// Approximates log2(sqrt(lod)), which equals 0.25 * log2(lod^2), by reading the
+	// floating-point bit pattern of lod^2 as an integer.
+	Float SamplerCore::log2sqrt(Float lod)
+	{
+		// Squaring doubles the exponent and produces an extra bit of precision.
+		Float squared = lod * lod;
+
+		// Interpret as integer and subtract the exponent bias (the bit pattern of 1.0f).
+		Float unbiased = Float(As<Int>(squared)) - Float(0x3F800000);
+
+		// Scale by 0.25 * 2^-23 (mantissa length); 0x33000000 is the bit pattern of 2^-25.
+		Float result = unbiased * As<Float>(Int(0x33000000));
+
+		return result;
+	}
+
+	// Approximates log2(lod), computed as 0.5 * log2(lod^2), by reading the floating-point
+	// bit pattern of lod^2 as an integer.
+	Float SamplerCore::log2(Float lod)
+	{
+		// Squaring doubles the exponent and produces an extra bit of precision.
+		Float squared = lod * lod;
+
+		// Interpret as integer and subtract the exponent bias (the bit pattern of 1.0f).
+		Float unbiased = Float(As<Int>(squared)) - Float(0x3F800000);
+
+		// Scale by 0.5 * 2^-23 (mantissa length); 0x33800000 is the bit pattern of 2^-24.
+		Float result = unbiased * As<Float>(Int(0x33800000));
+
+		return result;
+	}
+
+	// Computes the mipmap level of detail for two-dimensional sampling.
+	//
+	// For every method except Lod and Fetch the LOD is derived from coordinate derivatives:
+	// explicit ones (dsx, dsy) for Grad, differences between the lanes of uuuu/vvvv
+	// otherwise. With FILTER_ANISOTROPIC this also produces 'anisotropy' and the step
+	// direction (uDelta, vDelta). Bias adds lodBias afterwards. Lod uses lodBias directly,
+	// Fetch reinterprets its bits as an integer level. The result is clamped to
+	// [Texture::minLod, Texture::maxLod].
+	void SamplerCore::computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function)
+	{
+		const bool fromDerivatives = (function != Lod) && (function != Fetch);
+
+		if(fromDerivatives)
+		{
+			// Laid out as (du/dx, du/dy, dv/dx, dv/dy).
+			Float4 gradient;
+
+			if(function == Grad)   // Explicit
+			{
+				Float4 uGradient = Float4(dsx.x.xx, dsy.x.xx);
+				Float4 vGradient = Float4(dsx.y.xx, dsy.y.xx);
+
+				gradient = Float4(uGradient.xz, vGradient.xz);
+			}
+			else   // Implicit
+			{
+				gradient = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
+			}
+
+			// Scale by texture dimensions and global LOD.
+			Float4 scaled = gradient * *Pointer<Float4>(texture + OFFSET(Texture,widthHeightLOD));
+
+			Float4 squared = scaled * scaled;
+			Float4 lengthSq = squared.xy + squared.zw;
+
+			lod = Max(Float(lengthSq.x), Float(lengthSq.y));   // Square length of major axis
+
+			if(state.textureFilter == FILTER_ANISOTROPIC)
+			{
+				Float det = Abs(Float(scaled.x) * Float(scaled.w) - Float(scaled.y) * Float(scaled.z));
+
+				anisotropy = lod * Rcp_pp(det);
+				anisotropy = Min(anisotropy, *Pointer<Float>(texture + OFFSET(Texture,maxAnisotropy)));
+
+				// Step along whichever screen axis has the longer footprint. Note that the
+				// deltas come from the unscaled derivatives.
+				Float4 dudx = gradient.xxxx;
+				Float4 dudy = gradient.yyyy;
+				Float4 dvdx = gradient.zzzz;
+				Float4 dvdy = gradient.wwww;
+
+				Int4 mask = As<Int4>(CmpNLT(lengthSq.x, lengthSq.y));
+				uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
+				vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
+
+				lod *= Rcp_pp(anisotropy * anisotropy);
+			}
+
+			lod = log2sqrt(lod);   // log2(sqrt(lod))
+
+			if(function == Bias)
+			{
+				lod += lodBias;
+			}
+		}
+		else if(function == Lod)
+		{
+			lod = lodBias;
+		}
+		else if(function == Fetch)
+		{
+			// TODO: Eliminate int-float-int conversion.
+			lod = Float(As<Int>(lodBias));
+		}
+		else if(function == Base)
+		{
+			// NOTE(review): unreachable as written. Base is neither Lod nor Fetch, so it
+			// takes the derivative path above. Confirm whether Base was meant to get LOD 0.
+			lod = Float(0);
+		}
+		else assert(false);
+
+		lod = Max(lod, *Pointer<Float>(texture + OFFSET(Texture, minLod)));
+		lod = Min(lod, *Pointer<Float>(texture + OFFSET(Texture, maxLod)));
+	}
+
+	// Computes the mipmap level of detail for cube map sampling.
+	//
+	// For every method except Lod and Fetch the LOD is derived from the absolute coordinate
+	// derivatives scaled by M: explicit ones (dsx, dsy) for Grad, differences against lane 0
+	// otherwise. Bias adds lodBias afterwards. Lod uses lodBias directly, Fetch reinterprets
+	// its bits as an integer level. The result is clamped to
+	// [Texture::minLod, Texture::maxLod].
+	void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, Float4 &M, SamplerFunction function)
+	{
+		const bool fromDerivatives = (function != Lod) && (function != Fetch);
+
+		if(fromDerivatives)
+		{
+			Float4 uDiff, vDiff, sDiff;
+
+			if(function == Grad)   // Explicit
+			{
+				uDiff = Float4(dsx.x.xx, dsy.x.xx);
+				vDiff = Float4(dsx.y.xx, dsy.y.xx);
+				sDiff = Float4(dsx.z.xx, dsy.z.xx);
+
+				uDiff = Abs(uDiff * Float4(M.x));
+				vDiff = Abs(vDiff * Float4(M.x));
+				sDiff = Abs(sDiff * Float4(M.x));
+			}
+			else   // Implicit
+			{
+				Float4 U = u * M;
+				Float4 V = v * M;
+				Float4 W = w * M;
+
+				uDiff = Abs(U - U.xxxx);
+				vDiff = Abs(V - V.xxxx);
+				sDiff = Abs(W - W.xxxx);
+			}
+
+			// Compute the largest Manhattan distance in two dimensions.
+			// This takes the footprint across adjacent faces into account.
+			Float4 uvSum = uDiff + vDiff;
+			Float4 usSum = uDiff + sDiff;
+			Float4 vsSum = vDiff + sDiff;
+
+			Float4 footprint = Max(Max(uvSum, usSum), vsSum);
+
+			lod = Max(Float(footprint.y), Float(footprint.z));   // FIXME: Max(footprint.y, footprint.z);
+
+			// Scale by texture dimension and global LOD.
+			lod *= *Pointer<Float>(texture + OFFSET(Texture,widthLOD));
+
+			lod = log2(lod);
+
+			if(function == Bias)
+			{
+				lod += lodBias;
+			}
+		}
+		else if(function == Lod)
+		{
+			lod = lodBias;
+		}
+		else if(function == Fetch)
+		{
+			// TODO: Eliminate int-float-int conversion.
+			lod = Float(As<Int>(lodBias));
+		}
+		else if(function == Base)
+		{
+			// NOTE(review): unreachable as written. Base is neither Lod nor Fetch, so it
+			// takes the derivative path above. Confirm whether Base was meant to get LOD 0.
+			lod = Float(0);
+		}
+		else assert(false);
+
+		lod = Max(lod, *Pointer<Float>(texture + OFFSET(Texture, minLod)));
+		lod = Min(lod, *Pointer<Float>(texture + OFFSET(Texture, maxLod)));
+	}
+
+	// Computes the mipmap level of detail for 3D texture sampling.
+	//
+	// For every method except Lod and Fetch the LOD is derived from the squared length of
+	// the scaled coordinate derivatives: explicit ones (dsx, dsy) for Grad, differences
+	// against lane 0 otherwise. Bias adds lodBias afterwards. Lod uses lodBias directly,
+	// Fetch reinterprets its bits as an integer level. The result is clamped to
+	// [Texture::minLod, Texture::maxLod].
+	void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function)
+	{
+		const bool fromDerivatives = (function != Lod) && (function != Fetch);
+
+		if(fromDerivatives)
+		{
+			Float4 uDiff, vDiff, sDiff;
+
+			if(function == Grad)   // Explicit
+			{
+				uDiff = Float4(dsx.x.xx, dsy.x.xx);
+				vDiff = Float4(dsx.y.xx, dsy.y.xx);
+				sDiff = Float4(dsx.z.xx, dsy.z.xx);
+			}
+			else   // Implicit
+			{
+				uDiff = uuuu - uuuu.xxxx;
+				vDiff = vvvv - vvvv.xxxx;
+				sDiff = wwww - wwww.xxxx;
+			}
+
+			// Scale by texture dimensions and global LOD, then square, one axis at a time.
+			uDiff *= *Pointer<Float4>(texture + OFFSET(Texture,widthLOD));
+			uDiff *= uDiff;
+
+			vDiff *= *Pointer<Float4>(texture + OFFSET(Texture,heightLOD));
+			vDiff *= vDiff;
+
+			sDiff *= *Pointer<Float4>(texture + OFFSET(Texture,depthLOD));
+			sDiff *= sDiff;
+
+			Float4 lengthSq = uDiff + vDiff + sDiff;
+
+			lod = Max(Float(lengthSq.y), Float(lengthSq.z));   // FIXME: Max(lengthSq.y, lengthSq.z);
+
+			lod = log2sqrt(lod);   // log2(sqrt(lod))
+
+			if(function == Bias)
+			{
+				lod += lodBias;
+			}
+		}
+		else if(function == Lod)
+		{
+			lod = lodBias;
+		}
+		else if(function == Fetch)
+		{
+			// TODO: Eliminate int-float-int conversion.
+			lod = Float(As<Int>(lodBias));
+		}
+		else if(function == Base)
+		{
+			// NOTE(review): unreachable as written. Base is neither Lod nor Fetch, so it
+			// takes the derivative path above. Confirm whether Base was meant to get LOD 0.
+			lod = Float(0);
+		}
+		else assert(false);
+
+		lod = Max(lod, *Pointer<Float>(texture + OFFSET(Texture, minLod)));
+		lod = Min(lod, *Pointer<Float>(texture + OFFSET(Texture, maxLod)));
+	}
+
+	// Selects the cube map face for each of the four lanes and projects the direction
+	// (x, y, z) onto it.
+	//
+	// face[i] receives the 3-bit face number of lane i. U and V receive the face coordinates
+	// remapped with a scale of M and a bias of 0.5. On return M holds
+	// 0.5 / max(|x|, |y|, |z|), the scale that was applied.
+	void SamplerCore::cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
+	{
+		Float4 absX = Abs(x);
+		Float4 absY = Abs(y);
+		Float4 absZ = Abs(z);
+
+		Int4 xOverY = CmpNLE(absX, absY);   // abs(x) > abs(y)
+		Int4 yOverZ = CmpNLE(absY, absZ);   // abs(y) > abs(z)
+		Int4 zOverX = CmpNLE(absZ, absX);   // abs(z) > abs(x)
+
+		Int4 xMajor = xOverY & ~zOverX;   // abs(x) > abs(y) && !(abs(z) > abs(x))
+		Int4 yMajor = yOverZ & ~xOverY;   // abs(y) > abs(z) && !(abs(x) > abs(y))
+		Int4 zMajor = zOverX & ~yOverZ;   // abs(z) > abs(x) && !(abs(y) > abs(z))
+
+		Int4 xNegative = CmpLT(x, Float4(0.0f));   // x < 0
+		Int4 yNegative = CmpLT(y, Float4(0.0f));   // y < 0
+		Int4 zNegative = CmpLT(z, Float4(0.0f));   // z < 0
+
+		// Sign bit of the major axis component, one per lane.
+		Int4 n = ((xNegative & xMajor) | (yNegative & yMajor) | (zNegative & zMajor)) & Int4(0x80000000);
+
+		// FACE_POSITIVE_X = 000b
+		// FACE_NEGATIVE_X = 001b
+		// FACE_POSITIVE_Y = 010b
+		// FACE_NEGATIVE_Y = 011b
+		// FACE_POSITIVE_Z = 100b
+		// FACE_NEGATIVE_Z = 101b
+
+		Int negative = SignMask(n);
+		Int yAxis = SignMask(yMajor);
+		Int zAxis = SignMask(zMajor);
+
+		// Each table lookup contributes one bit of the face number for all four lanes;
+		// lane i occupies bits [4 * i, 4 * i + 2] of the combined value.
+		Int packed = Int(*Pointer<Int>(constants + OFFSET(Constants,transposeBit0) + negative * 4));
+		packed |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit1) + yAxis * 4);
+		packed |= *Pointer<Int>(constants + OFFSET(Constants,transposeBit2) + zAxis * 4);
+
+		face[0] = packed & 0x7;
+		face[1] = (packed >> 4)  & 0x7;
+		face[2] = (packed >> 8)  & 0x7;
+		face[3] = (packed >> 12) & 0x7;
+
+		// U = xMajor ? (neg ^ -z) : ((zMajor & neg) ^ x)
+		U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));
+
+		// V = !yMajor ? -y : (n ^ z)
+		V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
+
+		// Divide by the major axis magnitude and remap with a scale and bias of 0.5.
+		M = Max(Max(absX, absY), absZ);
+		M = reciprocal(M) * Float4(0.5f);
+
+		U = U * M + Float4(0.5f);
+		V = V * M + Float4(0.5f);
+	}
+
+	// Adds an integer offset to four 16-bit coordinates and brings the result back into
+	// range according to the addressing mode.
+	//
+	// uvw    - coordinates, treated as unsigned 16-bit values.
+	// offset - offsets; the register is reinterpreted as four 32-bit integers, not converted.
+	// whd    - size of the mipmap level along this axis.
+	// mode   - WRAP takes the sum modulo the size, after adding a multiple of the size
+	//          (presumably to keep the dividend non-negative). CLAMP, MIRROR, MIRRORONCE and
+	//          BORDER all clamp to [0, size - 1]. TEXELFETCH leaves the sum untouched.
+	//          SEAMLESS and unknown modes are programming errors.
+	//
+	// Returns the adjusted coordinates narrowed back to 16 bits.
+	Short4 SamplerCore::applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode)
+	{
+		Int4 tmp = Int4(As<UShort4>(uvw));
+		tmp = tmp + As<Int4>(offset);
+
+		switch(mode)
+		{
+		case AddressingMode::ADDRESSING_WRAP:
+			tmp = (tmp + whd * Int4(-MIN_PROGRAM_TEXEL_OFFSET)) % whd;
+			break;
+		case AddressingMode::ADDRESSING_CLAMP:
+		case AddressingMode::ADDRESSING_MIRROR:
+		case AddressingMode::ADDRESSING_MIRRORONCE:
+		case AddressingMode::ADDRESSING_BORDER: // FIXME: Implement and test ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, ADDRESSING_BORDER
+			tmp = Min(Max(tmp, Int4(0)), whd - Int4(1));
+			break;
+		case AddressingMode::ADDRESSING_TEXELFETCH:
+			break;
+		case AddressingMode::ADDRESSING_SEAMLESS:
+			ASSERT(false);   // Cube sampling doesn't support offset.
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return As<Short4>(UShort4(tmp));
+	}
+
+	// Converts four sets of 16-bit texture coordinates into element indices into a mipmap level.
+	//
+	// Unless the method is Fetch, each coordinate is first multiplied by the level's
+	// dimension (keeping the high half of the product). With the Offset option the texel
+	// offsets are then applied through applyOffset(). The U/V pairs are combined using
+	// Mipmap::onePitchP and, when there is a third coordinate, the slice offset is added.
+	// For Fetch the final indices are clamped to [0, size - 1].
+	void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function)
+	{
+		const bool isFetch = (function == Fetch);
+		const bool useOffset = (function.option == Offset);
+
+		// U axis
+		if(!isFetch)
+		{
+			uuuu = MulHigh(As<UShort4>(uuuu), *Pointer<UShort4>(mipmap + OFFSET(Mipmap, width)));
+		}
+
+		if(useOffset)
+		{
+			UShort4 levelWidth = *Pointer<UShort4>(mipmap + OFFSET(Mipmap, width));
+			uuuu = applyOffset(uuuu, offset.x, Int4(levelWidth), isFetch ? ADDRESSING_TEXELFETCH : state.addressingModeU);
+		}
+
+		// V axis
+		if(!isFetch)
+		{
+			vvvv = MulHigh(As<UShort4>(vvvv), *Pointer<UShort4>(mipmap + OFFSET(Mipmap, height)));
+		}
+
+		if(useOffset)
+		{
+			UShort4 levelHeight = *Pointer<UShort4>(mipmap + OFFSET(Mipmap, height));
+			vvvv = applyOffset(vvvv, offset.y, Int4(levelHeight), isFetch ? ADDRESSING_TEXELFETCH : state.addressingModeV);
+		}
+
+		// Interleave U and V, then multiply-add each pair with onePitchP to get one
+		// 32-bit offset per lane: lanes 0-1 in uvLow, lanes 2-3 in uvHigh.
+		Short4 uvLow = As<Short4>(UnpackLow(uuuu, vvvv));
+		Short4 uvHigh = As<Short4>(UnpackHigh(uuuu, vvvv));
+		uvLow = As<Short4>(MulAdd(uvLow, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
+		uvHigh = As<Short4>(MulAdd(uvHigh, *Pointer<Short4>(mipmap + OFFSET(Mipmap,onePitchP))));
+
+		if(!hasThirdCoordinate())
+		{
+			index[0] = Extract(As<Int2>(uvLow), 0);
+			index[1] = Extract(As<Int2>(uvLow), 1);
+			index[2] = Extract(As<Int2>(uvHigh), 0);
+			index[3] = Extract(As<Int2>(uvHigh), 1);
+		}
+		else
+		{
+			// Array layers are used as-is; only other texture types scale and offset W.
+			if(state.textureType != TEXTURE_2D_ARRAY)
+			{
+				if(!isFetch)
+				{
+					wwww = MulHigh(As<UShort4>(wwww), *Pointer<UShort4>(mipmap + OFFSET(Mipmap, depth)));
+				}
+
+				if(useOffset)
+				{
+					UShort4 levelDepth = *Pointer<UShort4>(mipmap + OFFSET(Mipmap, depth));
+					wwww = applyOffset(wwww, offset.z, Int4(levelDepth), isFetch ? ADDRESSING_TEXELFETCH : state.addressingModeW);
+				}
+			}
+
+			UInt4 uv(As<UInt2>(uvLow), As<UInt2>(uvHigh));
+			uv += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
+
+			for(int i = 0; i < 4; i++)
+			{
+				index[i] = Extract(As<Int4>(uv), i);
+			}
+		}
+
+		if(isFetch)
+		{
+			// Keep fetches inside the level: one slice, times the depth when present.
+			Int size = Int(*Pointer<Int>(mipmap + OFFSET(Mipmap, sliceP)));
+			if(hasThirdCoordinate())
+			{
+				size *= Int(*Pointer<Short>(mipmap + OFFSET(Mipmap, depth)));
+			}
+			UInt lowest = 0;
+			UInt highest = size - 1;
+
+			for(int i = 0; i < 4; i++)
+			{
+				index[i] = Min(Max(index[i], lowest), highest);
+			}
+		}
+	}
+
+	// Combines per-axis 32-bit offsets into four element indices.
+	//
+	// The U and V offsets are summed, the W offset is added when there is a third
+	// coordinate, and each lane of the sum is written to index[]. The mipmap and function
+	// arguments are not used by this overload.
+	void SamplerCore::computeIndices(UInt index[4], Int4& uuuu, Int4& vvvv, Int4& wwww, const Pointer<Byte> &mipmap, SamplerFunction function)
+	{
+		UInt4 sum = uuuu + vvvv;
+
+		if(hasThirdCoordinate())
+		{
+			sum += As<UInt4>(wwww);
+		}
+
+		index[0] = Extract(As<Int4>(sum), 0);
+		index[1] = Extract(As<Int4>(sum), 1);
+		index[2] = Extract(As<Int4>(sum), 2);
+		index[3] = Extract(As<Int4>(sum), 3);
+	}
+
+	Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer[4])   // Reads one texel per lane (lane i at index[i]) and unpacks it into per-component Short4 vectors.
+	{
+		Vector4s c;
+
+		int f0 = state.textureType == TEXTURE_CUBE ? 0 : 0;   // Buffer slot for lane 0; both outcomes are 0.
+		int f1 = state.textureType == TEXTURE_CUBE ? 1 : 0;   // Cube maps read lane i from buffer[i]; every other type reads all lanes from buffer[0].
+		int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
+		int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
+
+		if(has16bitTextureFormat())   // Only FORMAT_R5G6B5 is handled on this path.
+		{
+			c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);   // Gather the four 16-bit texels into c.x.
+			c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+			c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+			c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
+
+			switch(state.textureFormat)
+			{
+			case FORMAT_R5G6B5:
+				c.z = (c.x & Short4(0x001Fu)) << 11;   // Low 5 bits moved to the top of the lane.
+				c.y = (c.x & Short4(0x07E0u)) << 5;   // Middle 6 bits moved to the top of the lane.
+				c.x = (c.x & Short4(0xF800u));   // Top 5 bits are already in place.
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+		else if(has8bitTextureComponents())
+		{
+			switch(textureComponentCount())
+			{
+			case 4:
+				{
+					Byte4 c0 = Pointer<Byte4>(buffer[f0])[index[0]];
+					Byte4 c1 = Pointer<Byte4>(buffer[f1])[index[1]];
+					Byte4 c2 = Pointer<Byte4>(buffer[f2])[index[2]];
+					Byte4 c3 = Pointer<Byte4>(buffer[f3])[index[3]];
+					c.x = Unpack(c0, c1);   // Texels 0 and 1 combined into one vector.
+					c.y = Unpack(c2, c3);   // Texels 2 and 3 combined into one vector.
+
+					switch(state.textureFormat)
+					{
+					case FORMAT_A8R8G8B8:
+						c.z = As<Short4>(UnpackLow(c.x, c.y));
+						c.x = As<Short4>(UnpackHigh(c.x, c.y));
+						c.y = c.z;
+						c.w = c.x;
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));   // Interleaving a vector with itself repeats each byte in both halves of its 16-bit lane.
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(c.w));
+						break;
+					case FORMAT_A8B8G8R8:
+					case FORMAT_A8B8G8R8I:
+					case FORMAT_A8B8G8R8_SNORM:
+					case FORMAT_Q8W8V8U8:
+					case FORMAT_SRGB8_A8:
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.y = c.x;
+						c.w = c.z;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(c.w));
+						// Propagate sign bit
+						if(state.textureFormat == FORMAT_A8B8G8R8I)
+						{
+							c.x >>= 8;   // Shifts the duplicated byte back down, leaving a sign-extended value.
+							c.y >>= 8;
+							c.z >>= 8;
+							c.w >>= 8;
+						}
+						break;
+					case FORMAT_A8B8G8R8UI:
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.y = c.x;
+						c.w = c.z;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));   // Interleaving with zero leaves each byte zero-extended to 16 bits.
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
+						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
+						break;
+					default:
+						ASSERT(false);
+					}
+				}
+				break;
+			case 3:   // Same 32-bit loads as the 4-component case, but c.w is never written.
+				{
+					Byte4 c0 = Pointer<Byte4>(buffer[f0])[index[0]];
+					Byte4 c1 = Pointer<Byte4>(buffer[f1])[index[1]];
+					Byte4 c2 = Pointer<Byte4>(buffer[f2])[index[2]];
+					Byte4 c3 = Pointer<Byte4>(buffer[f3])[index[3]];
+					c.x = Unpack(c0, c1);
+					c.y = Unpack(c2, c3);
+
+					switch(state.textureFormat)
+					{
+					case FORMAT_X8R8G8B8:
+						c.z = As<Short4>(UnpackLow(c.x, c.y));
+						c.x = As<Short4>(UnpackHigh(c.x, c.y));
+						c.y = c.z;
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+						break;
+					case FORMAT_X8B8G8R8_SNORM:
+					case FORMAT_X8B8G8R8I:
+					case FORMAT_X8B8G8R8:
+					case FORMAT_X8L8V8U8:
+					case FORMAT_SRGB8_X8:
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.y = c.x;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+						// Propagate sign bit
+						if(state.textureFormat == FORMAT_X8B8G8R8I)
+						{
+							c.x >>= 8;
+							c.y >>= 8;
+							c.z >>= 8;
+						}
+						break;
+					case FORMAT_X8B8G8R8UI:
+						c.z = As<Short4>(UnpackHigh(c.x, c.y));
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.y = c.x;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
+						break;
+					default:
+						ASSERT(false);
+					}
+				}
+				break;
+			case 2:
+				c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);   // Each texel is one 16-bit load holding both 8-bit components.
+				c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
+
+				switch(state.textureFormat)
+				{
+				case FORMAT_G8R8:
+				case FORMAT_G8R8_SNORM:
+				case FORMAT_V8U8:
+				case FORMAT_A8L8:
+					c.y = (c.x & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c.x) >> 8);   // High byte copied into both halves.
+					c.x = (c.x & Short4(0x00FFu)) | (c.x << 8);   // Low byte copied into both halves.
+					break;
+				case FORMAT_G8R8I:
+					c.y = c.x >> 8;   // Shift on the signed vector, so the high byte keeps its sign.
+					c.x = (c.x << 8) >> 8; // Propagate sign bit
+					break;
+				case FORMAT_G8R8UI:
+					c.y = As<Short4>(As<UShort4>(c.x) >> 8);   // Shift on the unsigned vector, so the high byte is zero-extended.
+					c.x &= Short4(0x00FFu);
+					break;
+				default:
+					ASSERT(false);
+				}
+				break;
+			case 1:
+				{
+					Int c0 = Int(*Pointer<Byte>(buffer[f0] + index[0]));
+					Int c1 = Int(*Pointer<Byte>(buffer[f1] + index[1]));
+					Int c2 = Int(*Pointer<Byte>(buffer[f2] + index[2]));
+					Int c3 = Int(*Pointer<Byte>(buffer[f3] + index[3]));
+					c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);   // Pack the four bytes into one 32-bit value, lane 0 in the lowest byte.
+
+					switch(state.textureFormat)
+					{
+					case FORMAT_R8I:
+					case FORMAT_R8UI:
+						{
+							Int zero(0);
+							c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));   // Integer formats are paired with zero bytes rather than expanded by the one-argument Unpack.
+							// Propagate sign bit
+							if(state.textureFormat == FORMAT_R8I)
+							{
+								c.x = (c.x << 8) >> 8;
+							}
+						}
+						break;
+					default:
+						c.x = Unpack(As<Byte4>(c0));
+						break;
+					}
+				}
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+		else if(has16bitTextureComponents())
+		{
+			switch(textureComponentCount())
+			{
+			case 4:
+				c.x = Pointer<Short4>(buffer[f0])[index[0]];   // One whole texel per vector here; the transpose below regroups them per component.
+				c.y = Pointer<Short4>(buffer[f1])[index[1]];
+				c.z = Pointer<Short4>(buffer[f2])[index[2]];
+				c.w = Pointer<Short4>(buffer[f3])[index[3]];
+				transpose4x4(c.x, c.y, c.z, c.w);
+				break;
+			case 3:
+				c.x = Pointer<Short4>(buffer[f0])[index[0]];
+				c.y = Pointer<Short4>(buffer[f1])[index[1]];
+				c.z = Pointer<Short4>(buffer[f2])[index[2]];
+				c.w = Pointer<Short4>(buffer[f3])[index[3]];
+				transpose4x3(c.x, c.y, c.z, c.w);
+				break;
+			case 2:
+				c.x = *Pointer<Short4>(buffer[f0] + 4 * index[0]);   // Texels are 4 bytes apart, hence the explicit byte offset.
+				c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer[f1] + 4 * index[1])));
+				c.z = *Pointer<Short4>(buffer[f2] + 4 * index[2]);
+				c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer[f3] + 4 * index[3])));
+				c.y = c.x;
+				c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
+				c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
+				break;
+			case 1:
+				c.x = Insert(c.x, Pointer<Short>(buffer[f0])[index[0]], 0);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f1])[index[1]], 1);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f2])[index[2]], 2);
+				c.x = Insert(c.x, Pointer<Short>(buffer[f3])[index[3]], 3);
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+		else ASSERT(false);
+
+		if(state.sRGB)   // Convert sRGB-encoded components to linear, in place.
+		{
+			if(state.textureFormat == FORMAT_R5G6B5)
+			{
+				sRGBtoLinear16_5_16(c.x);   // Helper variant chosen per channel width: 5, 6, 5 bits.
+				sRGBtoLinear16_6_16(c.y);
+				sRGBtoLinear16_5_16(c.z);
+			}
+			else
+			{
+				for(int i = 0; i < textureComponentCount(); i++)
+				{
+					if(isRGBComponent(i))   // Components for which this returns false are left unconverted.
+					{
+						sRGBtoLinear16_8_16(c[i]);
+					}
+				}
+			}
+		}
+
+		return c;
+	}
+
+	// Fetches four texels addressed by the Short4 coordinates and returns them as
+	// 16-bit components, one Short4 per channel.
+	//
+	// For the YV12 formats the luma and chroma samples are read as bytes and converted
+	// to RGB here, writing c.x/c.y/c.z only (c.w is not assigned on this path). All
+	// other formats are forwarded to sampleTexel(index, buffer).
+	Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
+	{
+		Vector4s c;
+
+		UInt index[4];
+		computeIndices(index, uuuu, vvvv, wwww, offset, mipmap, function);
+
+		if(hasYuvFormat())
+		{
+			// Generic YPbPr to RGB transformation
+			// R = Y                               +           2 * (1 - Kr) * Pr
+			// G = Y - 2 * Kb * (1 - Kb) / Kg * Pb - 2 * Kr * (1 - Kr) / Kg * Pr
+			// B = Y +           2 * (1 - Kb) * Pb
+
+			// Defaults are the same values the BT.601 case assigns below.
+			float Kb = 0.114f;
+			float Kr = 0.299f;
+			int studioSwing = 1;
+
+			switch(state.textureFormat)
+			{
+			case FORMAT_YV12_BT601:
+				Kb = 0.114f;
+				Kr = 0.299f;
+				studioSwing = 1;
+				break;
+			case FORMAT_YV12_BT709:
+				Kb = 0.0722f;
+				Kr = 0.2126f;
+				studioSwing = 1;
+				break;
+			case FORMAT_YV12_JFIF:
+				Kb = 0.114f;
+				Kr = 0.299f;
+				studioSwing = 0;   // Full range: no scaling or luma offset is applied below.
+				break;
+			default:
+				ASSERT(false);
+			}
+
+			const float Kg = 1.0f - Kr - Kb;
+
+			// Coefficients of the transformation above. Gb and Gr are negative.
+			const float Rr = 2 * (1 - Kr);
+			const float Gb = -2 * Kb * (1 - Kb) / Kg;
+			const float Gr = -2 * Kr * (1 - Kr) / Kg;
+			const float Bb = 2 * (1 - Kb);
+
+			// Scaling and bias for studio-swing range: Y = [16 .. 235], U/V = [16 .. 240]
+			const float Yy = studioSwing ? 255.0f / (235 - 16) : 1.0f;
+			const float Uu = studioSwing ? 255.0f / (240 - 16) : 1.0f;
+			const float Vv = studioSwing ? 255.0f / (240 - 16) : 1.0f;
+
+			// Fold the chroma range scaling into the matrix coefficients.
+			const float Rv = Vv *  Rr;
+			const float Gu = Uu *  Gb;
+			const float Gv = Vv *  Gr;
+			const float Bu = Uu *  Bb;
+
+			// Constant terms: the luma offset of 16 (studio swing only) and the chroma
+			// offset of 128, expressed relative to 255.
+			const float R0 = (studioSwing * -16 * Yy - 128 * Rv) / 255;
+			const float G0 = (studioSwing * -16 * Yy - 128 * Gu - 128 * Gv) / 255;
+			const float B0 = (studioSwing * -16 * Yy - 128 * Bu) / 255;
+
+			// Gather the four luma bytes into one Int so they can be expanded together.
+			// NOTE(review): Unpack() presumably widens each byte to a 16-bit lane - confirm.
+			Int c0 = Int(buffer[0][index[0]]);
+			Int c1 = Int(buffer[0][index[1]]);
+			Int c2 = Int(buffer[0][index[2]]);
+			Int c3 = Int(buffer[0][index[3]]);
+			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+			UShort4 Y = As<UShort4>(Unpack(As<Byte4>(c0)));
+
+			// Recompute the indices against the Mipmap entry that follows the luma one.
+			// NOTE(review): presumably that entry describes the chroma planes - confirm.
+			computeIndices(index, uuuu, vvvv, wwww, offset, mipmap + sizeof(Mipmap), function);
+
+			// buffer[1] supplies the V samples.
+			c0 = Int(buffer[1][index[0]]);
+			c1 = Int(buffer[1][index[1]]);
+			c2 = Int(buffer[1][index[2]]);
+			c3 = Int(buffer[1][index[3]]);
+			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+			UShort4 V = As<UShort4>(Unpack(As<Byte4>(c0)));
+
+			// buffer[2] supplies the U samples, read with the same indices.
+			c0 = Int(buffer[2][index[0]]);
+			c1 = Int(buffer[2][index[1]]);
+			c2 = Int(buffer[2][index[2]]);
+			c3 = Int(buffer[2][index[3]]);
+			c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
+			UShort4 U = As<UShort4>(Unpack(As<Byte4>(c0)));
+
+			// Coefficients as unsigned fixed point, scaled by 0x4000. Gu and Gv are
+			// negative, so their negations are stored and subtracted in the green term.
+			const UShort4 yY = UShort4(iround(Yy * 0x4000));
+			const UShort4 rV = UShort4(iround(Rv * 0x4000));
+			const UShort4 gU = UShort4(iround(-Gu * 0x4000));
+			const UShort4 gV = UShort4(iround(-Gv * 0x4000));
+			const UShort4 bU = UShort4(iround(Bu * 0x4000));
+
+			// R0 and B0 are negated here and subtracted below; G0 is added as is.
+			const UShort4 r0 = UShort4(iround(-R0 * 0x4000));
+			const UShort4 g0 = UShort4(iround(G0 * 0x4000));
+			const UShort4 b0 = UShort4(iround(-B0 * 0x4000));
+
+			// SubSat keeps the unsigned results from wrapping below zero.
+			UShort4 y = MulHigh(Y, yY);
+			UShort4 r = SubSat(y + MulHigh(V, rV), r0);
+			UShort4 g = SubSat(y + g0, MulHigh(U, gU) + MulHigh(V, gV));
+			UShort4 b = SubSat(y + MulHigh(U, bU), b0);
+
+			// Clamp to 0x3FFF, then shift left by 2 to span the 16-bit range.
+			c.x = Min(r, UShort4(0x3FFF)) << 2;
+			c.y = Min(g, UShort4(0x3FFF)) << 2;
+			c.z = Min(b, UShort4(0x3FFF)) << 2;
+		}
+		else
+		{
+			return sampleTexel(index, buffer);
+		}
+
+		return c;
+	}
+
+	// Fetches four texels addressed by the Int4 coordinates and returns them as floats,
+	// one Float4 per channel.
+	//
+	// Float and 32-bit integer formats are read directly from memory according to the
+	// component count. When state.compare is not COMPARE_BYPASS, the fetched x channel
+	// is compared against z and the result is replaced by (pass ? 1 : 0, 0, 0, 1).
+	// All other formats go through the 16-bit sampleTexel(index, buffer) path and are
+	// then converted per component.
+	Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function)
+	{
+		Vector4f c;
+
+		UInt index[4];
+		computeIndices(index, uuuu, vvvv, wwww, mipmap, function);
+
+		if(hasFloatTexture() || has32bitIntegerTextureComponents())
+		{
+			// For cube textures each of the four texels is read from its own buffer
+			// entry; for every other texture type all four come from buffer[0].
+			int f0 = state.textureType == TEXTURE_CUBE ? 0 : 0;
+			int f1 = state.textureType == TEXTURE_CUBE ? 1 : 0;
+			int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
+			int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
+
+			// Read texels
+			switch(textureComponentCount())
+			{
+			case 4:
+				// 16 bytes per texel; one texel per row, then transposed into channels.
+				c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+				c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+				c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+				c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+				transpose4x4(c.x, c.y, c.z, c.w);
+				break;
+			case 3:
+				// Texels are still addressed with a 16-byte stride.
+				c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+				c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+				c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+				c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+				transpose4x3(c.x, c.y, c.z, c.w);
+				break;
+			case 2:
+				// FIXME: Optimal shuffling?
+				// 8 bytes per texel. The loads into .zw start 8 bytes early,
+				// presumably so the texel lands in the upper half of the loaded vector.
+				c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
+				c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
+				c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
+				c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
+				// Separate the interleaved pairs into first and second components.
+				c.y = c.x;
+				c.x = Float4(c.x.xz, c.z.xz);
+				c.y = Float4(c.y.yw, c.z.yw);
+				break;
+			case 1:
+				// FIXME: Optimal shuffling?
+				// 4 bytes per texel, written straight into the four lanes of c.x.
+				c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
+				c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
+				c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
+				c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
+				break;
+			default:
+				ASSERT(false);
+			}
+
+			if(state.compare != COMPARE_BYPASS)
+			{
+				Float4 ref = z;
+
+				// The reference value is clamped to [0, 1] unless the texture is a float format.
+				if(!hasFloatTexture())
+				{
+					ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
+				}
+
+				// Per-lane mask; the ALWAYS/NEVER cases show the convention: -1 passes, 0 fails.
+				Int4 boolean;
+
+				switch(state.compare)
+				{
+				case COMPARE_LESSEQUAL:    boolean = CmpLE(ref, c.x);  break;
+				case COMPARE_GREATEREQUAL: boolean = CmpNLT(ref, c.x); break;
+				case COMPARE_LESS:         boolean = CmpLT(ref, c.x);  break;
+				case COMPARE_GREATER:      boolean = CmpNLE(ref, c.x); break;
+				case COMPARE_EQUAL:        boolean = CmpEQ(ref, c.x);  break;
+				case COMPARE_NOTEQUAL:     boolean = CmpNEQ(ref, c.x); break;
+				case COMPARE_ALWAYS:       boolean = Int4(-1);         break;
+				case COMPARE_NEVER:        boolean = Int4(0);          break;
+				default:                   ASSERT(false);
+				}
+
+				// Masking the bit pattern of 1.0f yields 1.0f where the test passed and 0.0f elsewhere.
+				c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f)));
+				c.y = Float4(0.0f);
+				c.z = Float4(0.0f);
+				c.w = Float4(1.0f);
+			}
+		}
+		else
+		{
+			ASSERT(!hasYuvFormat());
+
+			Vector4s cs = sampleTexel(index, buffer);
+
+			bool isInteger = Surface::isNonNormalizedInteger(state.textureFormat);
+			int componentCount = textureComponentCount();
+			for(int n = 0; n < componentCount; n++)
+			{
+				// Non-normalized integer formats keep the integer value as a raw bit
+				// pattern inside the Float4 (As<Float4>); other formats are converted
+				// numerically. The unsigned variants reinterpret the 16-bit lanes as
+				// UShort4 before widening.
+				if(hasUnsignedTextureComponent(n))
+				{
+					if(isInteger)
+					{
+						c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
+					}
+					else
+					{
+						c[n] = Float4(As<UShort4>(cs[n]));
+					}
+				}
+				else
+				{
+					if(isInteger)
+					{
+						c[n] = As<Float4>(Int4(cs[n]));
+					}
+					else
+					{
+						c[n] = Float4(cs[n]);
+					}
+				}
+			}
+		}
+
+		return c;
+	}
+
+	// Picks the Mipmap entry to sample from and loads its texel buffer pointer(s).
+	//
+	// texture   - base address of the Texture structure.
+	// buffer    - receives the buffer pointers: one per entry of face[] for cube
+	//             textures, otherwise buffer[0] (plus buffer[1] and buffer[2] for
+	//             YUV formats).
+	// mipmap    - receives the address of the selected Mipmap entry.
+	// lod       - level of detail; not read when mipmapping is disabled.
+	// face      - buffer index for each of the four texels; cube textures only.
+	// secondLOD - when true, advances the selection by one more Mipmap entry
+	//             (has no effect when mipmapping is disabled).
+	void SamplerCore::selectMipmap(Pointer<Byte> &texture, Pointer<Byte> buffer[4], Pointer<Byte> &mipmap, Float &lod, Int face[4], bool secondLOD)
+	{
+		if(state.mipmapFilter != MIPMAP_NONE)
+		{
+			Int level;
+
+			if(state.mipmapFilter == MIPMAP_POINT)
+			{
+				level = RoundInt(lod);
+			}
+			else   // MIPMAP_LINEAR
+			{
+				level = Int(lod);
+			}
+
+			mipmap = texture + OFFSET(Texture,mipmap) + level * sizeof(Mipmap) + secondLOD * sizeof(Mipmap);
+		}
+		else
+		{
+			// Without mipmapping only the first level is ever used.
+			mipmap = texture + OFFSET(Texture,mipmap[0]);
+		}
+
+		if(state.textureType == TEXTURE_CUBE)
+		{
+			for(int texel = 0; texel < 4; texel++)
+			{
+				buffer[texel] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer) + face[texel] * sizeof(void*));
+			}
+
+			return;
+		}
+
+		buffer[0] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer[0]));
+
+		if(hasYuvFormat())
+		{
+			// The two additional buffers used by the YUV sampling path.
+			buffer[1] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer[1]));
+			buffer[2] = *Pointer<Pointer<Byte> >(mipmap + OFFSET(Mipmap,buffer[2]));
+		}
+	}
+
+	// Builds the per-lane filter value for the current texture filter:
+	//   FILTER_POINT                 -> 0
+	//   FILTER_MIN_LINEAR_MAG_POINT  -> result of CmpNLE(lod, 0)
+	//   FILTER_MIN_POINT_MAG_LINEAR  -> result of CmpLE(lod, 0)
+	//   any other filter             -> -1
+	Int4 SamplerCore::computeFilterOffset(Float &lod)
+	{
+		Int4 filter = -1;
+
+		switch(state.textureFilter)
+		{
+		case FILTER_POINT:
+			filter = 0;
+			break;
+		case FILTER_MIN_LINEAR_MAG_POINT:
+			filter = CmpNLE(Float4(lod), Float4(0.0f));
+			break;
+		case FILTER_MIN_POINT_MAG_LINEAR:
+			filter = CmpLE(Float4(lod), Float4(0.0f));
+			break;
+		default:
+			// Keep the initial -1.
+			break;
+		}
+
+		return filter;
+	}
+
+	// Converts one texture coordinate to its Short4 form according to the addressing mode.
+	//
+	// ADDRESSING_LAYER on a 2D array texture yields the rounded layer index clamped to
+	// [0, depth - 1]; on any other texture type the coordinate is unused and a default
+	// Short4 is returned. Every other mode scales the coordinate by 1 << 16 before
+	// converting it to Short4.
+	Short4 SamplerCore::address(Float4 &uw, AddressingMode addressingMode, Pointer<Byte> &mipmap)
+	{
+		if(addressingMode == ADDRESSING_LAYER)
+		{
+			if(state.textureType != TEXTURE_2D_ARRAY)
+			{
+				return Short4();   // Unused
+			}
+
+			return Min(Max(Short4(RoundInt(uw)), Short4(0)), *Pointer<Short4>(mipmap + OFFSET(Mipmap, depth)) - Short4(1));
+		}
+
+		switch(addressingMode)
+		{
+		case ADDRESSING_CLAMP:
+		case ADDRESSING_BORDER:
+			{
+				// Clamp to [0, 65535/65536] so the scaled value stays below 1 << 16.
+				Float4 clamped = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));
+
+				return Short4(Int4(clamped * Float4(1 << 16)));
+			}
+		case ADDRESSING_MIRROR:
+			{
+				Int4 fixed = Int4(uw * Float4(1 << 16));
+
+				// Bit 16 of the scaled value, replicated across the whole lane.
+				Int4 flip = (fixed << 15) >> 31;
+
+				fixed ^= flip;
+
+				return Short4(fixed);
+			}
+		case ADDRESSING_MIRRORONCE:
+			{
+				// Absolute value
+				Int4 fixed = Int4(Abs(uw * Float4(1 << 16)));
+
+				// Clamp
+				fixed -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
+				fixed = As<Int4>(PackSigned(fixed, fixed));
+
+				return As<Short4>(Int2(fixed)) + Short4(0x8000u);
+			}
+		default:   // Wrap
+			return Short4(Int4(uw * Float4(1 << 16)));
+		}
+	}
+
+	// Computes integer texel coordinates along one axis.
+	//
+	// uvw            - coordinate for this axis; reinterpreted as integers for Fetch.
+	// xyz0           - receives the first texel coordinate.
+	// xyz1           - receives the second texel coordinate (xyz0 - filter); not written for
+	//                  Fetch or array layers.
+	// f              - receives the fractional position between the two texels; only written
+	//                  when the filter is neither FILTER_POINT nor FILTER_GATHER.
+	// mipmap         - address of the Mipmap entry being sampled.
+	// texOffset      - integer texel offset stored in a Float4; applied when function.option == Offset.
+	// filter         - per-lane value subtracted from xyz0 to obtain xyz1 (-1 steps to the next texel,
+	//                  0 keeps the same one).
+	// whd            - byte offset within the Mipmap entry of the dimension for this axis.
+	// addressingMode - addressing mode for this axis.
+	// function       - sampling method and option.
+	//
+	// Nothing is written when addressingMode is ADDRESSING_LAYER and the texture is not a 2D array.
+	void SamplerCore::address(Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function)
+	{
+		if(addressingMode == ADDRESSING_LAYER && state.textureType != TEXTURE_2D_ARRAY)
+		{
+			return;   // Unused
+		}
+
+		// Texture dimension along this axis and the largest valid coordinate.
+		Int4 dim = Int4(*Pointer<Short4>(mipmap + whd, 16));
+		Int4 maxXYZ = dim - Int4(1);
+
+		if(function == Fetch)
+		{
+			// Coordinates (and offsets) are integers carried in float registers. The offset is
+			// not applied to array layers. The result is clamped to [0, dim - 1].
+			xyz0 = Min(Max(((function.option == Offset) && (addressingMode != ADDRESSING_LAYER)) ? As<Int4>(uvw) + As<Int4>(texOffset) : As<Int4>(uvw), Int4(0)), maxXYZ);
+		}
+		else if(addressingMode == ADDRESSING_LAYER && state.textureType == TEXTURE_2D_ARRAY)   // Note: Offset does not apply to array layers
+		{
+			xyz0 = Min(Max(RoundInt(uvw), Int4(0)), maxXYZ);
+		}
+		else
+		{
+			// Bit patterns of the largest floats below 0.5, 1.0 and 2.0.
+			const int halfBits = 0x3EFFFFFF;   // Value just under 0.5f
+			const int oneBits  = 0x3F7FFFFF;   // Value just under 1.0f
+			const int twoBits  = 0x3FFFFFFF;   // Value just under 2.0f
+
+			bool pointFilter = state.textureFilter == FILTER_POINT ||
+			                   state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
+			                   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT;
+
+			Float4 coord = uvw;
+
+			if(state.textureType == TEXTURE_RECTANGLE)
+			{
+				// According to https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_rectangle.txt
+				// "CLAMP_TO_EDGE causes the s coordinate to be clamped to the range[0.5, wt - 0.5].
+				//  CLAMP_TO_EDGE causes the t coordinate to be clamped to the range[0.5, ht - 0.5]."
+				// Unless SwiftShader implements support for ADDRESSING_BORDER, other modes should be equivalent
+				// to CLAMP_TO_EDGE. Rectangle textures have no support for any MIRROR or REPEAT modes.
+				// Note that the coordinate is not multiplied by dim on this path.
+				coord = Min(Max(coord, Float4(0.5f)), Float4(dim) - Float4(0.5f));
+			}
+			else
+			{
+				// Bring the coordinate into [0, 1) according to the addressing mode,
+				// then scale it to texel units.
+				switch(addressingMode)
+				{
+				case ADDRESSING_CLAMP:
+				case ADDRESSING_BORDER:
+				case ADDRESSING_SEAMLESS:
+					// Linear filtering of cube doesn't require clamping because the coordinates
+					// are already in [0, 1] range and numerical imprecision is tolerated.
+					if(addressingMode != ADDRESSING_SEAMLESS || pointFilter)
+					{
+						Float4 one = As<Float4>(Int4(oneBits));
+						coord = Min(Max(coord, Float4(0.0f)), one);
+					}
+					break;
+				case ADDRESSING_MIRROR:
+				{
+					Float4 half = As<Float4>(Int4(halfBits));
+					Float4 one = As<Float4>(Int4(oneBits));
+					Float4 two = As<Float4>(Int4(twoBits));
+					coord = one - Abs(two * Frac(coord * half) - one);
+				}
+				break;
+				case ADDRESSING_MIRRORONCE:
+				{
+					// Same as ADDRESSING_MIRROR, but the coordinate is first clamped to [-one, two].
+					Float4 half = As<Float4>(Int4(halfBits));
+					Float4 one = As<Float4>(Int4(oneBits));
+					Float4 two = As<Float4>(Int4(twoBits));
+					coord = one - Abs(two * Frac(Min(Max(coord, -one), two) * half) - one);
+				}
+				break;
+				default:   // Wrap
+					coord = Frac(coord);
+					break;
+				}
+
+				coord = coord * Float4(dim);
+			}
+
+			if(state.textureFilter == FILTER_POINT ||
+			   state.textureFilter == FILTER_GATHER)
+			{
+				// No fraction is needed; f is left untouched.
+				xyz0 = Int4(coord);
+			}
+			else
+			{
+				if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
+				   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
+				{
+					// Subtract 0.5 only in the lanes where the filter mask is set.
+					coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter);
+				}
+				else
+				{
+					coord -= Float4(0.5f);
+				}
+
+				// Split into the integer texel coordinate and the fraction within it.
+				Float4 floor = Floor(coord);
+				xyz0 = Int4(floor);
+				f = coord - floor;
+			}
+
+			if(function.option == Offset)
+			{
+				xyz0 += As<Int4>(texOffset);
+			}
+
+			if(addressingMode == ADDRESSING_SEAMLESS)
+			{
+				// NOTE(review): presumably accounts for a one-texel border around cube faces - confirm.
+				xyz0 += Int4(1);
+			}
+
+			xyz1 = xyz0 - filter;   // Increment
+
+			if(function.option == Offset)
+			{
+				// With an offset both coordinates can land outside the texture, so both are fixed up.
+				switch(addressingMode)
+				{
+				case ADDRESSING_SEAMLESS:
+					ASSERT(false);   // Cube sampling doesn't support offset.
+					// Falls through when the assert does not stop execution.
+				case ADDRESSING_MIRROR:
+				case ADDRESSING_MIRRORONCE:
+				case ADDRESSING_BORDER:
+					// FIXME: Implement ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, and ADDRESSING_BORDER.
+					// Fall through to Clamp.
+				case ADDRESSING_CLAMP:
+					xyz0 = Min(Max(xyz0, Int4(0)), maxXYZ);
+					xyz1 = Min(Max(xyz1, Int4(0)), maxXYZ);
+					break;
+				default:   // Wrap
+					// A multiple of dim is added before taking the remainder.
+					// NOTE(review): presumably this keeps the dividend non-negative for any
+					// permitted offset - confirm against MIN_PROGRAM_TEXEL_OFFSET.
+					xyz0 = (xyz0 + dim * Int4(-MIN_PROGRAM_TEXEL_OFFSET)) % dim;
+					xyz1 = (xyz1 + dim * Int4(-MIN_PROGRAM_TEXEL_OFFSET)) % dim;
+					break;
+				}
+			}
+			else if(state.textureFilter != FILTER_POINT)
+			{
+				// Without an offset only the lower bound of xyz0 and the upper bound of xyz1 are fixed up.
+				switch(addressingMode)
+				{
+				case ADDRESSING_SEAMLESS:
+					break;
+				case ADDRESSING_MIRROR:
+				case ADDRESSING_MIRRORONCE:
+				case ADDRESSING_BORDER:
+				case ADDRESSING_CLAMP:
+					xyz0 = Max(xyz0, Int4(0));
+					xyz1 = Min(xyz1, maxXYZ);
+					break;
+				default:   // Wrap
+					{
+						Int4 under = CmpLT(xyz0, Int4(0));
+						xyz0 = (under & maxXYZ) | (~under & xyz0);   // xyz < 0 ? dim - 1 : xyz   // FIXME: IfThenElse()
+
+						Int4 nover = CmpLT(xyz1, dim);
+						xyz1 = nover & xyz1;   // xyz >= dim ? 0 : xyz
+					}
+					break;
+				}
+			}
+		}
+	}
+
+	// Scales the four floats in cf by 0x1000 and stores the rounded results in cs.
+	void SamplerCore::convertFixed12(Short4 &cs, Float4 &cf)
+	{
+		Float4 scaled = cf * Float4(0x1000);
+
+		cs = RoundShort4(scaled);
+	}
+
+	// Applies the per-component convertFixed12() to all four components, in order.
+	void SamplerCore::convertFixed12(Vector4s &cs, Vector4f &cf)
+	{
+		for(int component = 0; component < 4; component++)
+		{
+			convertFixed12(cs[component], cf[component]);
+		}
+	}
+
+	// Converts the four signed 16-bit values in cs to float, dividing by 0x0FFE.
+	void SamplerCore::convertSigned12(Float4 &cf, Short4 &cs)
+	{
+		const float scale = 1.0f / 0x0FFE;
+
+		cf = Float4(cs) * Float4(scale);
+	}
+
+//	void SamplerCore::convertSigned12(Vector4f &cf, Vector4s &cs)
+//	{
+//		convertSigned12(cf.x, cs.x);
+//		convertSigned12(cf.y, cs.y);
+//		convertSigned12(cf.z, cs.z);
+//		convertSigned12(cf.w, cs.w);
+//	}
+
+	// Converts the four signed 16-bit values in cs to float, dividing by 0x7FFF.
+	void SamplerCore::convertSigned15(Float4 &cf, Short4 &cs)
+	{
+		const float scale = 1.0f / 0x7FFF;
+
+		cf = Float4(cs) * Float4(scale);
+	}
+
+	// Converts the four values in cs, reinterpreted as unsigned 16-bit, to float,
+	// dividing by 0xFFFF.
+	void SamplerCore::convertUnsigned16(Float4 &cf, Short4 &cs)
+	{
+		const float scale = 1.0f / 0xFFFF;
+
+		cf = Float4(As<UShort4>(cs)) * Float4(scale);
+	}
+
+	// Replaces each of the four 16-bit lanes of c with its entry in the
+	// Constants::sRGBtoLinear8_16 table, indexed by the lane's upper 8 bits.
+	void SamplerCore::sRGBtoLinear16_8_16(Short4 &c)
+	{
+		// Unsigned shift: the upper 8 bits become the table index.
+		c = As<UShort4>(c) >> 8;
+
+		Pointer<Byte> table = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear8_16));
+
+		for(int lane = 0; lane < 4; lane++)
+		{
+			// Entries are read as Short, hence the 2-byte stride.
+			c = Insert(c, *Pointer<Short>(table + 2 * Int(Extract(c, lane))), lane);
+		}
+	}
+
+	// Replaces each of the four 16-bit lanes of c with its entry in the
+	// Constants::sRGBtoLinear6_16 table, indexed by the lane's upper 6 bits.
+	void SamplerCore::sRGBtoLinear16_6_16(Short4 &c)
+	{
+		// Unsigned shift: the upper 6 bits become the table index.
+		c = As<UShort4>(c) >> 10;
+
+		Pointer<Byte> table = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear6_16));
+
+		for(int lane = 0; lane < 4; lane++)
+		{
+			// Entries are read as Short, hence the 2-byte stride.
+			c = Insert(c, *Pointer<Short>(table + 2 * Int(Extract(c, lane))), lane);
+		}
+	}
+
+	// Replaces each of the four 16-bit lanes of c with its entry in the
+	// Constants::sRGBtoLinear5_16 table, indexed by the lane's upper 5 bits.
+	void SamplerCore::sRGBtoLinear16_5_16(Short4 &c)
+	{
+		// Unsigned shift: the upper 5 bits become the table index.
+		c = As<UShort4>(c) >> 11;
+
+		Pointer<Byte> table = Pointer<Byte>(constants + OFFSET(Constants,sRGBtoLinear5_16));
+
+		for(int lane = 0; lane < 4; lane++)
+		{
+			// Entries are read as Short, hence the 2-byte stride.
+			c = Insert(c, *Pointer<Short>(table + 2 * Int(Extract(c, lane))), lane);
+		}
+	}
+
+	// True when Surface classifies the sampler's texture format as a float format.
+	bool SamplerCore::hasFloatTexture() const
+	{
+		const auto format = state.textureFormat;
+
+		return Surface::isFloatFormat(format);
+	}
+
+	// True when Surface classifies the sampler's texture format as a non-normalized integer format.
+	bool SamplerCore::hasUnnormalizedIntegerTexture() const
+	{
+		const auto format = state.textureFormat;
+
+		return Surface::isNonNormalizedInteger(format);
+	}
+
+	// True when Surface reports the given component of the sampler's texture format as unsigned.
+	bool SamplerCore::hasUnsignedTextureComponent(int component) const
+	{
+		const auto format = state.textureFormat;
+
+		return Surface::isUnsignedComponent(format, component);
+	}
+
+	// Number of components Surface reports for the sampler's texture format.
+	int SamplerCore::textureComponentCount() const
+	{
+		const auto format = state.textureFormat;
+
+		return Surface::componentCount(format);
+	}
+
+	// True for 3D and 2D array textures, false for every other texture type.
+	bool SamplerCore::hasThirdCoordinate() const
+	{
+		switch(state.textureType)
+		{
+		case TEXTURE_3D:
+		case TEXTURE_2D_ARRAY:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	// Returns true only for FORMAT_R5G6B5. Every other supported format is listed
+	// explicitly and yields false; a format missing from the list hits the assert
+	// and also yields false.
+	bool SamplerCore::has16bitTextureFormat() const
+	{
+		bool result = false;
+
+		switch(state.textureFormat)
+		{
+		case FORMAT_R5G6B5:
+			result = true;
+			break;
+		// 8-bit SNORM formats
+		case FORMAT_R8_SNORM:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+		// 8-bit integer formats
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		// 8-bit color formats
+		case FORMAT_G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		// 8-bit V/U formats
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		// 8-bit alpha / luminance formats
+		case FORMAT_A8:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+		// 16-bit formats
+		case FORMAT_L16:
+		case FORMAT_G16R16:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		// 16-bit integer formats
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		// 32-bit integer formats
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		// 32-bit float formats
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		// Depth formats
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		// YV12 formats
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return result;
+	}
+
+	// Returns true for the formats listed in the first group below (the 8-bit
+	// per-component formats) and false for every other listed format. A format
+	// missing from the list hits the assert and yields false.
+	bool SamplerCore::has8bitTextureComponents() const
+	{
+		bool result = false;
+
+		switch(state.textureFormat)
+		{
+		// 8-bit SNORM formats
+		case FORMAT_R8_SNORM:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+		// 8-bit integer formats
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		// 8-bit color formats
+		case FORMAT_G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		// 8-bit V/U formats
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		// 8-bit alpha / luminance formats
+		case FORMAT_A8:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+			result = true;
+			break;
+		case FORMAT_R5G6B5:
+		// 16-bit formats
+		case FORMAT_L16:
+		case FORMAT_G16R16:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		// 16-bit integer formats
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		// 32-bit integer formats
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		// 32-bit float formats
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		// Depth formats
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		// YV12 formats
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return result;
+	}
+
+	// Returns true for the formats listed in the first group below (the 16-bit
+	// per-component formats) and false for every other listed format. A format
+	// missing from the list hits the assert and yields false.
+	bool SamplerCore::has16bitTextureComponents() const
+	{
+		bool result = false;
+
+		switch(state.textureFormat)
+		{
+		// 16-bit formats
+		case FORMAT_L16:
+		case FORMAT_G16R16:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		// 16-bit integer formats
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+			result = true;
+			break;
+		case FORMAT_R5G6B5:
+		// 8-bit SNORM formats
+		case FORMAT_R8_SNORM:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+		// 8-bit integer formats
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		// 8-bit color formats
+		case FORMAT_G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		// 8-bit V/U formats
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		// 8-bit alpha / luminance formats
+		case FORMAT_A8:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+		// 32-bit integer formats
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		// 32-bit float formats
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		// Depth formats
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		// YV12 formats
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return result;
+	}
+
+	// Returns true for the 32-bit integer formats listed in the first group below
+	// and false for every other listed format. A format missing from the list hits
+	// the assert and yields false.
+	bool SamplerCore::has32bitIntegerTextureComponents() const
+	{
+		bool result = false;
+
+		switch(state.textureFormat)
+		{
+		// 32-bit integer formats
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+			result = true;
+			break;
+		case FORMAT_R5G6B5:
+		// 8-bit SNORM formats
+		case FORMAT_R8_SNORM:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+		// 8-bit integer formats
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		// 8-bit color formats
+		case FORMAT_G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		// 8-bit V/U formats
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		// 8-bit alpha / luminance formats
+		case FORMAT_A8:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+		// 16-bit formats
+		case FORMAT_L16:
+		case FORMAT_G16R16:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		// 16-bit integer formats
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		// 32-bit float formats
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		// Depth formats
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+		// YV12 formats
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return result;
+	}
+
+	// Returns true for the three YV12 formats and false for every other listed
+	// format. A format missing from the list hits the assert and yields false.
+	bool SamplerCore::hasYuvFormat() const
+	{
+		bool result = false;
+
+		switch(state.textureFormat)
+		{
+		// YV12 formats
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			result = true;
+			break;
+		case FORMAT_R5G6B5:
+		// 8-bit SNORM formats
+		case FORMAT_R8_SNORM:
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+		// 8-bit integer formats
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		// 8-bit color formats
+		case FORMAT_G8R8:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		// 8-bit V/U formats
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		// 8-bit alpha / luminance formats
+		case FORMAT_A8:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+		// 16-bit formats
+		case FORMAT_L16:
+		case FORMAT_G16R16:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		// 16-bit integer formats
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		// 32-bit integer formats
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		// 32-bit float formats
+		case FORMAT_R32F:
+		case FORMAT_G32R32F:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		// Depth formats
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return result;
+	}
+
+	// Tells whether the given component index counts as an RGB component for the
+	// sampler's texture format. The formats are grouped by how many leading
+	// components qualify (none, one, two or three). A format missing from the list
+	// hits the assert and yields false.
+	bool SamplerCore::isRGBComponent(int component) const
+	{
+		switch(state.textureFormat)
+		{
+		// No RGB components.
+		case FORMAT_V8U8:
+		case FORMAT_Q8W8V8U8:
+		case FORMAT_X8L8V8U8:
+		case FORMAT_V16U16:
+		case FORMAT_A16W16V16U16:
+		case FORMAT_Q16W16V16U16:
+		case FORMAT_A8:
+		case FORMAT_D32F_LOCKABLE:
+		case FORMAT_D32FS8_TEXTURE:
+		case FORMAT_D32F_SHADOW:
+		case FORMAT_D32FS8_SHADOW:
+			return false;
+
+		// Only the first component.
+		case FORMAT_R8_SNORM:
+		case FORMAT_R8I:
+		case FORMAT_R8UI:
+		case FORMAT_R16I:
+		case FORMAT_R16UI:
+		case FORMAT_R32I:
+		case FORMAT_R32UI:
+		case FORMAT_R32F:
+		case FORMAT_R8:
+		case FORMAT_L8:
+		case FORMAT_A8L8:
+		case FORMAT_L16:
+			return component < 1;
+
+		// The first two components.
+		case FORMAT_G8R8_SNORM:
+		case FORMAT_G8R8I:
+		case FORMAT_G8R8UI:
+		case FORMAT_G16R16I:
+		case FORMAT_G16R16UI:
+		case FORMAT_G32R32I:
+		case FORMAT_G32R32UI:
+		case FORMAT_G32R32F:
+		case FORMAT_G8R8:
+		case FORMAT_G16R16:
+			return component < 2;
+
+		// The first three components.
+		case FORMAT_R5G6B5:
+		case FORMAT_X8B8G8R8_SNORM:
+		case FORMAT_A8B8G8R8_SNORM:
+		case FORMAT_X8B8G8R8I:
+		case FORMAT_X8B8G8R8UI:
+		case FORMAT_A8B8G8R8I:
+		case FORMAT_A8B8G8R8UI:
+		case FORMAT_X16B16G16R16I:
+		case FORMAT_X16B16G16R16UI:
+		case FORMAT_A16B16G16R16I:
+		case FORMAT_A16B16G16R16UI:
+		case FORMAT_X32B32G32R32I:
+		case FORMAT_X32B32G32R32UI:
+		case FORMAT_A32B32G32R32I:
+		case FORMAT_A32B32G32R32UI:
+		case FORMAT_X8R8G8B8:
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8R8G8B8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+		case FORMAT_A16B16G16R16:
+		case FORMAT_X32B32G32R32F:
+		case FORMAT_A32B32G32R32F:
+		case FORMAT_X32B32G32R32F_UNSIGNED:
+		case FORMAT_YV12_BT601:
+		case FORMAT_YV12_BT709:
+		case FORMAT_YV12_JFIF:
+			return component < 3;
+
+		default:
+			ASSERT(false);
+		}
+
+		return false;
+	}
+}
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
new file mode 100644
index 0000000..684c1a7
--- /dev/null
+++ b/src/Pipeline/SamplerCore.hpp
@@ -0,0 +1,116 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SamplerCore_hpp
+#define sw_SamplerCore_hpp
+
+#include "PixelRoutine.hpp"
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+	// Identifies how a texture sampling operation obtains its level of detail or
+	// coordinates. Stored in SamplerFunction::method.
+	enum SamplerMethod
+	{
+		Implicit,  // Compute gradients (pixel shader only).
+		Bias,      // Compute gradients and add provided bias.
+		Lod,       // Use provided LOD.
+		Grad,      // Use provided gradients.
+		Fetch,     // Use provided integer coordinates.
+		Base       // Sample base level.
+	};
+
+	// Optional modifier for a sampling operation. Stored in SamplerFunction::option.
+	enum SamplerOption
+	{
+		None,    // No modifier; the default for SamplerFunction.
+		Offset   // Offset sample location by provided integer coordinates.
+	};
+
+	// Bundles a sampling method with its optional modifier so both can be passed
+	// around as a single value. Both members are fixed at construction.
+	struct SamplerFunction
+	{
+		// Deliberately not explicit: a bare SamplerMethod can be passed wherever a
+		// SamplerFunction is expected, with the option defaulting to None.
+		SamplerFunction(SamplerMethod method, SamplerOption option = None) : method(method), option(option) {}
+
+		// Allows direct comparison against a SamplerMethod (e.g. function == Fetch).
+		// Const-qualified so it also works on const objects and const references.
+		operator SamplerMethod() const { return method; }
+
+		const SamplerMethod method;
+		const SamplerOption option;
+	};
+
+	class SamplerCore   // Declares the texture sampling helpers for one Sampler::State.
+	{
+	public:
+		SamplerCore(Pointer<Byte> &constants, const Sampler::State &state);   // Both arguments are kept by reference (see the members below).
+
+		Vector4s sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy);   // Vector4s result; takes no offset or SamplerFunction.
+		Vector4f sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);   // Vector4f result; takes an offset and the sampler function to apply.
+		static Vector4f textureSize(Pointer<Byte> &mipmap, Float4 &lod);   // Static: needs no SamplerCore instance.
+
+	private:
+		Vector4s sampleTexture(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function, bool fixed12);   // Private overload with an extra 'fixed12' flag.
+
+		void border(Short4 &mask, Float4 &coordinates);
+		void border(Int4 &mask, Float4 &coordinates);
+		Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod);
+		Vector4s sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function);
+		Vector4s sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function);
+		Vector4s sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+		Vector4s sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+		Vector4s sample3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function);
+		Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], SamplerFunction function);
+		Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, SamplerFunction function);
+		Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+		Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function);
+		Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, bool secondLOD, SamplerFunction function);
+		Float log2sqrt(Float lod);
+		Float log2(Float lod);
+		void computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
+		void computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, Float4 &M, SamplerFunction function);
+		void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float &lodBias, Vector4f &dsx, Vector4f &dsy, SamplerFunction function);
+		void cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M);
+		Short4 applyOffset(Short4 &uvw, Float4 &offset, const Int4 &whd, AddressingMode mode);
+		void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, Vector4f &offset, const Pointer<Byte> &mipmap, SamplerFunction function);
+		void computeIndices(UInt index[4], Int4& uuuu, Int4& vvvv, Int4& wwww, const Pointer<Byte> &mipmap, SamplerFunction function);
+		Vector4s sampleTexel(Short4 &u, Short4 &v, Short4 &s, Vector4f &offset, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
+		Vector4s sampleTexel(UInt index[4], Pointer<Byte> buffer[4]);
+		Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &s, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4], SamplerFunction function);
+		void selectMipmap(Pointer<Byte> &texture, Pointer<Byte> buffer[4], Pointer<Byte> &mipmap, Float &lod, Int face[4], bool secondLOD);
+		Short4 address(Float4 &uw, AddressingMode addressingMode, Pointer<Byte>& mipmap);
+		void address(Float4 &uw, Int4& xyz0, Int4& xyz1, Float4& f, Pointer<Byte>& mipmap, Float4 &texOffset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function);
+		Int4 computeFilterOffset(Float &lod);
+
+		void convertFixed12(Short4 &ci, Float4 &cf);
+		void convertFixed12(Vector4s &cs, Vector4f &cf);
+		void convertSigned12(Float4 &cf, Short4 &ci);
+		void convertSigned15(Float4 &cf, Short4 &ci);
+		void convertUnsigned16(Float4 &cf, Short4 &ci);
+		void sRGBtoLinear16_8_16(Short4 &c);
+		void sRGBtoLinear16_6_16(Short4 &c);
+		void sRGBtoLinear16_5_16(Short4 &c);
+
+		bool hasFloatTexture() const;   // The const queries below presumably inspect 'state' — confirm in the implementation.
+		bool hasUnnormalizedIntegerTexture() const;
+		bool hasUnsignedTextureComponent(int component) const;
+		int textureComponentCount() const;
+		bool hasThirdCoordinate() const;
+		bool has16bitTextureFormat() const;
+		bool has8bitTextureComponents() const;
+		bool has16bitTextureComponents() const;
+		bool has32bitIntegerTextureComponents() const;
+		bool hasYuvFormat() const;
+		bool isRGBComponent(int component) const;
+
+		Pointer<Byte> &constants;   // Reference member: the referenced object must outlive this SamplerCore.
+		const Sampler::State &state;   // Reference member: not copied, must outlive this SamplerCore.
+	};
+}
+
+#endif   // sw_SamplerCore_hpp
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
new file mode 100644
index 0000000..d733c2d
--- /dev/null
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -0,0 +1,669 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "SetupRoutine.hpp"
+
+#include "Constants.hpp"
+#include "Renderer/Primitive.hpp"
+#include "Renderer/Polygon.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+	extern bool complementaryDepthBuffer;   // Defined elsewhere; when set, generate() negates the slope depth bias.
+	extern TranscendentalPrecision logPrecision;   // Defined elsewhere; generate() sorts vertices by minimum y only when this is >= WHQL.
+	extern bool leadingVertexFirst;   // Defined elsewhere; picks v0 (true) or v2 (false) as the source of flat attributes.
+
+	SetupRoutine::SetupRoutine(const SetupProcessor::State &state)
+		: state(state), routine(0)   // 'state' is bound by reference; no routine exists until generate() runs.
+	{
+	}
+
+	SetupRoutine::~SetupRoutine()
+	{   // Empty: nothing is released here, 'routine' included.
+	}
+
+	void SetupRoutine::generate()   // Emits the primitive setup function for 'state' and stores the result in 'routine'.
+	{   // Lower-case if/for are evaluated while generating; If/For/Do/Return become part of the emitted routine.
+		Function<Bool(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> function;   // Arguments: primitive, tri, polygon, data. Returns false when the primitive is rejected.
+		{
+			Pointer<Byte> primitive(function.Arg<0>());
+			Pointer<Byte> tri(function.Arg<1>());
+			Pointer<Byte> polygon(function.Arg<2>());
+			Pointer<Byte> data(function.Arg<3>());
+
+			Pointer<Byte> constants = *Pointer<Pointer<Byte> >(data + OFFSET(DrawData,constants));
+
+			const bool point = state.isDrawPoint;
+			const bool sprite = state.pointSprite;
+			const bool line = state.isDrawLine;
+			const bool triangle = state.isDrawSolidTriangle || sprite;
+			const bool solidTriangle = state.isDrawSolidTriangle;
+
+			const int V0 = OFFSET(Triangle,v0);
+			const int V1 = (triangle || line) ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0);   // Points alias v1 to v0.
+			const int V2 = triangle ? OFFSET(Triangle,v2) : (line ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0));   // Lines alias v2 to v1, points to v0.
+
+			int pos = state.positionRegister;
+
+			Pointer<Byte> v0 = tri + V0;
+			Pointer<Byte> v1 = tri + V1;
+			Pointer<Byte> v2 = tri + V2;
+
+			Array<Int> X(16);
+			Array<Int> Y(16);
+
+			X[0] = *Pointer<Int>(v0 + OFFSET(Vertex,X));
+			X[1] = *Pointer<Int>(v1 + OFFSET(Vertex,X));
+			X[2] = *Pointer<Int>(v2 + OFFSET(Vertex,X));
+
+			Y[0] = *Pointer<Int>(v0 + OFFSET(Vertex,Y));
+			Y[1] = *Pointer<Int>(v1 + OFFSET(Vertex,Y));
+			Y[2] = *Pointer<Int>(v2 + OFFSET(Vertex,Y));
+
+			Int d = 1;     // Winding direction
+
+			// Culling
+			if(solidTriangle)
+			{
+				Float x0 = Float(X[0]);
+				Float x1 = Float(X[1]);
+				Float x2 = Float(X[2]);
+
+				Float y0 = Float(Y[0]);
+				Float y1 = Float(Y[1]);
+				Float y2 = Float(Y[2]);
+
+				Float A = (y2 - y0) * x1 + (y1 - y2) * x0 + (y0 - y1) * x2;   // Area
+
+				If(A == 0.0f)
+				{
+					Return(false);
+				}
+
+				Int w0w1w2 = *Pointer<Int>(v0 + pos * 16 + 12) ^   // XOR of the raw bits of the three position w values (read as floats further down);
+							 *Pointer<Int>(v1 + pos * 16 + 12) ^   // the result is negative when an odd number of them have the sign bit set.
+							 *Pointer<Int>(v2 + pos * 16 + 12);
+
+				A = IfThenElse(w0w1w2 < 0, -A, A);
+
+				if(state.cullMode == CULL_CLOCKWISE)
+				{
+					If(A >= 0.0f) Return(false);
+				}
+				else if(state.cullMode == CULL_COUNTERCLOCKWISE)
+				{
+					If(A <= 0.0f) Return(false);
+				}
+
+				d = IfThenElse(A < 0.0f, d, Int(0));   // d stays 1 for negative area, else 0; it selects the edge direction when rasterizing below.
+
+				if(state.twoSidedStencil)
+				{
+					If(A > 0.0f)
+					{
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+					}
+					Else
+					{
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+					}
+				}
+
+				if(state.vFace)
+				{
+					*Pointer<Float>(primitive + OFFSET(Primitive,area)) = 0.5f * A;
+				}
+			}
+			else
+			{
+				if(state.twoSidedStencil)   // Anything but a solid triangle always gets the "clockwise" masks.
+				{
+					*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
+					*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+				}
+			}
+
+			Int n = *Pointer<Int>(polygon + OFFSET(Polygon,n));
+			Int m = *Pointer<Int>(polygon + OFFSET(Polygon,i));
+
+			If(m != 0 || Bool(!solidTriangle))   // Clipped triangle, or anything but a solid triangle; reproject
+			{
+				Pointer<Byte> V = polygon + OFFSET(Polygon,P) + m * sizeof(void*) * 16;   // Row m of Polygon::P, 16 pointers per row.
+
+				Int i = 0;
+
+				Do
+				{
+					Pointer<Float4> p = *Pointer<Pointer<Float4> >(V + i * sizeof(void*));
+					Float4 v = *Pointer<Float4>(p, 16);
+
+					Float w = v.w;
+					Float rhw = IfThenElse(w != 0.0f, 1.0f / w, Float(1.0f));   // Falls back to 1 instead of dividing by zero.
+
+					X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,Wx16)));
+					Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,Hx16)));
+
+					i++;
+				}
+				Until(i >= n)
+			}
+
+			// Vertical range
+			Int yMin = Y[0];
+			Int yMax = Y[0];
+
+			Int i = 1;
+
+			Do
+			{
+				yMin = Min(Y[i], yMin);
+				yMax = Max(Y[i], yMax);
+
+				i++;
+			}
+			Until(i >= n)
+
+			if(state.multiSample > 1)
+			{
+				yMin = (yMin + 0x0A) >> 4;
+				yMax = (yMax + 0x14) >> 4;
+			}
+			else
+			{
+				yMin = (yMin + 0x0F) >> 4;   // Round up to whole rows; coordinates are scaled by 16 (see the 1/16 factors below).
+				yMax = (yMax + 0x0F) >> 4;
+			}
+
+			If(yMin == yMax)
+			{
+				Return(false);
+			}
+
+			yMin = Max(yMin, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
+			yMax = Min(yMax, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
+
+			For(Int q = 0, q < state.multiSample, q++)
+			{
+				Array<Int> Xq(16);
+				Array<Int> Yq(16);
+
+				Int i = 0;
+
+				Do
+				{
+					Xq[i] = X[i];
+					Yq[i] = Y[i];
+
+					if(state.multiSample > 1)
+					{
+						Xq[i] = Xq[i] + *Pointer<Int>(constants + OFFSET(Constants,Xf) + q * sizeof(int));   // Per-sample offset taken from the constants table.
+						Yq[i] = Yq[i] + *Pointer<Int>(constants + OFFSET(Constants,Yf) + q * sizeof(int));
+					}
+
+					i++;
+				}
+				Until(i >= n)
+
+				Pointer<Byte> leftEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->left)) + q * sizeof(Primitive);   // Sample q uses the outline of the q-th Primitive after 'primitive'.
+				Pointer<Byte> rightEdge = Pointer<Byte>(primitive + OFFSET(Primitive,outline->right)) + q * sizeof(Primitive);
+
+				if(state.multiSample > 1)
+				{
+					Int xMin = *Pointer<Int>(data + OFFSET(DrawData, scissorX0));
+					Int xMax = *Pointer<Int>(data + OFFSET(DrawData, scissorX1));
+					Short x = Short(Clamp((X[0] + 0xF) >> 4, xMin, xMax));
+
+					For(Int y = yMin - 1, y < yMax + 1, y++)   // Start every row from yMin - 1 to yMax as an empty span (left == right).
+					{
+						*Pointer<Short>(leftEdge + y * sizeof(Primitive::Span)) = x;
+						*Pointer<Short>(rightEdge + y * sizeof(Primitive::Span)) = x;
+					}
+				}
+
+				Xq[n] = Xq[0];   // Close the polygon so the last edge can refer to vertex n.
+				Yq[n] = Yq[0];
+
+				// Rasterize
+				{
+					Int i = 0;
+
+					Do
+					{
+						edge(primitive, data, Xq[i + 1 - d], Yq[i + 1 - d], Xq[i + d], Yq[i + d], q);   // d == 1 walks edge i -> i + 1, d == 0 walks it reversed.
+
+						i++;
+					}
+					Until(i >= n)
+				}
+
+				if(state.multiSample == 1)
+				{
+					For(, yMin < yMax && *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + yMin * sizeof(Primitive::Span)), yMin++)
+					{
+						// Increments yMin
+					}
+
+					For(, yMax > yMin && *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span)) == *Pointer<Short>(rightEdge + (yMax - 1) * sizeof(Primitive::Span)), yMax--)
+					{
+						// Decrements yMax
+					}
+
+					If(yMin == yMax)
+					{
+						Return(false);
+					}
+
+					*Pointer<Short>(leftEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span));   // Pad one row above and one below the covered range.
+					*Pointer<Short>(rightEdge + (yMin - 1) * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + yMin * sizeof(Primitive::Span));   // NOTE(review): reads leftEdge, so the padding span is empty — presumably intentional.
+					*Pointer<Short>(leftEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));
+					*Pointer<Short>(rightEdge + yMax * sizeof(Primitive::Span)) = *Pointer<Short>(leftEdge + (yMax - 1) * sizeof(Primitive::Span));   // Same: left == right for the row below.
+				}
+			}
+
+			*Pointer<Int>(primitive + OFFSET(Primitive,yMin)) = yMin;
+			*Pointer<Int>(primitive + OFFSET(Primitive,yMax)) = yMax;
+
+			// Sort by minimum y
+			if(solidTriangle && logPrecision >= WHQL)
+			{
+				Float y0 = *Pointer<Float>(v0 + pos * 16 + 4);
+				Float y1 = *Pointer<Float>(v1 + pos * 16 + 4);
+				Float y2 = *Pointer<Float>(v2 + pos * 16 + 4);
+
+				Float yMin = Min(Min(y0, y1), y2);   // Shadows the integer yMin above within this scope.
+
+				conditionalRotate1(yMin == y1, v0, v1, v2);
+				conditionalRotate2(yMin == y2, v0, v1, v2);
+			}
+
+			// Sort by maximum w
+			if(solidTriangle)
+			{
+				Float w0 = *Pointer<Float>(v0 + pos * 16 + 12);
+				Float w1 = *Pointer<Float>(v1 + pos * 16 + 12);
+				Float w2 = *Pointer<Float>(v2 + pos * 16 + 12);
+
+				Float wMax = Max(Max(w0, w1), w2);
+
+				conditionalRotate1(wMax == w1, v0, v1, v2);
+				conditionalRotate2(wMax == w2, v0, v1, v2);
+			}
+
+			Float w0 = *Pointer<Float>(v0 + pos * 16 + 12);   // Re-read: v0..v2 may have been rotated above.
+			Float w1 = *Pointer<Float>(v1 + pos * 16 + 12);
+			Float w2 = *Pointer<Float>(v2 + pos * 16 + 12);
+
+			Float4 w012;
+
+			w012.x = w0;
+			w012.y = w1;
+			w012.z = w2;
+			w012.w = 1;
+
+			Float rhw0 = *Pointer<Float>(v0 + OFFSET(Vertex,W));
+
+			Int X0 = *Pointer<Int>(v0 + OFFSET(Vertex,X));
+			Int X1 = *Pointer<Int>(v1 + OFFSET(Vertex,X));
+			Int X2 = *Pointer<Int>(v2 + OFFSET(Vertex,X));
+
+			Int Y0 = *Pointer<Int>(v0 + OFFSET(Vertex,Y));
+			Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex,Y));
+			Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex,Y));
+
+			if(line)
+			{
+				X2 = X1 + Y1 - Y0;   // Third vertex = v1 plus the v0 -> v1 direction rotated by 90 degrees.
+				Y2 = Y1 + X0 - X1;
+			}
+
+			Float dx = Float(X0) * (1.0f / 16.0f);
+			Float dy = Float(Y0) * (1.0f / 16.0f);
+
+			X1 -= X0;   // From here on X1, Y1, X2, Y2 are relative to vertex 0.
+			Y1 -= Y0;
+
+			X2 -= X0;
+			Y2 -= Y0;
+
+			Float x1 = w1 * (1.0f / 16.0f) * Float(X1);
+			Float y1 = w1 * (1.0f / 16.0f) * Float(Y1);
+
+			Float x2 = w2 * (1.0f / 16.0f) * Float(X2);
+			Float y2 = w2 * (1.0f / 16.0f) * Float(Y2);
+
+			Float a = x1 * y2 - x2 * y1;
+
+			Float4 xQuad = Float4(0, 1, 0, 1) - Float4(dx);
+			Float4 yQuad = Float4(0, 0, 1, 1) - Float4(dy);
+
+			*Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16) = xQuad;
+			*Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16) = yQuad;
+
+			Float4 M[3];   // Passed to setupGradient() to turn per-vertex values into plane equations.
+
+			M[0] = Float4(0, 0, 0, 0);
+			M[1] = Float4(0, 0, 0, 0);
+			M[2] = Float4(0, 0, 0, 0);
+
+			M[0].z = rhw0;
+
+			If(a != 0.0f)   // When a == 0 the x and y terms of M stay zero.
+			{
+				Float A = 1.0f / a;
+				Float D = A * rhw0;
+
+				M[0].x = (y1 * w2 - y2 * w1) * D;
+				M[0].y = (x2 * w1 - x1 * w2) * D;
+			//	M[0].z = rhw0;
+			//	M[0].w = 0;
+
+				M[1].x = y2 * A;
+				M[1].y = -x2 * A;
+			//	M[1].z = 0;
+			//	M[1].w = 0;
+
+				M[2].x = -y1 * A;
+				M[2].y = x1 * A;
+			//	M[2].z = 0;
+			//	M[2].w = 0;
+			}
+
+			if(state.interpolateW)
+			{
+				Float4 ABC = M[0] + M[1] + M[2];
+
+				Float4 A = ABC.x;
+				Float4 B = ABC.y;
+				Float4 C = ABC.z;
+
+				*Pointer<Float4>(primitive + OFFSET(Primitive,w.A), 16) = A;
+				*Pointer<Float4>(primitive + OFFSET(Primitive,w.B), 16) = B;
+				*Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) = C;
+			}
+
+			if(state.interpolateZ)
+			{
+				Float z0 = *Pointer<Float>(v0 + OFFSET(Vertex,Z));
+				Float z1 = *Pointer<Float>(v1 + OFFSET(Vertex,Z));
+				Float z2 = *Pointer<Float>(v2 + OFFSET(Vertex,Z));
+
+				z1 -= z0;
+				z2 -= z0;
+
+				Float4 A;
+				Float4 B;
+				Float4 C;
+
+				if(!point)
+				{
+					Float x1 = Float(X1) * (1.0f / 16.0f);   // Unlike the outer x1/y1/x2/y2, these are not multiplied by w.
+					Float y1 = Float(Y1) * (1.0f / 16.0f);
+					Float x2 = Float(X2) * (1.0f / 16.0f);
+					Float y2 = Float(Y2) * (1.0f / 16.0f);
+
+					Float D = *Pointer<Float>(data + OFFSET(DrawData,depthRange)) / (x1 * y2 - x2 * y1);   // NOTE(review): no guard against a zero denominator here.
+
+					Float a = (y2 * z1 - y1 * z2) * D;
+					Float b = (x1 * z2 - x2 * z1) * D;
+
+					A = Float4(a);
+					B = Float4(b);
+				}
+				else
+				{
+					A = Float4(0, 0, 0, 0);
+					B = Float4(0, 0, 0, 0);
+				}
+
+				*Pointer<Float4>(primitive + OFFSET(Primitive,z.A), 16) = A;
+				*Pointer<Float4>(primitive + OFFSET(Primitive,z.B), 16) = B;
+
+				Float c = z0;
+
+				if(state.isDrawTriangle && state.slopeDepthBias)
+				{
+					Float bias = Max(Abs(Float(A.x)), Abs(Float(B.x)));   // Larger of the two depth slopes.
+					bias *= *Pointer<Float>(data + OFFSET(DrawData,slopeDepthBias));
+
+					if(complementaryDepthBuffer)
+					{
+						bias = -bias;
+					}
+
+					c += bias;
+				}
+
+				C = Float4(c * *Pointer<Float>(data + OFFSET(DrawData,depthRange)) + *Pointer<Float>(data + OFFSET(DrawData,depthNear)));
+
+				*Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) = C;
+			}
+
+			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+			{
+				for(int component = 0; component < 4; component++)
+				{
+					int attribute = state.gradient[interpolant][component].attribute;
+					bool flat = state.gradient[interpolant][component].flat;
+					bool wrap = state.gradient[interpolant][component].wrap;
+
+					if(attribute != Unused)   // Setup code is only emitted for components that are in use.
+					{
+						setupGradient(primitive, tri, w012, M, v0, v1, v2, OFFSET(Vertex,v[attribute][component]), OFFSET(Primitive,V[interpolant][component]), flat, sprite, state.perspective, wrap, component);
+					}
+				}
+			}
+
+			if(state.fog.attribute == Fog)
+			{
+				setupGradient(primitive, tri, w012, M, v0, v1, v2, OFFSET(Vertex,f), OFFSET(Primitive,f), state.fog.flat, false, state.perspective, false, 0);
+			}
+
+			Return(true);
+		}
+
+		routine = function(L"SetupRoutine");   // Finalizes the function under the name "SetupRoutine".
+	}
+
+	void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool sprite, bool perspective, bool wrap, int component)   // Emits code writing three Float4 coefficients for one attribute component at primitive + planeEquation.
+	{
+		Float4 i;   // Per-vertex values of the component in x, y, z.
+
+		if(!flat)
+		{
+			if(!sprite)
+			{
+				i.x = *Pointer<Float>(v0 + attribute);   // 'attribute' is a byte offset into each vertex.
+				i.y = *Pointer<Float>(v1 + attribute);
+				i.z = *Pointer<Float>(v2 + attribute);
+				i.w = 0;
+			}
+			else
+			{
+				if(component == 0) i.x = 0.5f;   // Sprites ignore the vertex data and use fixed values per component.
+				if(component == 1) i.x = 0.5f;
+				if(component == 2) i.x = 0.0f;
+				if(component == 3) i.x = 1.0f;
+
+				if(component == 0) i.y = 1.0f;
+				if(component == 1) i.y = 0.5f;
+				if(component == 2) i.y = 0.0f;
+				if(component == 3) i.y = 1.0f;
+
+				if(component == 0) i.z = 0.5f;
+				if(component == 1) i.z = 1.0f;
+				if(component == 2) i.z = 0.0f;
+				if(component == 3) i.z = 1.0f;
+
+				i.w = 0;
+			}
+
+			if(wrap)
+			{
+				Float m;   // Shadows the 'm' parameter inside this scope only.
+
+				m = *Pointer<Float>(v0 + attribute);
+				m = Max(m, *Pointer<Float>(v1 + attribute));
+				m = Max(m, *Pointer<Float>(v2 + attribute));
+				m -= 0.5f;   // m = largest vertex value minus one half.
+
+				// FIXME: Vectorize
+				If(Float(i.x) < m) i.x = i.x + 1.0f;   // Values more than 0.5 below the maximum are moved up by 1.
+				If(Float(i.y) < m) i.y = i.y + 1.0f;
+				If(Float(i.z) < m) i.z = i.z + 1.0f;
+			}
+
+			if(!perspective)
+			{
+				i *= w012;   // Only without perspective correction: scale each value by its vertex's w.
+			}
+
+			Float4 A = i.xxxx * m[0];   // 'm' is the matrix parameter again here.
+			Float4 B = i.yyyy * m[1];
+			Float4 C = i.zzzz * m[2];
+
+			C = A + B + C;   // x, y, z now hold the three coefficients.
+
+			A = C.xxxx;   // Broadcast each coefficient to all four lanes.
+			B = C.yyyy;
+			C = C.zzzz;
+
+			*Pointer<Float4>(primitive + planeEquation + 0, 16) = A;
+			*Pointer<Float4>(primitive + planeEquation + 16, 16) = B;
+			*Pointer<Float4>(primitive + planeEquation + 32, 16) = C;
+		}
+		else
+		{
+			int leadingVertex = leadingVertexFirst ? OFFSET(Triangle,v0) : OFFSET(Triangle,v2);
+			Float C = *Pointer<Float>(triangle + leadingVertex + attribute);   // Read from 'triangle' directly, not from the possibly rotated v0..v2.
+
+			*Pointer<Float4>(primitive + planeEquation + 0, 16) = Float4(0, 0, 0, 0);   // Flat: both slopes are zero, only the constant term is set.
+			*Pointer<Float4>(primitive + planeEquation + 16, 16) = Float4(0, 0, 0, 0);
+			*Pointer<Float4>(primitive + planeEquation + 32, 16) = Float4(C);
+		}
+	}
+
+	void SetupRoutine::edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q)   // Emits code that walks the edge (Xa,Ya)-(Xb,Yb) and stores one clamped x per row in the outline of sample q.
+	{
+		If(Ya != Yb)   // Edges with equal end y values write nothing.
+		{
+			Bool swap = Yb < Ya;
+
+			Int X1 = IfThenElse(swap, Xb, Xa);   // Order the end points so that Y1 <= Y2.
+			Int X2 = IfThenElse(swap, Xa, Xb);
+			Int Y1 = IfThenElse(swap, Yb, Ya);
+			Int Y2 = IfThenElse(swap, Ya, Yb);
+
+			Int y1 = Max((Y1 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));   // First row: round up, then clamp to the scissor.
+			Int y2 = Min((Y2 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));   // One past the last row.
+
+			If(y1 < y2)
+			{
+				Int xMin = *Pointer<Int>(data + OFFSET(DrawData,scissorX0));
+				Int xMax = *Pointer<Int>(data + OFFSET(DrawData,scissorX1));
+
+				Pointer<Byte> leftEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left);
+				Pointer<Byte> rightEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->right);
+				Pointer<Byte> edge = IfThenElse(swap, rightEdge, leftEdge);   // Swapped edges fill the right outline, the others the left one.
+
+				// Deltas
+				Int DX12 = X2 - X1;
+				Int DY12 = Y2 - Y1;
+
+				Int FDX12 = DX12 << 4;
+				Int FDY12 = DY12 << 4;
+
+				Int X = DX12 * ((y1 << 4) - Y1) + (X1 & 0x0000000F) * DY12;
+				Int x = (X1 >> 4) + X / FDY12;   // Edge
+				Int d = X % FDY12;               // Error-term
+				Int ceil = -d >> 31;             // Ceiling division: remainder <= 0
+				x -= ceil;
+				d -= ceil & FDY12;
+
+				Int Q = FDX12 / FDY12;   // Edge-step
+				Int R = FDX12 % FDY12;   // Error-step
+				Int floor = R >> 31;     // Flooring division: remainder >= 0
+				Q += floor;
+				R += floor & FDY12;
+
+				Int D = FDY12;   // Error-overflow
+				Int y = y1;
+
+				Do
+				{
+					*Pointer<Short>(edge + y * sizeof(Primitive::Span)) = Short(Clamp(x, xMin, xMax));
+
+					x += Q;
+					d += R;
+
+					Int overflow = -d >> 31;   // All ones when d > 0, else 0 (assuming an arithmetic shift).
+
+					d -= D & overflow;
+					x -= overflow;   // Subtracting -1 advances x by one extra step.
+
+					y++;
+				}
+				Until(y >= y2)
+			}
+		}
+	}
+
+	void SetupRoutine::conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2)   // When 'condition' holds, (v0, v1, v2) becomes (v1, v2, v0).
+	{
+		#if 0   // Rely on LLVM optimization
+			If(condition)
+			{
+				Pointer<Byte> vX;
+
+				vX = v0;
+				v0 = v1;
+				v1 = v2;
+				v2 = vX;
+			}
+		#else
+			Pointer<Byte> vX = v0;   // Active variant: three selects, no branch.
+			v0 = IfThenElse(condition, v1, v0);
+			v1 = IfThenElse(condition, v2, v1);
+			v2 = IfThenElse(condition, vX, v2);
+		#endif
+	}
+
+	void SetupRoutine::conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2)   // When 'condition' holds, (v0, v1, v2) becomes (v2, v0, v1).
+	{
+		#if 0   // Rely on LLVM optimization
+			If(condition)
+			{
+				Pointer<Byte> vX;
+
+				vX = v2;
+				v2 = v1;
+				v1 = v0;
+				v0 = vX;
+			}
+		#else
+			Pointer<Byte> vX = v2;   // Active variant: three selects, no branch.
+			v2 = IfThenElse(condition, v1, v2);
+			v1 = IfThenElse(condition, v0, v1);
+			v0 = IfThenElse(condition, vX, v0);
+		#endif
+	}
+
+	Routine *SetupRoutine::getRoutine()   // Accessor for the routine built by generate().
+	{
+		return routine;   // Still 0 if generate() has not assigned it.
+	}
+}
diff --git a/src/Pipeline/SetupRoutine.hpp b/src/Pipeline/SetupRoutine.hpp
new file mode 100644
index 0000000..c1c3205
--- /dev/null
+++ b/src/Pipeline/SetupRoutine.hpp
@@ -0,0 +1,47 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_SetupRoutine_hpp
+#define sw_SetupRoutine_hpp
+
+#include "Renderer/SetupProcessor.hpp"
+#include "Reactor/Reactor.hpp"
+
+namespace sw
+{
+	class Context;   // NOTE(review): not referenced anywhere in this header — confirm it is still needed.
+
+	class SetupRoutine   // Generates the setup routine for one SetupProcessor::State.
+	{
+	public:
+		SetupRoutine(const SetupProcessor::State &state);   // Binds 'state' by reference; it must outlive this object.
+
+		virtual ~SetupRoutine();
+
+		void generate();   // Builds the routine and stores it in 'routine'.
+		Routine *getRoutine();   // Returns 'routine'; 0 until generate() has run.
+
+	private:
+		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool sprite, bool perspective, bool wrap, int component);   // 'flatShading' is named 'flat' in the definition.
+		void edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q);
+		void conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);   // (v0, v1, v2) -> (v1, v2, v0) when condition holds.
+		void conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);   // (v0, v1, v2) -> (v2, v0, v1) when condition holds.
+
+		const SetupProcessor::State &state;   // Reference member: not copied.
+
+		Routine *routine;   // Set by generate(); the destructor does not release it.
+	};
+}
+
+#endif   // sw_SetupRoutine_hpp
diff --git a/src/Pipeline/Shader.cpp b/src/Pipeline/Shader.cpp
new file mode 100644
index 0000000..36192c9
--- /dev/null
+++ b/src/Pipeline/Shader.cpp
@@ -0,0 +1,1927 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Shader.hpp"
+
+#include "VertexShader.hpp"
+#include "PixelShader.hpp"
+#include "Common/Math.hpp"
+#include "Common/Debug.hpp"
+
+#include <set>
+#include <fstream>
+#include <sstream>
+#include <stdarg.h>
+
+namespace sw
+{
+	volatile int Shader::serialCounter = 1;   // Starts at 1. NOTE(review): volatile alone does not make updates atomic — confirm how this is incremented.
+
+	Shader::Opcode Shader::OPCODE_DP(int i)
+	{
+		// Dot product opcode for 'i' components (1 to 4).
+		if(i == 2) return OPCODE_DP2;
+		if(i == 3) return OPCODE_DP3;
+		if(i == 4) return OPCODE_DP4;
+
+		if(i != 1) ASSERT(false);   // Unsupported component count.
+
+		return OPCODE_DP1;   // Also the fallback when the assert does not abort.
+	}
+
+	Shader::Opcode Shader::OPCODE_LEN(int i)
+	{
+		// Length opcode for 'i' components (1 to 4); one component maps to ABS.
+		if(i == 2) return OPCODE_LEN2;
+		if(i == 3) return OPCODE_LEN3;
+		if(i == 4) return OPCODE_LEN4;
+
+		if(i != 1) ASSERT(false);   // Unsupported component count.
+
+		return OPCODE_ABS;   // Also the fallback when the assert does not abort.
+	}
+
+	Shader::Opcode Shader::OPCODE_DIST(int i)
+	{
+		// Distance opcode for 'i' components (1 to 4).
+		if(i == 2) return OPCODE_DIST2;
+		if(i == 3) return OPCODE_DIST3;
+		if(i == 4) return OPCODE_DIST4;
+
+		if(i != 1) ASSERT(false);   // Unsupported component count.
+
+		return OPCODE_DIST1;   // Also the fallback when the assert does not abort.
+	}
+
+	Shader::Opcode Shader::OPCODE_NRM(int i)
+	{
+		// Normalize opcode for 'i' components (1 to 4); one component maps to SGN.
+		if(i == 2) return OPCODE_NRM2;
+		if(i == 3) return OPCODE_NRM3;
+		if(i == 4) return OPCODE_NRM4;
+
+		if(i != 1) ASSERT(false);   // Unsupported component count.
+
+		return OPCODE_SGN;   // Also the fallback when the assert does not abort.
+	}
+
+	Shader::Opcode Shader::OPCODE_FORWARD(int i)
+	{
+		// FORWARD opcode variant for 'i' components (1 to 4).
+		if(i == 2) return OPCODE_FORWARD2;
+		if(i == 3) return OPCODE_FORWARD3;
+		if(i == 4) return OPCODE_FORWARD4;
+
+		if(i != 1) ASSERT(false);   // Unsupported component count.
+
+		return OPCODE_FORWARD1;   // Also the fallback when the assert does not abort.
+	}
+
+	Shader::Opcode Shader::OPCODE_REFLECT(int i)
+	{
+		// REFLECT opcode variant for 'i' components (1 to 4).
+		if(i == 2) return OPCODE_REFLECT2;
+		if(i == 3) return OPCODE_REFLECT3;
+		if(i == 4) return OPCODE_REFLECT4;
+
+		if(i != 1) ASSERT(false);   // Unsupported component count.
+
+		return OPCODE_REFLECT1;   // Also the fallback when the assert does not abort.
+	}
+
+	Shader::Opcode Shader::OPCODE_REFRACT(int i)
+	{
+		// REFRACT opcode variant for 'i' components (1 to 4).
+		if(i == 2) return OPCODE_REFRACT2;
+		if(i == 3) return OPCODE_REFRACT3;
+		if(i == 4) return OPCODE_REFRACT4;
+
+		if(i != 1) ASSERT(false);   // Unsupported component count.
+
+		return OPCODE_REFRACT1;   // Also the fallback when the assert does not abort.
+	}
+
+	Shader::Instruction::Instruction(Opcode opcode) : opcode(opcode), analysis(0)
+	{
+		// Declaration fields; string() only prints these for dcl instructions.
+		usage = USAGE_POSITION;
+		usageIndex = 0;
+		samplerType = SAMPLER_UNKNOWN;
+
+		// Default control value, no co-issue, not predicated.
+		control = CONTROL_RESERVED0;
+		coissue = false;
+		predicateNot = predicate = false;
+		predicateSwizzle = 0xE4;   // presumably the identity (.xyzw) swizzle — TODO confirm
+	}
+
+	Shader::Instruction::Instruction(const unsigned long *token, int size, unsigned char majorVersion) : analysis(0)   // Decodes one instruction; 'size' appears to count the tokens that follow the operation token.
+	{
+		parseOperationToken(*token++, majorVersion);   // Consumed without decrementing 'size'.
+
+		samplerType = SAMPLER_UNKNOWN;
+		usage = USAGE_POSITION;
+		usageIndex = 0;
+
+		if(opcode == OPCODE_IF ||
+		   opcode == OPCODE_IFC ||
+		   opcode == OPCODE_LOOP ||
+		   opcode == OPCODE_REP ||
+		   opcode == OPCODE_BREAKC ||
+		   opcode == OPCODE_BREAKP)   // No destination operand
+		{
+			if(size > 0) parseSourceToken(0, token++, majorVersion);
+			if(size > 1) parseSourceToken(1, token++, majorVersion);
+			if(size > 2) parseSourceToken(2, token++, majorVersion);
+			if(size > 3) ASSERT(false);   // At most three source tokens are accepted for these opcodes.
+		}
+		else if(opcode == OPCODE_DCL)
+		{
+			parseDeclarationToken(*token++);   // NOTE(review): 'size' is not checked on this path.
+			parseDestinationToken(token++, majorVersion);
+		}
+		else
+		{
+			if(size > 0)
+			{
+				parseDestinationToken(token, majorVersion);
+
+				if(dst.rel.type != PARAMETER_VOID && majorVersion >= 3)   // A relatively addressed destination takes one extra token (version 3 and up).
+				{
+					token++;
+					size--;
+				}
+
+				token++;
+				size--;
+			}
+
+			if(predicate)   // Not assigned in this constructor; presumably set by parseOperationToken() — confirm.
+			{
+				ASSERT(size != 0);
+
+				predicateNot = (Modifier)((*token & 0x0F000000) >> 24) == MODIFIER_NOT;   // Bits 24-27 hold the modifier.
+				predicateSwizzle = (unsigned char)((*token & 0x00FF0000) >> 16);   // Bits 16-23 hold the swizzle.
+
+				token++;
+				size--;
+			}
+
+			for(int i = 0; size > 0; i++)   // Every remaining token is a source parameter; NOTE(review): 'i' is not bounded here.
+			{
+				parseSourceToken(i, token, majorVersion);
+
+				token++;
+				size--;
+
+				if(src[i].rel.type != PARAMETER_VOID && majorVersion >= 2)   // A relatively addressed source takes one extra token (version 2 and up).
+				{
+					token++;
+					size--;
+				}
+			}
+		}
+	}
+
+	Shader::Instruction::~Instruction()
+	{   // Empty: nothing is released explicitly here.
+	}
+
+	std::string Shader::Instruction::string(ShaderType shaderType, unsigned short version) const   // Builds the textual (assembly-like) form of this instruction.
+	{
+		std::string instructionString;
+
+		if(opcode != OPCODE_DCL)
+		{
+			instructionString += coissue ? "+ " : "";
+
+			if(predicate)
+			{
+				instructionString += predicateNot ? "(!p0" : "(p0";   // The predicate register is always printed as p0.
+				instructionString += swizzleString(PARAMETER_PREDICATE, predicateSwizzle);
+				instructionString += ") ";
+			}
+
+			instructionString += operationString(version) + controlString() + dst.shiftString() + dst.modifierString();   // Mnemonic followed by its suffixes.
+
+			if(dst.type != PARAMETER_VOID)
+			{
+				instructionString += " " + dst.string(shaderType, version) +
+				                           dst.relativeString() +
+				                           dst.maskString();
+			}
+
+			for(int i = 0; i < 4; i++)   // Up to four source parameters are printed.
+			{
+				if(src[i].type != PARAMETER_VOID)
+				{
+					instructionString += (dst.type != PARAMETER_VOID || i > 0) ? ", " : " ";   // The first printed operand gets a space, later ones a comma.
+					instructionString += src[i].preModifierString() +
+										 src[i].string(shaderType, version) +
+										 src[i].relativeString() +
+										 src[i].postModifierString() +
+										 src[i].swizzleString();
+				}
+			}
+		}
+		else   // DCL
+		{
+			instructionString += "dcl";
+
+			if(dst.type == PARAMETER_SAMPLER)
+			{
+				switch(samplerType)   // Sampler declarations append the texture type.
+				{
+				case SAMPLER_UNKNOWN: instructionString += " ";        break;
+				case SAMPLER_1D:      instructionString += "_1d ";     break;
+				case SAMPLER_2D:      instructionString += "_2d ";     break;
+				case SAMPLER_CUBE:    instructionString += "_cube ";   break;
+				case SAMPLER_VOLUME:  instructionString += "_volume "; break;
+				default:
+					ASSERT(false);
+				}
+
+				instructionString += dst.string(shaderType, version);
+			}
+			else if(dst.type == PARAMETER_INPUT ||
+				    dst.type == PARAMETER_OUTPUT ||
+				    dst.type == PARAMETER_TEXTURE)
+			{
+				if(version >= 0x0300)   // The usage suffix is only printed from version 0x0300 on.
+				{
+					switch(usage)
+					{
+					case USAGE_POSITION:     instructionString += "_position";     break;
+					case USAGE_BLENDWEIGHT:  instructionString += "_blendweight";  break;
+					case USAGE_BLENDINDICES: instructionString += "_blendindices"; break;
+					case USAGE_NORMAL:       instructionString += "_normal";       break;
+					case USAGE_PSIZE:        instructionString += "_psize";        break;
+					case USAGE_TEXCOORD:     instructionString += "_texcoord";     break;
+					case USAGE_TANGENT:      instructionString += "_tangent";      break;
+					case USAGE_BINORMAL:     instructionString += "_binormal";     break;
+					case USAGE_TESSFACTOR:   instructionString += "_tessfactor";   break;
+					case USAGE_POSITIONT:    instructionString += "_positiont";    break;
+					case USAGE_COLOR:        instructionString += "_color";        break;
+					case USAGE_FOG:          instructionString += "_fog";          break;
+					case USAGE_DEPTH:        instructionString += "_depth";        break;
+					case USAGE_SAMPLE:       instructionString += "_sample";       break;
+					default:
+						ASSERT(false);
+					}
+
+					if(usageIndex > 0)   // Index 0 is left implicit.
+					{
+						std::ostringstream buffer;
+
+						buffer << (int)usageIndex;
+
+						instructionString += buffer.str();
+					}
+				}
+				else ASSERT(dst.type != PARAMETER_OUTPUT);   // Output declarations are not expected below version 0x0300.
+
+				instructionString += " ";
+
+				instructionString += dst.string(shaderType, version);
+				instructionString += dst.maskString();
+			}
+			else if(dst.type == PARAMETER_MISCTYPE)   // vPos and vFace
+			{
+				instructionString += " ";
+
+				instructionString += dst.string(shaderType, version);
+			}
+			else ASSERT(false);
+		}
+
+		return instructionString;
+	}
+
+	std::string Shader::DestinationParameter::modifierString() const
+	{
+		// Builds the instruction-modifier suffix for this destination operand.
+		// The suffixes are concatenated in a fixed order: "_sat", "_pp", "_centroid".
+		std::string suffix;
+
+		// Void and label operands never carry modifiers.
+		const bool hasModifiers = !(type == PARAMETER_VOID || type == PARAMETER_LABEL);
+
+		if(hasModifiers)
+		{
+			if(saturate)
+			{
+				suffix.append("_sat");
+			}
+
+			if(partialPrecision)
+			{
+				suffix.append("_pp");
+			}
+
+			if(centroid) suffix.append("_centroid");
+		}
+
+		return suffix;
+	}
+
+	std::string Shader::DestinationParameter::shiftString() const
+	{
+		// Result-shift suffix: positive shifts print as "_x2".."_x8", negative
+		// shifts as "_d2".."_d8". Void/label operands, a zero shift and any
+		// out-of-range shift all produce an empty string.
+		const bool unshiftable = (type == PARAMETER_VOID) || (type == PARAMETER_LABEL);
+
+		if(unshiftable || shift < -3 || shift > 3)
+		{
+			return "";   // FIXME: out-of-range shifts should ASSERT(false)
+		}
+
+		static const char *const suffix[7] =
+		{
+			"_d8", "_d4", "_d2",   // shift == -3, -2, -1
+			"",                    // shift == 0
+			"_x2", "_x4", "_x8",   // shift == 1, 2, 3
+		};
+
+		return suffix[shift + 3];
+	}
+
+	std::string Shader::DestinationParameter::maskString() const
+	{
+		// Write-mask suffix such as ".xz". Nothing is printed for void/label
+		// operands, for an empty mask, or for the full mask (0xF), which is
+		// what an instruction without an explicit mask looks like.
+		if(type == PARAMETER_VOID || type == PARAMETER_LABEL)
+		{
+			return "";
+		}
+
+		if(mask > 0xF)
+		{
+			ASSERT(false);   // Only the low four bits are valid
+
+			return "";
+		}
+
+		if(mask == 0x0 || mask == 0xF)
+		{
+			return "";
+		}
+
+		std::string components = ".";
+
+		if(mask & 0x1) components += "x";
+		if(mask & 0x2) components += "y";
+		if(mask & 0x4) components += "z";
+		if(mask & 0x8) components += "w";
+
+		return components;
+	}
+
+	std::string Shader::SourceParameter::preModifierString() const
+	{
+		// Prefix printed before a source operand: "-" for negating modifiers, "1-" for complement, "!" for not.
+		if(type == PARAMETER_VOID)
+		{
+			return "";
+		}
+
+		switch(modifier)
+		{
+		case MODIFIER_NEGATE:
+		case MODIFIER_BIAS_NEGATE:
+		case MODIFIER_SIGN_NEGATE:
+		case MODIFIER_X2_NEGATE:
+		case MODIFIER_ABS_NEGATE:
+			return "-";
+		case MODIFIER_COMPLEMENT:
+			return "1-";
+		case MODIFIER_NOT:
+			return "!";
+		case MODIFIER_NONE: case MODIFIER_BIAS: case MODIFIER_SIGN: case MODIFIER_X2:
+		case MODIFIER_DZ:   case MODIFIER_DW:   case MODIFIER_ABS:
+			return "";
+		default:
+			ASSERT(false);
+		}
+
+		return "";
+	}
+
+	std::string Shader::Parameter::relativeString() const
+	{
+		// Relative-addressing suffix, e.g. "[a0.x]", "[r3.y]", "[c1.z]" or "[aL]".
+		// Only constant, input, output and temporary registers are given a
+		// suffix; every other register type yields an empty string.
+		const bool indexable = type == PARAMETER_CONST ||
+		                       type == PARAMETER_INPUT ||
+		                       type == PARAMETER_OUTPUT ||
+		                       type == PARAMETER_TEMP;
+
+		if(!indexable || rel.type == PARAMETER_VOID)
+		{
+			return "";
+		}
+
+		// Loop counter: printed without a component
+		if(rel.type == PARAMETER_LOOP)
+		{
+			return "[aL]";
+		}
+
+		// The low two swizzle bits select the component of the index register.
+		static const char *const component[4] = {".x]", ".y]", ".z]", ".w]"};
+		const char *selected = component[rel.swizzle & 0x03];
+
+		// Address register
+		if(rel.type == PARAMETER_ADDR)
+		{
+			return std::string("[a0") + selected;
+		}
+
+		// Temporary register
+		if(rel.type == PARAMETER_TEMP)
+		{
+			std::ostringstream number;
+			number << rel.index;
+
+			return "[r" + number.str() + selected;
+		}
+
+		// Constant register
+		if(rel.type == PARAMETER_CONST)
+		{
+			std::ostringstream number;
+			number << rel.index;
+
+			return "[c" + number.str() + selected;
+		}
+
+		ASSERT(false);   // Unsupported index register type
+
+		return "";
+	}
+
+	std::string Shader::SourceParameter::postModifierString() const
+	{
+		// Suffix printed after a source operand. A negated modifier uses the
+		// same suffix as its plain form.
+		if(type == PARAMETER_VOID) return "";
+
+		switch(modifier)
+		{
+		case MODIFIER_BIAS:
+		case MODIFIER_BIAS_NEGATE: return "_bias";
+		case MODIFIER_SIGN:
+		case MODIFIER_SIGN_NEGATE: return "_bx2";
+		case MODIFIER_X2:
+		case MODIFIER_X2_NEGATE:   return "_x2";
+		case MODIFIER_ABS:
+		case MODIFIER_ABS_NEGATE:  return "_abs";
+		case MODIFIER_DZ:          return "_dz";
+		case MODIFIER_DW:          return "_dw";
+		// These modifiers have no suffix form.
+		case MODIFIER_NONE:
+		case MODIFIER_NEGATE:
+		case MODIFIER_COMPLEMENT:
+		case MODIFIER_NOT:         return "";
+		default:
+			ASSERT(false);
+		}
+
+		return "";
+	}
+
+	std::string Shader::SourceParameter::string(ShaderType shaderType, unsigned short version) const
+	{
+		// Constants with a non-negative buffer index are printed as
+		// "cb<buffer>[<offset>]"; everything else uses the generic register name.
+		const bool inConstantBuffer = (type == PARAMETER_CONST) && (bufferIndex >= 0);
+
+		if(!inConstantBuffer)
+		{
+			return Parameter::string(shaderType, version);
+		}
+
+		std::ostringstream text;
+
+		text << "cb" << bufferIndex << "[" << index << "]";
+
+		return text.str();
+	}
+
+	std::string Shader::SourceParameter::swizzleString() const
+	{   // Formats this operand's swizzle by delegating to the static Instruction::swizzleString helper.
+		return Instruction::swizzleString(type, swizzle);
+	}
+
+	void Shader::Instruction::parseOperationToken(unsigned long token, unsigned char majorVersion)
+	{
+		// Decodes the leading token of an instruction. A version token (upper
+		// half 0xFFFF or 0xFFFE) is stored whole in 'opcode'; any other token is
+		// split into opcode, control, predicate and co-issue fields, and its
+		// reserved bits are checked.
+		const unsigned long upperHalf = token & 0xFFFF0000;
+
+		if(upperHalf == 0xFFFF0000 || upperHalf == 0xFFFE0000)   // Version token
+		{
+			opcode = (Opcode)token;
+			control = CONTROL_RESERVED0;
+			predicate = false;
+			coissue = false;
+
+			return;
+		}
+
+		opcode = (Opcode)(token & 0x0000FFFF);
+		control = (Control)((token & 0x00FF0000) >> 16);
+		predicate = (token & 0x10000000) != 0;
+		coissue = (token & 0x40000000) != 0;
+
+		const int size = (token & 0x0F000000) >> 24;
+
+		if(majorVersion < 2)
+		{
+			// Before major version 2 the size field is reserved and
+			// instructions must not be predicated.
+			if(size != 0)
+			{
+				ASSERT(false);   // Reserved
+			}
+
+			if(predicate)
+			{
+				ASSERT(false);
+			}
+		}
+
+		if((token & 0x20000000) != 0)
+		{
+			ASSERT(false);   // Reserved
+		}
+
+		if(majorVersion >= 2 && coissue)
+		{
+			ASSERT(false);   // Reserved
+		}
+
+		if((token & 0x80000000) != 0)
+		{
+			ASSERT(false);
+		}
+	}
+
+	void Shader::Instruction::parseDeclarationToken(unsigned long token)
+	{   // Unpacks a DCL token: usage in bits 0-4, usage index in bits 16-19, sampler type in bits 27-30.
+		usage = (Usage)(token & 0x0000001F);
+		usageIndex = (unsigned char)((token >> 16) & 0x0000000F);
+		samplerType = (SamplerType)((token >> 27) & 0x0000000F);
+	}
+
+	void Shader::Instruction::parseDestinationToken(const unsigned long *token, unsigned char majorVersion)
+	{   // Decodes a destination parameter token into 'dst'; token[1] is only read for relative addressing with majorVersion >= 3.
+		dst.index = (unsigned short)(token[0] & 0x000007FF);   // Bits 0-10: register number
+		dst.type = (ParameterType)(((token[0] & 0x00001800) >> 8) | ((token[0] & 0x70000000) >> 28));   // Bits 28-30 become type bits 0-2, bits 11-12 become type bits 3-4
+
+		// TODO: Check type and index range
+
+		bool relative = (token[0] & 0x00002000) != 0x00000000;   // Bit 13: relative addressing
+		dst.rel.type = relative ? PARAMETER_ADDR : PARAMETER_VOID;
+		dst.rel.swizzle = 0x00;
+		dst.rel.scale = 1;
+
+		if(relative && majorVersion >= 3)
+		{
+			dst.rel.type = (ParameterType)(((token[1] & 0x00001800) >> 8) | ((token[1] & 0x70000000) >> 28));   // Same type encoding as token[0]
+			dst.rel.swizzle = (unsigned char)((token[1] & 0x00FF0000) >> 16);   // Bits 16-23 of the second token
+		}
+		else if(relative) ASSERT(false);   // Reserved
+
+		if((token[0] & 0x0000C000) != 0x00000000)   // Bits 14-15
+		{
+			ASSERT(false);   // Reserved
+		}
+
+		dst.mask = (unsigned char)((token[0] & 0x000F0000) >> 16);   // Bits 16-19: write mask
+		dst.saturate = (token[0] & 0x00100000) != 0;   // Bit 20
+		dst.partialPrecision = (token[0] & 0x00200000) != 0;   // Bit 21
+		dst.centroid = (token[0] & 0x00400000) != 0;   // Bit 22
+		dst.shift = (signed char)((token[0] & 0x0F000000) >> 20) >> 4;   // Bits 24-27 land in the high nibble of a signed char; the final >> 4 then relies on arithmetic shift to sign-extend
+
+		if(majorVersion >= 2)
+		{
+			if(dst.shift)
+			{
+				ASSERT(false);   // Reserved
+			}
+		}
+
+		if((token[0] & 0x80000000) != 0x80000000)   // Bit 31 is expected to be set
+		{
+			ASSERT(false);
+		}
+	}
+
+	void Shader::Instruction::parseSourceToken(int i, const unsigned long *token, unsigned char majorVersion)
+	{   // Decodes source operand 'i' from 'token'; for DEF/DEFB/DEFI the token is a raw literal that is stored into src[0].
+		// Defaults
+		src[i].index = 0;
+		src[i].type = PARAMETER_VOID;
+		src[i].modifier = MODIFIER_NONE;
+		src[i].swizzle = 0xE4;   // The value swizzleString() prints as no swizzle (x, y, z, w in order)
+		src[i].rel.type = PARAMETER_VOID;
+		src[i].rel.swizzle = 0x00;
+		src[i].rel.scale = 1;
+
+		switch(opcode)
+		{
+		case OPCODE_DEF:
+			src[0].type = PARAMETER_FLOAT4LITERAL;
+			src[0].value[i] = *(float*)token;   // Token i is reinterpreted as component i of the float literal
+			break;
+		case OPCODE_DEFB:
+			src[0].type = PARAMETER_BOOL1LITERAL;
+			src[0].boolean[0] = *(int*)token;   // Always written to element 0, regardless of i
+			break;
+		case OPCODE_DEFI:
+			src[0].type = PARAMETER_INT4LITERAL;
+			src[0].integer[i] = *(int*)token;   // Token i is reinterpreted as component i of the integer literal
+			break;
+		default:
+			src[i].index = (unsigned short)(token[0] & 0x000007FF);   // Bits 0-10: register number
+			src[i].type = (ParameterType)(((token[0] & 0x00001800) >> 8) | ((token[0] & 0x70000000) >> 28));   // Bits 28-30 become type bits 0-2, bits 11-12 become type bits 3-4
+
+			// FIXME: Check type and index range
+
+			bool relative = (token[0] & 0x00002000) != 0x00000000;   // Bit 13: relative addressing
+			src[i].rel.type = relative ? PARAMETER_ADDR : PARAMETER_VOID;
+
+			if((token[0] & 0x0000C000) != 0x00000000)   // Bits 14-15 must be clear
+			{
+				if(opcode != OPCODE_DEF &&   // Always true here: the DEF* opcodes are handled by the cases above
+				   opcode != OPCODE_DEFI &&
+				   opcode != OPCODE_DEFB)
+				{
+					ASSERT(false);
+				}
+			}
+
+			src[i].swizzle = (unsigned char)((token[0] & 0x00FF0000) >> 16);   // Bits 16-23
+			src[i].modifier = (Modifier)((token[0] & 0x0F000000) >> 24);   // Bits 24-27
+
+			if((token[0] & 0x80000000) != 0x80000000)   // Bit 31 is expected to be set
+			{
+				if(opcode != OPCODE_DEF &&   // Always true here, as above
+				   opcode != OPCODE_DEFI &&
+				   opcode != OPCODE_DEFB)
+				{
+					ASSERT(false);
+				}
+			}
+
+			if(relative && majorVersion >= 2)   // Below major version 2 rel.type stays PARAMETER_ADDR and token[1] is not read
+			{
+				src[i].rel.type = (ParameterType)(((token[1] & 0x00001800) >> 8) | ((token[1] & 0x70000000) >> 28));
+				src[i].rel.swizzle = (unsigned char)((token[1] & 0x00FF0000) >> 16);
+			}
+		}
+	}
+
+	std::string Shader::Instruction::swizzleString(ParameterType type, unsigned char swizzle)
+	{
+		// Formats a source swizzle as text such as ".xy" or ".wzyx".
+		//
+		// Each pair of bits in 'swizzle' selects one component, lowest pair first.
+		// Trailing components that repeat the one before them are left out, so a
+		// fully replicated swizzle prints a single letter. Void/label operands and
+		// the swizzle 0xE4 (x, y, z, w in order) print nothing.
+		if(swizzle == 0xE4)
+		{
+			return "";
+		}
+
+		if(type == PARAMETER_VOID || type == PARAMETER_LABEL)
+		{
+			return "";
+		}
+
+		static const char componentName[4] = {'x', 'y', 'z', 'w'};
+
+		const int selector[4] =
+		{
+			(swizzle & 0x03) >> 0,
+			(swizzle & 0x0C) >> 2,
+			(swizzle & 0x30) >> 4,
+			(swizzle & 0xC0) >> 6,
+		};
+
+		// Number of leading components that have to be printed.
+		int count;
+
+		if(selector[0] == selector[1] && selector[1] == selector[2] && selector[2] == selector[3])
+		{
+			count = 1;
+		}
+		else if(selector[1] == selector[2] && selector[2] == selector[3])
+		{
+			count = 2;
+		}
+		else if(selector[2] == selector[3])
+		{
+			count = 3;
+		}
+		else
+		{
+			count = 4;
+		}
+
+		std::string text = ".";
+
+		for(int i = 0; i < count; i++)
+		{
+			text += componentName[selector[i]];
+		}
+
+		return text;
+	}
+
+	std::string Shader::Instruction::operationString(unsigned short version) const
+	{   // Returns the text mnemonic for 'opcode'; 'version' only influences OPCODE_TEX. Unknown opcodes assert and yield "<unknown>".
+		switch(opcode)
+		{
+		case OPCODE_NULL:            return "null";
+		case OPCODE_NOP:             return "nop";
+		case OPCODE_MOV:             return "mov";
+		case OPCODE_ADD:             return "add";
+		case OPCODE_IADD:            return "iadd";
+		case OPCODE_SUB:             return "sub";
+		case OPCODE_ISUB:            return "isub";
+		case OPCODE_MAD:             return "mad";
+		case OPCODE_IMAD:            return "imad";
+		case OPCODE_MUL:             return "mul";
+		case OPCODE_IMUL:            return "imul";
+		case OPCODE_RCPX:            return "rcpx";
+		case OPCODE_DIV:             return "div";
+		case OPCODE_IDIV:            return "idiv";
+		case OPCODE_UDIV:            return "udiv";
+		case OPCODE_MOD:             return "mod";
+		case OPCODE_IMOD:            return "imod";
+		case OPCODE_UMOD:            return "umod";
+		case OPCODE_SHL:             return "shl";
+		case OPCODE_ISHR:            return "ishr";
+		case OPCODE_USHR:            return "ushr";
+		case OPCODE_RSQX:            return "rsqx";
+		case OPCODE_SQRT:            return "sqrt";
+		case OPCODE_RSQ:             return "rsq";
+		case OPCODE_LEN2:            return "len2";
+		case OPCODE_LEN3:            return "len3";
+		case OPCODE_LEN4:            return "len4";
+		case OPCODE_DIST1:           return "dist1";
+		case OPCODE_DIST2:           return "dist2";
+		case OPCODE_DIST3:           return "dist3";
+		case OPCODE_DIST4:           return "dist4";
+		case OPCODE_DP3:             return "dp3";
+		case OPCODE_DP4:             return "dp4";
+		case OPCODE_DET2:            return "det2";
+		case OPCODE_DET3:            return "det3";
+		case OPCODE_DET4:            return "det4";
+		case OPCODE_MIN:             return "min";
+		case OPCODE_IMIN:            return "imin";
+		case OPCODE_UMIN:            return "umin";
+		case OPCODE_MAX:             return "max";
+		case OPCODE_IMAX:            return "imax";
+		case OPCODE_UMAX:            return "umax";
+		case OPCODE_SLT:             return "slt";
+		case OPCODE_SGE:             return "sge";
+		case OPCODE_EXP2X:           return "exp2x";
+		case OPCODE_LOG2X:           return "log2x";
+		case OPCODE_LIT:             return "lit";
+		case OPCODE_ATT:             return "att";
+		case OPCODE_LRP:             return "lrp";
+		case OPCODE_STEP:            return "step";
+		case OPCODE_SMOOTH:          return "smooth";
+		case OPCODE_FLOATBITSTOINT:  return "floatBitsToInt";
+		case OPCODE_FLOATBITSTOUINT: return "floatBitsToUInt";
+		case OPCODE_INTBITSTOFLOAT:  return "intBitsToFloat";
+		case OPCODE_UINTBITSTOFLOAT: return "uintBitsToFloat";
+		case OPCODE_PACKSNORM2x16:   return "packSnorm2x16";
+		case OPCODE_PACKUNORM2x16:   return "packUnorm2x16";
+		case OPCODE_PACKHALF2x16:    return "packHalf2x16";
+		case OPCODE_UNPACKSNORM2x16: return "unpackSnorm2x16";
+		case OPCODE_UNPACKUNORM2x16: return "unpackUnorm2x16";
+		case OPCODE_UNPACKHALF2x16:  return "unpackHalf2x16";
+		case OPCODE_FRC:             return "frc";
+		case OPCODE_M4X4:            return "m4x4";
+		case OPCODE_M4X3:            return "m4x3";
+		case OPCODE_M3X4:            return "m3x4";
+		case OPCODE_M3X3:            return "m3x3";
+		case OPCODE_M3X2:            return "m3x2";
+		case OPCODE_CALL:            return "call";
+		case OPCODE_CALLNZ:          return "callnz";
+		case OPCODE_LOOP:            return "loop";
+		case OPCODE_RET:             return "ret";
+		case OPCODE_ENDLOOP:         return "endloop";
+		case OPCODE_LABEL:           return "label";
+		case OPCODE_DCL:             return "dcl";
+		case OPCODE_POWX:            return "powx";
+		case OPCODE_CRS:             return "crs";
+		case OPCODE_SGN:             return "sgn";
+		case OPCODE_ISGN:            return "isgn";
+		case OPCODE_ABS:             return "abs";
+		case OPCODE_IABS:            return "iabs";
+		case OPCODE_NRM2:            return "nrm2";
+		case OPCODE_NRM3:            return "nrm3";
+		case OPCODE_NRM4:            return "nrm4";
+		case OPCODE_SINCOS:          return "sincos";
+		case OPCODE_REP:             return "rep";
+		case OPCODE_ENDREP:          return "endrep";
+		case OPCODE_IF:              return "if";
+		case OPCODE_IFC:             return "ifc";
+		case OPCODE_ELSE:            return "else";
+		case OPCODE_ENDIF:           return "endif";
+		case OPCODE_BREAK:           return "break";
+		case OPCODE_BREAKC:          return "breakc";
+		case OPCODE_MOVA:            return "mova";
+		case OPCODE_DEFB:            return "defb";
+		case OPCODE_DEFI:            return "defi";
+		case OPCODE_TEXCOORD:        return "texcoord";
+		case OPCODE_TEXKILL:         return "texkill";
+		case OPCODE_DISCARD:         return "discard";
+		case OPCODE_TEX:   // The mnemonic depends on the shader version
+			if(version < 0x0104)     return "tex";
+			else                     return "texld";
+		case OPCODE_TEXBEM:          return "texbem";
+		case OPCODE_TEXBEML:         return "texbeml";
+		case OPCODE_TEXREG2AR:       return "texreg2ar";
+		case OPCODE_TEXREG2GB:       return "texreg2gb";
+		case OPCODE_TEXM3X2PAD:      return "texm3x2pad";
+		case OPCODE_TEXM3X2TEX:      return "texm3x2tex";
+		case OPCODE_TEXM3X3PAD:      return "texm3x3pad";
+		case OPCODE_TEXM3X3TEX:      return "texm3x3tex";
+		case OPCODE_RESERVED0:       return "reserved0";
+		case OPCODE_TEXM3X3SPEC:     return "texm3x3spec";
+		case OPCODE_TEXM3X3VSPEC:    return "texm3x3vspec";
+		case OPCODE_EXPP:            return "expp";
+		case OPCODE_LOGP:            return "logp";
+		case OPCODE_CND:             return "cnd";
+		case OPCODE_DEF:             return "def";
+		case OPCODE_TEXREG2RGB:      return "texreg2rgb";
+		case OPCODE_TEXDP3TEX:       return "texdp3tex";
+		case OPCODE_TEXM3X2DEPTH:    return "texm3x2depth";
+		case OPCODE_TEXDP3:          return "texdp3";
+		case OPCODE_TEXM3X3:         return "texm3x3";
+		case OPCODE_TEXDEPTH:        return "texdepth";
+		case OPCODE_CMP0:            return "cmp0";
+		case OPCODE_ICMP:            return "icmp";
+		case OPCODE_UCMP:            return "ucmp";
+		case OPCODE_SELECT:          return "select";
+		case OPCODE_EXTRACT:         return "extract";
+		case OPCODE_INSERT:          return "insert";
+		case OPCODE_BEM:             return "bem";
+		case OPCODE_DP2ADD:          return "dp2add";
+		case OPCODE_DFDX:            return "dFdx";
+		case OPCODE_DFDY:            return "dFdy";
+		case OPCODE_FWIDTH:          return "fwidth";
+		case OPCODE_TEXLDD:          return "texldd";
+		case OPCODE_CMP:             return "cmp";
+		case OPCODE_TEXLDL:          return "texldl";
+		case OPCODE_TEXBIAS:         return "texbias";
+		case OPCODE_TEXOFFSET:       return "texoffset";
+		case OPCODE_TEXOFFSETBIAS:   return "texoffsetbias";
+		case OPCODE_TEXLODOFFSET:    return "texlodoffset";
+		case OPCODE_TEXELFETCH:      return "texelfetch";
+		case OPCODE_TEXELFETCHOFFSET: return "texelfetchoffset";
+		case OPCODE_TEXGRAD:         return "texgrad";
+		case OPCODE_TEXGRADOFFSET:   return "texgradoffset";
+		case OPCODE_BREAKP:          return "breakp";
+		case OPCODE_TEXSIZE:         return "texsize";
+		case OPCODE_PHASE:           return "phase";
+		case OPCODE_COMMENT:         return "comment";
+		case OPCODE_END:             return "end";
+		case OPCODE_PS_1_0:          return "ps_1_0";
+		case OPCODE_PS_1_1:          return "ps_1_1";
+		case OPCODE_PS_1_2:          return "ps_1_2";
+		case OPCODE_PS_1_3:          return "ps_1_3";
+		case OPCODE_PS_1_4:          return "ps_1_4";
+		case OPCODE_PS_2_0:          return "ps_2_0";
+		case OPCODE_PS_2_x:          return "ps_2_x";
+		case OPCODE_PS_3_0:          return "ps_3_0";
+		case OPCODE_VS_1_0:          return "vs_1_0";
+		case OPCODE_VS_1_1:          return "vs_1_1";
+		case OPCODE_VS_2_0:          return "vs_2_0";
+		case OPCODE_VS_2_x:          return "vs_2_x";
+		case OPCODE_VS_2_sw:         return "vs_2_sw";
+		case OPCODE_VS_3_0:          return "vs_3_0";
+		case OPCODE_VS_3_sw:         return "vs_3_sw";
+		case OPCODE_WHILE:           return "while";
+		case OPCODE_ENDWHILE:        return "endwhile";
+		case OPCODE_COS:             return "cos";
+		case OPCODE_SIN:             return "sin";
+		case OPCODE_TAN:             return "tan";
+		case OPCODE_ACOS:            return "acos";
+		case OPCODE_ASIN:            return "asin";
+		case OPCODE_ATAN:            return "atan";
+		case OPCODE_ATAN2:           return "atan2";
+		case OPCODE_COSH:            return "cosh";
+		case OPCODE_SINH:            return "sinh";
+		case OPCODE_TANH:            return "tanh";
+		case OPCODE_ACOSH:           return "acosh";
+		case OPCODE_ASINH:           return "asinh";
+		case OPCODE_ATANH:           return "atanh";
+		case OPCODE_DP1:             return "dp1";
+		case OPCODE_DP2:             return "dp2";
+		case OPCODE_TRUNC:           return "trunc";
+		case OPCODE_FLOOR:           return "floor";
+		case OPCODE_ROUND:           return "round";
+		case OPCODE_ROUNDEVEN:       return "roundEven";
+		case OPCODE_CEIL:            return "ceil";
+		case OPCODE_EXP2:            return "exp2";
+		case OPCODE_LOG2:            return "log2";
+		case OPCODE_EXP:             return "exp";
+		case OPCODE_LOG:             return "log";
+		case OPCODE_POW:             return "pow";
+		case OPCODE_F2B:             return "f2b";
+		case OPCODE_B2F:             return "b2f";
+		case OPCODE_F2I:             return "f2i";
+		case OPCODE_I2F:             return "i2f";
+		case OPCODE_F2U:             return "f2u";
+		case OPCODE_U2F:             return "u2f";
+		case OPCODE_B2I:             return "b2i";
+		case OPCODE_I2B:             return "i2b";
+		case OPCODE_ALL:             return "all";
+		case OPCODE_ANY:             return "any";
+		case OPCODE_NEG:             return "neg";
+		case OPCODE_INEG:            return "ineg";
+		case OPCODE_ISNAN:           return "isnan";
+		case OPCODE_ISINF:           return "isinf";
+		case OPCODE_NOT:             return "not";
+		case OPCODE_OR:              return "or";
+		case OPCODE_XOR:             return "xor";
+		case OPCODE_AND:             return "and";
+		case OPCODE_EQ:              return "eq";
+		case OPCODE_NE:              return "neq";   // Note: printed as "neq", not "ne"
+		case OPCODE_FORWARD1:        return "forward1";
+		case OPCODE_FORWARD2:        return "forward2";
+		case OPCODE_FORWARD3:        return "forward3";
+		case OPCODE_FORWARD4:        return "forward4";
+		case OPCODE_REFLECT1:        return "reflect1";
+		case OPCODE_REFLECT2:        return "reflect2";
+		case OPCODE_REFLECT3:        return "reflect3";
+		case OPCODE_REFLECT4:        return "reflect4";
+		case OPCODE_REFRACT1:        return "refract1";
+		case OPCODE_REFRACT2:        return "refract2";
+		case OPCODE_REFRACT3:        return "refract3";
+		case OPCODE_REFRACT4:        return "refract4";
+		case OPCODE_LEAVE:           return "leave";
+		case OPCODE_CONTINUE:        return "continue";
+		case OPCODE_TEST:            return "test";
+		case OPCODE_SWITCH:          return "switch";
+		case OPCODE_ENDSWITCH:       return "endswitch";
+		default:   // Opcode without a mnemonic in this table
+			ASSERT(false);
+		}
+
+		return "<unknown>";
+	}
+
+	std::string Shader::Instruction::controlString() const
+	{
+		// Control suffix of an instruction: "p" or "b" when the project/bias flag
+		// is set, otherwise a comparison suffix chosen by 'control'.
+		const bool comparisonOnly = opcode == OPCODE_LOOP || opcode == OPCODE_BREAKC ||
+		                            opcode == OPCODE_IFC || opcode == OPCODE_CMP;
+
+		if(!comparisonOnly)
+		{
+			if(project) return "p";
+			if(bias) return "b";
+
+			// FIXME: LOD
+		}
+
+		static const char *const comparison[7] = {"", "_gt", "_eq", "_ge", "_lt", "_ne", "_le"};
+
+		if(control >= 1 && control <= 6)
+		{
+			return comparison[control];
+		}
+
+		return "";   // FIXME: should ASSERT(false) for unknown control values
+	}
+
+	std::string Shader::Parameter::string(ShaderType shaderType, unsigned short version) const
+	{
+		// Float4 literals print their four values in braces. Registers print
+		// their type prefix, followed by the register number except for the
+		// types that are listed as unnumbered below.
+		std::ostringstream text;
+
+		if(type == PARAMETER_FLOAT4LITERAL)
+		{
+			text << '{' << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << '}';
+		}
+		else
+		{
+			const bool unnumbered = type == PARAMETER_RASTOUT || (type == PARAMETER_ADDR && shaderType == SHADER_VERTEX) ||
+			                        type == PARAMETER_LOOP || type == PARAMETER_PREDICATE || type == PARAMETER_MISCTYPE;
+			text << typeString(shaderType, version);
+			if(!unnumbered) text << index;
+		}
+
+		return text.str();
+	}
+
+	std::string Shader::Parameter::typeString(ShaderType shaderType, unsigned short version) const
+	{   // Returns the register-name prefix for this parameter's type ("r", "v", "c", "oPos", ...); unknown types or indices assert and yield "".
+		switch(type)
+		{
+		case PARAMETER_TEMP:			return "r";
+		case PARAMETER_INPUT:			return "v";
+		case PARAMETER_CONST:			return "c";
+		case PARAMETER_TEXTURE:
+	//	case PARAMETER_ADDR:   (NOTE(review): presumably shares its value with PARAMETER_TEXTURE - confirm)
+			if(shaderType == SHADER_PIXEL)	return "t";
+			else							return "a0";
+		case PARAMETER_RASTOUT:
+			if(index == 0)              return "oPos";
+			else if(index == 1)         return "oFog";
+			else if(index == 2)         return "oPts";
+			else { ASSERT(false); return ""; }   // Unknown index: do not fall through into PARAMETER_ATTROUT
+		case PARAMETER_ATTROUT:			return "oD";
+		case PARAMETER_TEXCRDOUT:
+	//	case PARAMETER_OUTPUT:			return "";
+			if(version < 0x0300)		return "oT";
+			else						return "o";
+		case PARAMETER_CONSTINT:		return "i";
+		case PARAMETER_COLOROUT:		return "oC";
+		case PARAMETER_DEPTHOUT:		return "oDepth";
+		case PARAMETER_SAMPLER:			return "s";
+	//	case PARAMETER_CONST2:			return "";
+	//	case PARAMETER_CONST3:			return "";
+	//	case PARAMETER_CONST4:			return "";
+		case PARAMETER_CONSTBOOL:		return "b";
+		case PARAMETER_LOOP:			return "aL";
+	//	case PARAMETER_TEMPFLOAT16:		return "";
+		case PARAMETER_MISCTYPE:
+			switch(index)
+			{
+			case VPosIndex:				return "vPos";
+			case VFaceIndex:			return "vFace";
+			case InstanceIDIndex:		return "iID";
+			case VertexIDIndex:			return "vID";
+			default: ASSERT(false); return "";   // Unknown index: do not fall through into PARAMETER_LABEL
+			}
+		case PARAMETER_LABEL:			return "l";
+		case PARAMETER_PREDICATE:		return "p0";
+		case PARAMETER_FLOAT4LITERAL:	return "";
+		case PARAMETER_BOOL1LITERAL:	return "";
+		case PARAMETER_INT4LITERAL:		return "";
+	//	case PARAMETER_VOID:			return "";
+		default:
+			ASSERT(false);
+		}
+
+		return "";
+	}
+
+	bool Shader::Instruction::isBranch() const
+	{   // True for the IF and IFC opcodes.
+		return opcode == OPCODE_IF || opcode == OPCODE_IFC;
+	}
+
+	bool Shader::Instruction::isCall() const
+	{   // True for the CALL and CALLNZ opcodes.
+		return opcode == OPCODE_CALL || opcode == OPCODE_CALLNZ;
+	}
+
+	bool Shader::Instruction::isBreak() const
+	{   // True for the BREAK, BREAKC and BREAKP opcodes.
+		return opcode == OPCODE_BREAK || opcode == OPCODE_BREAKC || opcode == OPCODE_BREAKP;
+	}
+
+	bool Shader::Instruction::isLoop() const
+	{   // True for the opcodes that open a loop: LOOP, REP and WHILE.
+		return opcode == OPCODE_LOOP || opcode == OPCODE_REP || opcode == OPCODE_WHILE;
+	}
+
+	bool Shader::Instruction::isEndLoop() const
+	{   // True for the opcodes that close a loop: ENDLOOP, ENDREP and ENDWHILE.
+		return opcode == OPCODE_ENDLOOP || opcode == OPCODE_ENDREP || opcode == OPCODE_ENDWHILE;
+	}
+
+	bool Shader::Instruction::isPredicated() const
+	{   // True when the token's predicate flag is set or any of the analysis* flags marks this instruction.
+		return predicate ||
+		       analysisBranch ||
+		       analysisBreak ||
+		       analysisContinue ||
+		       analysisLeave;
+	}
+
+	Shader::Shader() : serialID(serialCounter++)   // Takes the current serialCounter value as ID and advances the counter
+	{
+		usedSamplers = 0;   // No sampler is marked as used yet
+	}
+
+	Shader::~Shader()
+	{   // Deletes every instruction held in the list and clears its slot.
+		for(size_t i = 0; i < instruction.size(); i++)
+		{
+			delete instruction[i];
+			instruction[i] = nullptr;
+		}
+	}
+
+	void Shader::parse(const unsigned long *token)
+	{
+		// Parses a shader token stream: decodes the version token, asks the
+		// matching validate() for the instruction count, then builds one
+		// Instruction per entry, starting at the version token itself.
+		const unsigned long versionToken = token[0];
+
+		minorVersion = (unsigned char)(versionToken & 0x000000FF);
+		majorVersion = (unsigned char)((versionToken & 0x0000FF00) >> 8);
+		shaderType = (ShaderType)((versionToken & 0xFFFF0000) >> 16);
+
+		int instructionCount = 0;
+
+		switch(shaderType)
+		{
+		case SHADER_VERTEX: instructionCount = VertexShader::validate(token); break;
+		case SHADER_PIXEL:  instructionCount = PixelShader::validate(token);  break;
+		default: ASSERT(false);
+		}
+
+		ASSERT(instructionCount != 0);
+		instruction.resize(instructionCount);
+
+		for(int i = 0; i < instructionCount; i++)
+		{
+			// Skip comment tokens together with their payload
+			while((*token & 0x0000FFFF) == 0x0000FFFE)
+			{
+				token += ((*token & 0x7FFF0000) >> 16) + 1;
+			}
+
+			const int operandTokens = size(*token);
+
+			instruction[i] = new Instruction(token, operandTokens, majorVersion);
+			token += 1 + operandTokens;
+		}
+	}
+
+	int Shader::size(unsigned long opcode) const
+	{   // Convenience overload: forwards to the static size() using this shader's own shaderModel.
+		return size(opcode, shaderModel);
+	}
+
+	int Shader::size(unsigned long opcode, unsigned short shaderModel)
+	{   // Returns how many tokens follow the instruction token 'opcode' for the given shader model.
+		if(shaderModel > 0x0300)
+		{
+			ASSERT(false);
+		}
+
+		static const signed char size[] =   // Token counts for shader models below 2.0, indexed by opcode; -1 marks a reserved opcode
+		{
+			0,   // NOP = 0
+			2,   // MOV
+			3,   // ADD
+			3,   // SUB
+			4,   // MAD
+			3,   // MUL
+			2,   // RCP
+			2,   // RSQ
+			3,   // DP3
+			3,   // DP4
+			3,   // MIN
+			3,   // MAX
+			3,   // SLT
+			3,   // SGE
+			2,   // EXP
+			2,   // LOG
+			2,   // LIT
+			3,   // DST
+			4,   // LRP
+			2,   // FRC
+			3,   // M4x4
+			3,   // M4x3
+			3,   // M3x4
+			3,   // M3x3
+			3,   // M3x2
+			1,   // CALL
+			2,   // CALLNZ
+			2,   // LOOP
+			0,   // RET
+			0,   // ENDLOOP
+			1,   // LABEL
+			2,   // DCL
+			3,   // POW
+			3,   // CRS
+			4,   // SGN
+			2,   // ABS
+			2,   // NRM
+			4,   // SINCOS
+			1,   // REP
+			0,   // ENDREP
+			1,   // IF
+			2,   // IFC
+			0,   // ELSE
+			0,   // ENDIF
+			0,   // BREAK
+			2,   // BREAKC
+			2,   // MOVA
+			2,   // DEFB
+			5,   // DEFI
+			-1,  // 49
+			-1,  // 50
+			-1,  // 51
+			-1,  // 52
+			-1,  // 53
+			-1,  // 54
+			-1,  // 55
+			-1,  // 56
+			-1,  // 57
+			-1,  // 58
+			-1,  // 59
+			-1,  // 60
+			-1,  // 61
+			-1,  // 62
+			-1,  // 63
+			1,   // TEXCOORD = 64
+			1,   // TEXKILL
+			1,   // TEX
+			2,   // TEXBEM
+			2,   // TEXBEML
+			2,   // TEXREG2AR
+			2,   // TEXREG2GB
+			2,   // TEXM3x2PAD
+			2,   // TEXM3x2TEX
+			2,   // TEXM3x3PAD
+			2,   // TEXM3x3TEX
+			-1,  // RESERVED0
+			3,   // TEXM3x3SPEC
+			2,   // TEXM3x3VSPEC
+			2,   // EXPP
+			2,   // LOGP
+			4,   // CND
+			5,   // DEF
+			2,   // TEXREG2RGB
+			2,   // TEXDP3TEX
+			2,   // TEXM3x2DEPTH
+			2,   // TEXDP3
+			2,   // TEXM3x3
+			1,   // TEXDEPTH
+			4,   // CMP
+			3,   // BEM
+			4,   // DP2ADD
+			2,   // DSX
+			2,   // DSY
+			5,   // TEXLDD
+			3,   // SETP
+			3,   // TEXLDL
+			2,   // BREAKP
+			-1,  // 97
+			-1,  // 98
+			-1,  // 99
+			-1,  // 100
+			-1,  // 101
+			-1,  // 102
+			-1,  // 103
+			-1,  // 104
+			-1,  // 105
+			-1,  // 106
+			-1,  // 107
+			-1,  // 108
+			-1,  // 109
+			-1,  // 110
+			-1,  // 111
+			-1,  // 112
+		};
+
+		int length = 0;
+
+		if((opcode & 0x0000FFFF) == OPCODE_COMMENT)
+		{
+			return (opcode & 0x7FFF0000) >> 16;   // A comment token carries its own length in bits 16-30
+		}
+
+		if(opcode != OPCODE_PS_1_0 &&   // Version, phase and end tokens are compared as whole tokens and keep length 0
+		   opcode != OPCODE_PS_1_1 &&
+		   opcode != OPCODE_PS_1_2 &&
+		   opcode != OPCODE_PS_1_3 &&
+		   opcode != OPCODE_PS_1_4 &&
+		   opcode != OPCODE_PS_2_0 &&
+		   opcode != OPCODE_PS_2_x &&
+		   opcode != OPCODE_PS_3_0 &&
+		   opcode != OPCODE_VS_1_0 &&
+		   opcode != OPCODE_VS_1_1 &&
+		   opcode != OPCODE_VS_2_0 &&
+		   opcode != OPCODE_VS_2_x &&
+		   opcode != OPCODE_VS_2_sw &&
+		   opcode != OPCODE_VS_3_0 &&
+		   opcode != OPCODE_VS_3_sw &&
+		   opcode != OPCODE_PHASE &&
+		   opcode != OPCODE_END)
+		{
+			if(shaderModel >= 0x0200)
+			{
+				length = (opcode & 0x0F000000) >> 24;   // From shader model 2.0 on the token stores its own length in bits 24-27
+			}
+			else
+			{
+				length = ((opcode & 0x0000FFFF) < sizeof(size) / sizeof(size[0])) ? size[opcode & 0x0000FFFF] : -1;   // Opcodes beyond the table are treated like reserved ones instead of reading out of bounds
+			}
+		}
+
+		if(length < 0)   // Reserved or unknown opcode
+		{
+			ASSERT(false);
+		}
+
+		if(shaderModel == 0x0104)   // Shader model 1.4: TEX and TEXCOORD take one more token than the table says
+		{
+			switch(opcode & 0x0000FFFF)
+			{
+			case OPCODE_TEX:
+				length += 1;
+				break;
+			case OPCODE_TEXCOORD:
+				length += 1;
+				break;
+			default:
+				break;
+			}
+		}
+
+		return length;
+	}
+
+	bool Shader::maskContainsComponent(int mask, int component)
+	{   // True when bit number 'component' is set in 'mask'.
+		return (mask & (1 << component)) != 0;
+	}
+
+	bool Shader::swizzleContainsComponent(int swizzle, int component)
+	{
+		// True when any of the four 2-bit selectors in 'swizzle' equals 'component'.
+		for(int shift = 0; shift < 8; shift += 2)
+		{
+			if(((swizzle & (0x03 << shift)) >> shift) == component) return true;
+		}
+		return false;
+	}
+
+	bool Shader::swizzleContainsComponentMasked(int swizzle, int component, int mask)
+	{
+		// Like swizzleContainsComponent(), but selector i only counts when bit i of 'mask' is set.
+		for(int i = 0; i < 4; i++)
+		{
+			if((mask & (1 << i)) && ((swizzle & (0x03 << (2 * i))) >> (2 * i)) == component) return true;
+		}
+		return false;
+	}
+
+	bool Shader::containsDynamicBranching() const
+	{   // Accessor for the dynamicBranching flag (set elsewhere; not visible in this part of the file).
+		return dynamicBranching;
+	}
+
+	bool Shader::containsBreakInstruction() const
+	{   // Accessor for the containsBreak flag (set elsewhere; not visible in this part of the file).
+		return containsBreak;
+	}
+
+	bool Shader::containsContinueInstruction() const
+	{   // Accessor for the containsContinue flag (set elsewhere; not visible in this part of the file).
+		return containsContinue;
+	}
+
+	bool Shader::containsLeaveInstruction() const
+	{   // Accessor for the containsLeave flag (set elsewhere; not visible in this part of the file).
+		return containsLeave;
+	}
+
+	// Accessor for the containsDefine flag computed by analyzeDynamicBranching().
+	bool Shader::containsDefineInstruction() const
+	{
+		return this->containsDefine;
+	}
+
+	// Reports whether sampler 'index' has been flagged as used by this shader.
+	// declareSampler() only ever sets bits 0 to 15 of usedSamplers, so any
+	// index outside that range is reported as unused. This also keeps an
+	// out-of-range index away from the shift below, which would otherwise be
+	// undefined behavior for negative or too large shift counts.
+	bool Shader::usesSampler(int index) const
+	{
+		if(index < 0 || index >= 16)
+		{
+			return false;
+		}
+
+		return (usedSamplers & (1 << index)) != 0;
+	}
+
+	// Returns this shader's serial ID.
+	int Shader::getSerialID() const
+	{
+		return this->serialID;
+	}
+
+	// Returns the number of instructions currently stored in the shader.
+	size_t Shader::getLength() const
+	{
+		const size_t instructionCount = instruction.size();
+
+		return instructionCount;
+	}
+
+	// Returns the shader type (pixel, vertex or geometry) stored in shaderType.
+	Shader::ShaderType Shader::getShaderType() const
+	{
+		return this->shaderType;
+	}
+
+	// Returns the 16-bit shader model value (compared against values such as
+	// 0x0104 and 0x0200 elsewhere in this file).
+	unsigned short Shader::getShaderModel() const
+	{
+		return this->shaderModel;
+	}
+
+	// Writes the textual form of every instruction to a file, one per line.
+	// 'fileName' is a printf-style format string; the formatted result (at
+	// most 1023 characters plus terminator) is used as the output path.
+	void Shader::print(const char *fileName, ...) const
+	{
+		const size_t maxPathLength = 1024;
+		char path[maxPathLength + 1];
+
+		va_list arguments;
+		va_start(arguments, fileName);
+		vsnprintf(path, maxPathLength, fileName, arguments);
+		va_end(arguments);
+
+		std::ofstream output(path, std::ofstream::out);
+
+		for(size_t i = 0; i < instruction.size(); i++)
+		{
+			output << instruction[i]->string(shaderType, shaderModel) << std::endl;
+		}
+	}
+
+	// Appends the textual form of instruction 'index' to the file 'fileName'.
+	// The index is not range-checked here.
+	void Shader::printInstruction(int index, const char *fileName) const
+	{
+		const std::ofstream::openmode mode = std::ofstream::out | std::ofstream::app;
+		std::ofstream output(fileName, mode);
+
+		const Instruction *selected = instruction[index];
+		output << selected->string(shaderType, shaderModel) << std::endl;
+	}
+
+	// Adds 'instruction' at the end of the instruction list. The pointer is
+	// stored as-is; removeNull() later deletes instructions it drops.
+	void Shader::append(Instruction *instruction)
+	{
+		std::vector<Instruction*> &list = this->instruction;
+
+		list.push_back(instruction);
+	}
+
+	// Flags sampler 'i' as used. Indices outside [0, 15] are ignored.
+	void Shader::declareSampler(int i)
+	{
+		const bool outOfRange = (i < 0) || (i >= 16);
+
+		if(outOfRange)
+		{
+			return;
+		}
+
+		usedSamplers |= 1 << i;
+	}
+
+	// Returns instruction 'i'. An out-of-range index trips the ASSERT.
+	const Shader::Instruction *Shader::getInstruction(size_t i) const
+	{
+		ASSERT(i < instruction.size());
+
+		const Instruction *requested = instruction[i];
+
+		return requested;
+	}
+
+	// Runs the optimization passes. The first two only turn instructions into
+	// OPCODE_NULL; the last one physically removes those dead instructions.
+	void Shader::optimize()
+	{
+		this->optimizeLeave();
+		this->optimizeCall();
+		this->removeNull();
+	}
+
+	// Turns a leave (early return) into OPCODE_NULL when it is the very last
+	// instruction or is directly followed by a RET, since it has no effect there.
+	void Shader::optimizeLeave()
+	{
+		const size_t count = instruction.size();
+
+		for(size_t index = 0; index < count; index++)
+		{
+			Instruction *current = instruction[index];
+
+			if(current->opcode != OPCODE_LEAVE)
+			{
+				continue;
+			}
+
+			const bool isLast = (index + 1 == count);
+
+			if(isLast || instruction[index + 1]->opcode == OPCODE_RET)
+			{
+				current->opcode = OPCODE_NULL;
+			}
+		}
+	}
+
+	// Removes functions that are never called and simplifies the case where
+	// the shader is just a call to a single function followed by a RET.
+	// Instructions are only marked OPCODE_NULL here; removeNull() drops them.
+	void Shader::optimizeCall()
+	{
+		std::set<int> calledFunctions;
+		bool rescan = false;
+
+		// Removing a function can remove the only call to another function,
+		// so repeat until a pass removes nothing.
+		do
+		{
+			calledFunctions.clear();
+			rescan = false;
+
+			// Collect the labels targeted by the remaining call instructions
+			for(size_t n = 0; n < instruction.size(); n++)
+			{
+				if(instruction[n]->isCall())
+				{
+					calledFunctions.insert(instruction[n]->dst.label);
+				}
+			}
+
+			// Nothing is eliminated when there are no calls at all
+			if(!calledFunctions.empty())
+			{
+				for(size_t i = 0; i < instruction.size(); i++)
+				{
+					if(instruction[i]->opcode != OPCODE_LABEL)
+					{
+						continue;
+					}
+
+					if(calledFunctions.find(instruction[i]->dst.label) != calledFunctions.end())
+					{
+						continue;
+					}
+
+					// Uncalled function: blank everything from its label up to and including its RET
+					for( ; i < instruction.size(); i++)
+					{
+						const Opcode previous = instruction[i]->opcode;
+						instruction[i]->opcode = OPCODE_NULL;
+
+						if(previous == OPCODE_RET)
+						{
+							rescan = true;
+							break;
+						}
+					}
+				}
+			}
+		}
+		while(rescan);
+
+		// Optimize the entry call: when the shader starts with CALL + RET and only
+		// one function is called, drop the call, its RET and all LABEL/RET markers
+		// so the function body runs inline.
+		const bool startsWithCallAndReturn = instruction.size() >= 2 &&
+		                                     instruction[0]->opcode == OPCODE_CALL &&
+		                                     instruction[1]->opcode == OPCODE_RET;
+
+		if(startsWithCallAndReturn && calledFunctions.size() == 1)
+		{
+			instruction[0]->opcode = OPCODE_NULL;
+			instruction[1]->opcode = OPCODE_NULL;
+
+			for(size_t i = 2; i < instruction.size(); i++)
+			{
+				const Opcode current = instruction[i]->opcode;
+
+				if(current == OPCODE_LABEL || current == OPCODE_RET)
+				{
+					instruction[i]->opcode = OPCODE_NULL;
+				}
+			}
+		}
+	}
+
+	// Compacts the instruction list in place: OPCODE_NULL instructions are
+	// deleted, the others keep their relative order.
+	void Shader::removeNull()
+	{
+		size_t kept = 0;
+
+		for(size_t read = 0; read < instruction.size(); read++)
+		{
+			Instruction *candidate = instruction[read];
+
+			if(candidate->opcode == OPCODE_NULL)
+			{
+				delete candidate;
+				continue;
+			}
+
+			instruction[kept] = candidate;
+			kept++;
+		}
+
+		instruction.resize(kept);
+	}
+
+	// Computes, per constant kind (float, int, bool), one more than the highest
+	// register index written by a DEF/DEFI/DEFB instruction (0 when none).
+	void Shader::analyzeDirtyConstants()
+	{
+		dirtyConstantsF = 0;
+		dirtyConstantsI = 0;
+		dirtyConstantsB = 0;
+
+		for(size_t i = 0; i < instruction.size(); i++)
+		{
+			const Instruction *inst = instruction[i];
+			unsigned int *counter = nullptr;
+
+			// Select the counter matching the kind of definition, if any
+			switch(inst->opcode)
+			{
+			case OPCODE_DEF:  counter = &dirtyConstantsF; break;
+			case OPCODE_DEFI: counter = &dirtyConstantsI; break;
+			case OPCODE_DEFB: counter = &dirtyConstantsB; break;
+			default: break;
+			}
+
+			if(counter)
+			{
+				const unsigned int required = inst->dst.index + 1;
+
+				if(required > *counter)
+				{
+					*counter = required;
+				}
+			}
+		}
+	}
+
+	// Analyzes control flow in two passes.
+	//
+	// Pass 1 sets the shader-wide flags: dynamicBranching, containsLeave,
+	// containsBreak, containsContinue and containsDefine.
+	//
+	// Pass 2 conservatively tags each instruction with the execution masks it
+	// can be affected by (analysisBranch/Break/Continue/Leave), and propagates
+	// those tags into the functions called from tagged instructions.
+	void Shader::analyzeDynamicBranching()
+	{
+		dynamicBranching = false;
+		containsLeave = false;
+		containsBreak = false;
+		containsContinue = false;
+		containsDefine = false;
+
+		// Determine global presence of branching instructions
+		for(const auto &inst : instruction)
+		{
+			switch(inst->opcode)
+			{
+			case OPCODE_CALLNZ:
+			case OPCODE_IF:
+			case OPCODE_IFC:
+			case OPCODE_BREAK:
+			case OPCODE_BREAKC:
+			case OPCODE_CMP:
+			case OPCODE_BREAKP:
+			case OPCODE_LEAVE:
+			case OPCODE_CONTINUE:
+				// Only a constant boolean condition is considered static
+				if(inst->src[0].type != PARAMETER_CONSTBOOL)
+				{
+					dynamicBranching = true;
+				}
+
+				if(inst->opcode == OPCODE_LEAVE)
+				{
+					containsLeave = true;
+				}
+
+				if(inst->isBreak())
+				{
+					containsBreak = true;
+				}
+
+				if(inst->opcode == OPCODE_CONTINUE)
+				{
+					containsContinue = true;
+				}
+				// Do not fall through: a branching instruction is not a constant definition
+				break;
+			case OPCODE_DEF:
+			case OPCODE_DEFB:
+			case OPCODE_DEFI:
+				containsDefine = true;
+				break;
+			default:
+				break;
+			}
+		}
+
+		// Conservatively determine which instructions are affected by dynamic branching
+		int branchDepth = 0;
+		int breakDepth = 0;
+		int continueDepth = 0;
+		bool leaveReturn = false;
+		unsigned int functionBegin = 0;
+
+		for(unsigned int i = 0; i < instruction.size(); i++)
+		{
+			// If statements and loops
+			if(instruction[i]->isBranch() || instruction[i]->isLoop())
+			{
+				branchDepth++;
+			}
+			else if(instruction[i]->opcode == OPCODE_ENDIF || instruction[i]->isEndLoop())
+			{
+				branchDepth--;
+			}
+
+			if(branchDepth > 0)
+			{
+				instruction[i]->analysisBranch = true;
+
+				if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_BRANCH);
+				}
+			}
+
+			// Break statement
+			if(instruction[i]->isBreak())
+			{
+				breakDepth++;
+			}
+
+			if(breakDepth > 0)
+			{
+				if(instruction[i]->isLoop() || instruction[i]->opcode == OPCODE_SWITCH)   // Nested loop or switch, don't make the end of it disable the break execution mask
+				{
+					breakDepth++;
+				}
+				else if(instruction[i]->isEndLoop() || instruction[i]->opcode == OPCODE_ENDSWITCH)
+				{
+					breakDepth--;
+				}
+
+				instruction[i]->analysisBreak = true;
+
+				if(instruction[i]->isCall())
+				{
+					// The callee gets the same flag as the calling instruction,
+					// matching the continue and leave cases below
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_BREAK);
+				}
+			}
+
+			// Continue statement
+			if(instruction[i]->opcode == OPCODE_CONTINUE)
+			{
+				continueDepth++;
+			}
+
+			if(continueDepth > 0)
+			{
+				if(instruction[i]->isLoop() || instruction[i]->opcode == OPCODE_SWITCH)   // Nested loop or switch, don't make the end of it disable the continue execution mask
+				{
+					continueDepth++;
+				}
+				else if(instruction[i]->isEndLoop() || instruction[i]->opcode == OPCODE_ENDSWITCH)
+				{
+					continueDepth--;
+				}
+
+				instruction[i]->analysisContinue = true;
+
+				if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_CONTINUE);
+				}
+			}
+
+			// Return (leave) statement
+			if(instruction[i]->opcode == OPCODE_LEAVE)
+			{
+				leaveReturn = true;
+
+				// Mark loop body instructions prior to the return statement.
+				// Only the first loop found since the start of the function is
+				// used; everything from after it up to the leave gets tagged.
+				for(unsigned int l = functionBegin; l < i; l++)
+				{
+					if(instruction[l]->isLoop())
+					{
+						for(unsigned int r = l + 1; r < i; r++)
+						{
+							instruction[r]->analysisLeave = true;
+						}
+
+						break;
+					}
+				}
+			}
+			else if(instruction[i]->opcode == OPCODE_RET)   // End of the function
+			{
+				leaveReturn = false;
+			}
+			else if(instruction[i]->opcode == OPCODE_LABEL)
+			{
+				functionBegin = i;
+			}
+
+			if(leaveReturn)
+			{
+				instruction[i]->analysisLeave = true;
+
+				if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_LEAVE);
+				}
+			}
+		}
+	}
+
+	// ORs 'flag' into the analysis bits of every instruction between the LABEL
+	// of function 'functionLabel' and its RET (both exclusive), recursing into
+	// any function called from that body.
+	void Shader::markFunctionAnalysis(unsigned int functionLabel, Analysis flag)
+	{
+		bool insideFunction = false;
+
+		for(size_t i = 0; i < instruction.size(); i++)
+		{
+			Instruction *inst = instruction[i];
+
+			if(!insideFunction)
+			{
+				// Still searching for the label that starts the function
+				if(inst->opcode == OPCODE_LABEL && inst->dst.label == functionLabel)
+				{
+					insideFunction = true;
+				}
+
+				continue;
+			}
+
+			if(inst->opcode == OPCODE_RET)
+			{
+				break;
+			}
+
+			if(inst->isCall())
+			{
+				markFunctionAnalysis(inst->dst.label, flag);
+			}
+
+			inst->analysis |= flag;
+		}
+	}
+
+	// Flags every sampler referenced by a texture sampling instruction as used.
+	// For shader major version 2 and up the sampler is the second source
+	// operand (when it is of sampler type); for older versions the
+	// destination register index is used as the sampler index.
+	//
+	// The flag is set through declareSampler() so that an out-of-range register
+	// index is ignored instead of being used as a shift count (which is
+	// undefined behavior for counts of 32 or more).
+	void Shader::analyzeSamplers()
+	{
+		for(const auto &inst : instruction)
+		{
+			switch(inst->opcode)
+			{
+			case OPCODE_TEX:
+			case OPCODE_TEXBEM:
+			case OPCODE_TEXBEML:
+			case OPCODE_TEXREG2AR:
+			case OPCODE_TEXREG2GB:
+			case OPCODE_TEXM3X2TEX:
+			case OPCODE_TEXM3X3TEX:
+			case OPCODE_TEXM3X3SPEC:
+			case OPCODE_TEXM3X3VSPEC:
+			case OPCODE_TEXREG2RGB:
+			case OPCODE_TEXDP3TEX:
+			case OPCODE_TEXM3X2DEPTH:
+			case OPCODE_TEXLDD:
+			case OPCODE_TEXLDL:
+			case OPCODE_TEXLOD:
+			case OPCODE_TEXOFFSET:
+			case OPCODE_TEXOFFSETBIAS:
+			case OPCODE_TEXLODOFFSET:
+			case OPCODE_TEXELFETCH:
+			case OPCODE_TEXELFETCHOFFSET:
+			case OPCODE_TEXGRAD:
+			case OPCODE_TEXGRADOFFSET:
+				{
+					const Parameter &dst = inst->dst;
+					const Parameter &src1 = inst->src[1];
+
+					if(majorVersion >= 2)
+					{
+						if(src1.type == PARAMETER_SAMPLER)
+						{
+							declareSampler(src1.index);
+						}
+					}
+					else
+					{
+						declareSampler(dst.index);
+					}
+				}
+				break;
+			default:
+				break;
+			}
+		}
+	}
+
+	// Assigns a unique index to each call instruction, on a per label basis.
+	// This is used to know what basic block to return to.
+	//
+	// Call sites are counted in a fixed table of 2048 labels. A call whose
+	// label does not fit in the table asserts and is left without a call site
+	// index, rather than reading and writing outside of the table.
+	void Shader::analyzeCallSites()
+	{
+		const unsigned int maxLabels = 2048;
+		int callSiteIndex[maxLabels] = {0};   // Number of call sites seen so far, per label
+
+		for(auto &inst : instruction)
+		{
+			if(inst->opcode == OPCODE_CALL || inst->opcode == OPCODE_CALLNZ)
+			{
+				const unsigned int label = inst->dst.label;
+
+				if(label >= maxLabels)
+				{
+					ASSERT(false);
+					continue;
+				}
+
+				inst->dst.callSite = callSiteIndex[label]++;
+			}
+		}
+	}
+
+	// Determines whether temporaries, inputs or outputs are ever accessed with
+	// relative (indirect) addressing, i.e. through an operand whose 'rel' part
+	// is not PARAMETER_VOID.
+	//
+	// Every source operand of an instruction is inspected, not just the first
+	// three, so relative addressing on any operand is detected.
+	void Shader::analyzeIndirectAddressing()
+	{
+		indirectAddressableTemporaries = false;
+		indirectAddressableInput = false;
+		indirectAddressableOutput = false;
+
+		// Records that the register file of an operand is indirectly addressed
+		auto markIndirect = [this](const Parameter &operand)
+		{
+			if(operand.rel.type == PARAMETER_VOID)
+			{
+				return;   // Not relatively addressed
+			}
+
+			switch(operand.type)
+			{
+			case PARAMETER_TEMP:   indirectAddressableTemporaries = true; break;
+			case PARAMETER_INPUT:  indirectAddressableInput = true;       break;
+			case PARAMETER_OUTPUT: indirectAddressableOutput = true;      break;
+			default: break;
+			}
+		};
+
+		for(const auto &inst : instruction)
+		{
+			markIndirect(inst->dst);
+
+			for(const SourceParameter &source : inst->src)
+			{
+				markIndirect(source);
+			}
+		}
+	}
+}
diff --git a/src/Pipeline/Shader.hpp b/src/Pipeline/Shader.hpp
new file mode 100644
index 0000000..9e4a810
--- /dev/null
+++ b/src/Pipeline/Shader.hpp
@@ -0,0 +1,662 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Shader_hpp
+#define sw_Shader_hpp
+
+#include "Common/Types.hpp"
+
+#include <string>
+#include <vector>
+
+namespace sw
+{
+	class Shader
+	{
+	public:
+		enum ShaderType   // Kind of shader program
+		{
+			SHADER_PIXEL = 0xFFFF,   // Same as the upper 16 bits of the OPCODE_PS_* version tokens
+			SHADER_VERTEX = 0xFFFE,   // Same as the upper 16 bits of the OPCODE_VS_* version tokens
+			SHADER_GEOMETRY = 0xFFFD
+		};
+
+		enum Opcode   // Instruction opcodes, version tokens and internal extensions
+		{
+			// Matches order in d3d9types.h
+			OPCODE_NOP = 0,
+			OPCODE_MOV,
+			OPCODE_ADD,
+			OPCODE_SUB,
+			OPCODE_MAD,
+			OPCODE_MUL,
+			OPCODE_RCPX,
+			OPCODE_RSQX,
+			OPCODE_DP3,
+			OPCODE_DP4,
+			OPCODE_MIN,
+			OPCODE_MAX,
+			OPCODE_SLT,
+			OPCODE_SGE,
+			OPCODE_EXP2X,   // D3DSIO_EXP
+			OPCODE_LOG2X,   // D3DSIO_LOG
+			OPCODE_LIT,
+			OPCODE_ATT,   // D3DSIO_DST
+			OPCODE_LRP,
+			OPCODE_FRC,
+			OPCODE_M4X4,
+			OPCODE_M4X3,
+			OPCODE_M3X4,
+			OPCODE_M3X3,
+			OPCODE_M3X2,
+			OPCODE_CALL,
+			OPCODE_CALLNZ,
+			OPCODE_LOOP,
+			OPCODE_RET,
+			OPCODE_ENDLOOP,
+			OPCODE_LABEL,
+			OPCODE_DCL,
+			OPCODE_POWX,
+			OPCODE_CRS,
+			OPCODE_SGN,
+			OPCODE_ABS,
+			OPCODE_NRM3,   // D3DSIO_NRM
+			OPCODE_SINCOS,
+			OPCODE_REP,
+			OPCODE_ENDREP,
+			OPCODE_IF,
+			OPCODE_IFC,
+			OPCODE_ELSE,
+			OPCODE_ENDIF,
+			OPCODE_BREAK,
+			OPCODE_BREAKC,
+			OPCODE_MOVA,
+			OPCODE_DEFB,
+			OPCODE_DEFI,
+
+			OPCODE_TEXCOORD = 64,   // Second numbered range starts at 64
+			OPCODE_TEXKILL,
+			OPCODE_TEX,
+			OPCODE_TEXBEM,
+			OPCODE_TEXBEML,
+			OPCODE_TEXREG2AR,
+			OPCODE_TEXREG2GB,
+			OPCODE_TEXM3X2PAD,
+			OPCODE_TEXM3X2TEX,
+			OPCODE_TEXM3X3PAD,
+			OPCODE_TEXM3X3TEX,
+			OPCODE_RESERVED0,
+			OPCODE_TEXM3X3SPEC,
+			OPCODE_TEXM3X3VSPEC,
+			OPCODE_EXPP,
+			OPCODE_LOGP,
+			OPCODE_CND,
+			OPCODE_DEF,
+			OPCODE_TEXREG2RGB,
+			OPCODE_TEXDP3TEX,
+			OPCODE_TEXM3X2DEPTH,
+			OPCODE_TEXDP3,
+			OPCODE_TEXM3X3,
+			OPCODE_TEXDEPTH,
+			OPCODE_CMP0,   // D3DSIO_CMP
+			OPCODE_BEM,
+			OPCODE_DP2ADD,
+			OPCODE_DFDX,   // D3DSIO_DSX
+			OPCODE_DFDY,   // D3DSIO_DSY
+			OPCODE_TEXLDD,
+			OPCODE_CMP,   // D3DSIO_SETP
+			OPCODE_TEXLDL,
+			OPCODE_BREAKP,
+
+			OPCODE_PHASE = 0xFFFD,
+			OPCODE_COMMENT = 0xFFFE,
+			OPCODE_END = 0xFFFF,
+
+			OPCODE_PS_1_0 = 0xFFFF0100,   // Pixel shader version tokens: 0xFFFF in the upper 16 bits, version in the lower 16
+			OPCODE_PS_1_1 = 0xFFFF0101,
+			OPCODE_PS_1_2 = 0xFFFF0102,
+			OPCODE_PS_1_3 = 0xFFFF0103,
+			OPCODE_PS_1_4 = 0xFFFF0104,
+			OPCODE_PS_2_0 = 0xFFFF0200,
+			OPCODE_PS_2_x = 0xFFFF0201,
+			OPCODE_PS_3_0 = 0xFFFF0300,
+
+			OPCODE_VS_1_0 = 0xFFFE0100,   // Vertex shader version tokens: 0xFFFE in the upper 16 bits, version in the lower 16
+			OPCODE_VS_1_1 = 0xFFFE0101,
+			OPCODE_VS_2_0 = 0xFFFE0200,
+			OPCODE_VS_2_x = 0xFFFE0201,
+			OPCODE_VS_2_sw = 0xFFFE02FF,
+			OPCODE_VS_3_0 = 0xFFFE0300,
+			OPCODE_VS_3_sw = 0xFFFE03FF,
+
+			OPCODE_NULL = 0x10000000,   // Dead instruction, to be eliminated
+			OPCODE_WHILE,
+			OPCODE_ENDWHILE,
+			OPCODE_COS,
+			OPCODE_SIN,
+			OPCODE_TAN,
+			OPCODE_ACOS,
+			OPCODE_ASIN,
+			OPCODE_ATAN,
+			OPCODE_ATAN2,
+			OPCODE_COSH,
+			OPCODE_SINH,
+			OPCODE_TANH,
+			OPCODE_ACOSH,
+			OPCODE_ASINH,
+			OPCODE_ATANH,
+			OPCODE_DP1,
+			OPCODE_DP2,
+			OPCODE_TRUNC,
+			OPCODE_FLOOR,
+			OPCODE_ROUND,
+			OPCODE_ROUNDEVEN,
+			OPCODE_CEIL,
+			OPCODE_SQRT,
+			OPCODE_RSQ,
+			OPCODE_LEN2,
+			OPCODE_LEN3,
+			OPCODE_LEN4,
+			OPCODE_DIST1,
+			OPCODE_DIST2,
+			OPCODE_DIST3,
+			OPCODE_DIST4,
+			OPCODE_NRM2,
+			OPCODE_NRM4,
+			OPCODE_DIV,
+			OPCODE_MOD,
+			OPCODE_EXP2,
+			OPCODE_LOG2,
+			OPCODE_EXP,
+			OPCODE_LOG,
+			OPCODE_POW,
+			OPCODE_F2B,   // Float to bool
+			OPCODE_B2F,   // Bool to float
+			OPCODE_F2I,   // Float to int
+			OPCODE_I2F,   // Int to float
+			OPCODE_F2U,   // Float to uint
+			OPCODE_U2F,   // Uint to float
+			OPCODE_I2B,   // Int to bool
+			OPCODE_B2I,   // Bool to int
+			OPCODE_DET2,
+			OPCODE_DET3,
+			OPCODE_DET4,
+			OPCODE_ALL,
+			OPCODE_ANY,
+			OPCODE_NEG,
+			OPCODE_NOT,
+			OPCODE_OR,
+			OPCODE_XOR,
+			OPCODE_AND,
+			OPCODE_EQ,
+			OPCODE_NE,
+			OPCODE_STEP,
+			OPCODE_SMOOTH,
+			OPCODE_ISNAN,
+			OPCODE_ISINF,
+			OPCODE_TEXOFFSET,
+			OPCODE_TEXLODOFFSET,
+			OPCODE_TEXELFETCH,
+			OPCODE_TEXELFETCHOFFSET,
+			OPCODE_TEXGRAD,
+			OPCODE_TEXGRADOFFSET,
+			OPCODE_TEXBIAS,
+			OPCODE_TEXLOD,
+			OPCODE_TEXOFFSETBIAS,
+			OPCODE_TEXSIZE,
+			OPCODE_FLOATBITSTOINT,
+			OPCODE_FLOATBITSTOUINT,
+			OPCODE_INTBITSTOFLOAT,
+			OPCODE_UINTBITSTOFLOAT,
+			OPCODE_PACKSNORM2x16,
+			OPCODE_PACKUNORM2x16,
+			OPCODE_PACKHALF2x16,
+			OPCODE_UNPACKSNORM2x16,
+			OPCODE_UNPACKUNORM2x16,
+			OPCODE_UNPACKHALF2x16,
+			OPCODE_FORWARD1,
+			OPCODE_FORWARD2,
+			OPCODE_FORWARD3,
+			OPCODE_FORWARD4,
+			OPCODE_REFLECT1,
+			OPCODE_REFLECT2,
+			OPCODE_REFLECT3,
+			OPCODE_REFLECT4,
+			OPCODE_REFRACT1,
+			OPCODE_REFRACT2,
+			OPCODE_REFRACT3,
+			OPCODE_REFRACT4,
+			OPCODE_ICMP,
+			OPCODE_UCMP,
+			OPCODE_SELECT,
+			OPCODE_EXTRACT,
+			OPCODE_INSERT,
+			OPCODE_DISCARD,
+			OPCODE_FWIDTH,
+			OPCODE_LEAVE,   // Return before the end of the function
+			OPCODE_CONTINUE,
+			OPCODE_TEST,   // Marks the end of the code that can be skipped by 'continue'
+			OPCODE_SWITCH,
+			OPCODE_ENDSWITCH,
+
+			// Integer opcodes
+			OPCODE_INEG,
+			OPCODE_IABS,
+			OPCODE_ISGN,
+			OPCODE_IADD,
+			OPCODE_ISUB,
+			OPCODE_IMUL,
+			OPCODE_IDIV,
+			OPCODE_IMAD,
+			OPCODE_IMOD,
+			OPCODE_SHL,
+			OPCODE_ISHR,
+			OPCODE_IMIN,
+			OPCODE_IMAX,
+
+			// Unsigned integer opcodes
+			OPCODE_UDIV,
+			OPCODE_UMOD,
+			OPCODE_USHR,
+			OPCODE_UMIN,
+			OPCODE_UMAX,
+		};
+
+		static Opcode OPCODE_DP(int);   // NOTE(review): presumably maps a component count to OPCODE_DP1..OPCODE_DP4; definition not visible here - confirm
+		static Opcode OPCODE_LEN(int);   // Presumably OPCODE_LEN2..OPCODE_LEN4 by component count - confirm
+		static Opcode OPCODE_DIST(int);   // Presumably OPCODE_DIST1..OPCODE_DIST4 by component count - confirm
+		static Opcode OPCODE_NRM(int);   // Presumably OPCODE_NRM2..OPCODE_NRM4 by component count - confirm
+		static Opcode OPCODE_FORWARD(int);   // Presumably OPCODE_FORWARD1..OPCODE_FORWARD4 by component count - confirm
+		static Opcode OPCODE_REFLECT(int);   // Presumably OPCODE_REFLECT1..OPCODE_REFLECT4 by component count - confirm
+		static Opcode OPCODE_REFRACT(int);   // Presumably OPCODE_REFRACT1..OPCODE_REFRACT4 by component count - confirm
+
+		enum Control   // Comparison selector stored in Instruction::control
+		{
+			CONTROL_RESERVED0,
+			CONTROL_GT,
+			CONTROL_EQ,
+			CONTROL_GE,
+			CONTROL_LT,
+			CONTROL_NE,
+			CONTROL_LE,
+			CONTROL_RESERVED1
+		};
+
+		enum SamplerType   // Texture dimensionality stored in Instruction::samplerType
+		{
+			SAMPLER_UNKNOWN,
+			SAMPLER_1D,
+			SAMPLER_2D,
+			SAMPLER_CUBE,
+			SAMPLER_VOLUME
+		};
+
+		enum Usage   // For vertex input/output declarations
+		{
+			USAGE_POSITION = 0,
+			USAGE_BLENDWEIGHT = 1,
+			USAGE_BLENDINDICES = 2,
+			USAGE_NORMAL = 3,
+			USAGE_PSIZE = 4,
+			USAGE_TEXCOORD = 5,
+			USAGE_TANGENT = 6,
+			USAGE_BINORMAL = 7,
+			USAGE_TESSFACTOR = 8,
+			USAGE_POSITIONT = 9,
+			USAGE_COLOR = 10,
+			USAGE_FOG = 11,
+			USAGE_DEPTH = 12,
+			USAGE_SAMPLE = 13   // Last explicitly numbered usage
+		};
+
+		enum ParameterType   // Register file or literal kind of an operand
+		{
+			PARAMETER_TEMP = 0,
+			PARAMETER_INPUT = 1,
+			PARAMETER_CONST = 2,
+			PARAMETER_TEXTURE = 3,
+			PARAMETER_ADDR = 3,   // Shares its value with PARAMETER_TEXTURE
+			PARAMETER_RASTOUT = 4,
+			PARAMETER_ATTROUT = 5,
+			PARAMETER_TEXCRDOUT = 6,
+			PARAMETER_OUTPUT = 6,   // Shares its value with PARAMETER_TEXCRDOUT
+			PARAMETER_CONSTINT = 7,
+			PARAMETER_COLOROUT = 8,
+			PARAMETER_DEPTHOUT = 9,
+			PARAMETER_SAMPLER = 10,
+			PARAMETER_CONST2 = 11,
+			PARAMETER_CONST3 = 12,
+			PARAMETER_CONST4 = 13,
+			PARAMETER_CONSTBOOL = 14,
+			PARAMETER_LOOP = 15,
+			PARAMETER_TEMPFLOAT16 = 16,
+			PARAMETER_MISCTYPE = 17,
+			PARAMETER_LABEL = 18,
+			PARAMETER_PREDICATE = 19,
+
+		//	PARAMETER_FLOAT1LITERAL,
+		//	PARAMETER_FLOAT2LITERAL,
+		//	PARAMETER_FLOAT3LITERAL,
+			PARAMETER_FLOAT4LITERAL,   // First value without an explicit number; continues after PARAMETER_PREDICATE
+			PARAMETER_BOOL1LITERAL,
+		//	PARAMETER_BOOL2LITERAL,
+		//	PARAMETER_BOOL3LITERAL,
+		//	PARAMETER_BOOL4LITERAL,
+		//	PARAMETER_INT1LITERAL,
+		//	PARAMETER_INT2LITERAL,
+		//	PARAMETER_INT3LITERAL,
+			PARAMETER_INT4LITERAL,
+
+			PARAMETER_VOID   // Default type of a Parameter; also marks an unused 'rel' part
+		};
+
+		enum MiscParameterIndex   // NOTE(review): presumably register indices for PARAMETER_MISCTYPE operands - confirm
+		{
+			VPosIndex = 0,
+			VFaceIndex = 1,
+			InstanceIDIndex = 2,
+			VertexIDIndex = 3,
+		};
+
+		enum Modifier   // Source operand modifier stored in SourceParameter::modifier
+		{
+			MODIFIER_NONE,   // Default for a SourceParameter
+			MODIFIER_NEGATE,
+			MODIFIER_BIAS,
+			MODIFIER_BIAS_NEGATE,
+			MODIFIER_SIGN,
+			MODIFIER_SIGN_NEGATE,
+			MODIFIER_COMPLEMENT,
+			MODIFIER_X2,
+			MODIFIER_X2_NEGATE,
+			MODIFIER_DZ,
+			MODIFIER_DW,
+			MODIFIER_ABS,
+			MODIFIER_ABS_NEGATE,
+			MODIFIER_NOT
+		};
+
+		enum Analysis   // Bit values ORed into Instruction::analysis
+		{
+			// Flags indicating whether an instruction is affected by an execution enable mask
+			ANALYSIS_BRANCH   = 0x00000001,   // Pairs with Instruction::analysisBranch
+			ANALYSIS_BREAK    = 0x00000002,   // Pairs with Instruction::analysisBreak
+			ANALYSIS_CONTINUE = 0x00000004,   // Pairs with Instruction::analysisContinue
+			ANALYSIS_LEAVE    = 0x00000008,   // Pairs with Instruction::analysisLeave
+		};
+
+		struct Relative   // Relative (indirect) addressing part of an operand
+		{
+			ParameterType type : 8;   // PARAMETER_VOID when the operand is not relatively addressed
+			unsigned int index;
+			unsigned int swizzle : 8;
+			unsigned int scale;   // Initialized to 1 by Parameter()
+			bool dynamic;   // Varies between concurrent shader instances
+		};
+
+		struct Parameter   // Common part of destination and source operands
+		{
+			union   // Payload; which member is meaningful depends on the kind of operand
+			{
+				struct
+				{
+					unsigned int index;   // For registers types
+
+					Relative rel;
+				};
+
+				float value[4];       // For float constants
+				int integer[4];       // For integer constants
+				int boolean[4];       // For boolean constants
+
+				struct
+				{
+					unsigned int label;      // Label index
+					unsigned int callSite;   // Call index (per label)
+				};
+			};
+
+			Parameter() : index(0), type(PARAMETER_VOID)   // Starts as a void operand without relative addressing
+			{
+				rel.type = PARAMETER_VOID;
+				rel.index = 0;
+				rel.swizzle = 0;
+				rel.scale = 1;
+				rel.dynamic = true;
+			}
+
+			std::string string(ShaderType shaderType, unsigned short version) const;
+			std::string typeString(ShaderType shaderType, unsigned short version) const;
+			std::string relativeString() const;
+
+			ParameterType type : 8;
+		};
+
+		struct DestinationParameter : Parameter   // Destination operand: Parameter plus write mask and result modifiers
+		{
+			union   // Write mask, accessible as a whole or per component
+			{
+				unsigned char mask;
+
+				struct
+				{
+					bool x : 1;
+					bool y : 1;
+					bool z : 1;
+					bool w : 1;
+				};
+			};
+
+			DestinationParameter() : mask(0xF), saturate(false), partialPrecision(false), centroid(false), shift(0)   // Mask 0xF sets all four low bits
+			{
+			}
+
+			std::string modifierString() const;
+			std::string shiftString() const;
+			std::string maskString() const;
+
+			bool saturate         : 1;
+			bool partialPrecision : 1;
+			bool centroid         : 1;
+			signed char shift     : 4;
+		};
+
+		struct SourceParameter : Parameter   // Source operand: Parameter plus swizzle, modifier and buffer index
+		{
+			SourceParameter() : swizzle(0xE4), modifier(MODIFIER_NONE), bufferIndex(-1)   // 0xE4 packs the 2-bit selectors 0, 1, 2, 3 for lanes 0 to 3
+			{
+			}
+
+			std::string string(ShaderType shaderType, unsigned short version) const;
+			std::string swizzleString() const;
+			std::string preModifierString() const;
+			std::string postModifierString() const;
+
+			unsigned int swizzle : 8;   // Four 2-bit component selectors, lane 0 in the lowest bits
+			Modifier modifier : 8;
+			int bufferIndex : 8;   // Defaults to -1
+		};
+
+		struct Instruction   // One shader instruction: opcode, operands and analysis results
+		{
+			explicit Instruction(Opcode opcode);
+			Instruction(const unsigned long *token, int size, unsigned char majorVersion);
+
+			virtual ~Instruction();
+
+			void parseOperationToken(unsigned long token, unsigned char majorVersion);
+			void parseDeclarationToken(unsigned long token);
+			void parseDestinationToken(const unsigned long *token, unsigned char majorVersion);
+			void parseSourceToken(int i, const unsigned long *token, unsigned char majorVersion);
+
+			std::string string(ShaderType shaderType, unsigned short version) const;
+			static std::string swizzleString(ParameterType type, unsigned char swizzle);
+			std::string operationString(unsigned short version) const;
+			std::string controlString() const;
+
+			bool isBranch() const;
+			bool isCall() const;
+			bool isBreak() const;
+			bool isLoop() const;
+			bool isEndLoop() const;
+
+			bool isPredicated() const;
+
+			Opcode opcode;   // Set to OPCODE_NULL by the optimization passes to mark the instruction dead
+
+			union   // Opcode specific controls sharing the same storage
+			{
+				Control control;
+
+				struct
+				{
+					unsigned char project : 1;   // D3DSI_TEXLD_PROJECT
+					unsigned char bias : 1;      // D3DSI_TEXLD_BIAS
+				};
+			};
+
+			bool predicate;
+			bool predicateNot;   // Negative predicate
+			unsigned char predicateSwizzle;
+
+			bool coissue;
+			SamplerType samplerType;
+			Usage usage;
+			unsigned char usageIndex;
+
+			DestinationParameter dst;
+			SourceParameter src[5];   // Up to five source operands
+
+			union   // Analysis flags, accessible as a whole or per flag
+			{
+				unsigned int analysis;
+
+				struct
+				{
+					// Keep in sync with Shader::Analysis flags
+					unsigned int analysisBranch : 1;
+					unsigned int analysisBreak : 1;
+					unsigned int analysisContinue : 1;
+					unsigned int analysisLeave : 1;
+				};
+			};
+		};
+
+		Shader();
+
+		virtual ~Shader();
+
+		int getSerialID() const;
+		size_t getLength() const;   // Number of instructions
+		ShaderType getShaderType() const;
+		unsigned short getShaderModel() const;
+
+		void append(Instruction *instruction);   // Adds the instruction at the end of the list
+		void declareSampler(int i);   // Flags sampler i as used; indices outside 0..15 are ignored
+
+		const Instruction *getInstruction(size_t i) const;   // Asserts that i is within range
+		int size(unsigned long opcode) const;
+		static int size(unsigned long opcode, unsigned short shaderModel);
+
+		void print(const char *fileName, ...) const;   // fileName is a printf-style format for the output path
+		void printInstruction(int index, const char *fileName) const;   // Appends one instruction to the file
+
+		static bool maskContainsComponent(int mask, int component);
+		static bool swizzleContainsComponent(int swizzle, int component);
+		static bool swizzleContainsComponentMasked(int swizzle, int component, int mask);
+
+		bool containsDynamicBranching() const;
+		bool containsBreakInstruction() const;
+		bool containsContinueInstruction() const;
+		bool containsLeaveInstruction() const;
+		bool containsDefineInstruction() const;
+		bool usesSampler(int i) const;
+
+		// Pairs a usage with a usage index, together with the centroid and flat
+		// flags. A usage of 0xFF (the default) means the semantic is not active.
+		struct Semantic
+		{
+			Semantic(unsigned char usage = 0xFF, unsigned char index = 0xFF, bool flat = false)
+				: usage(usage), index(index), centroid(false), flat(flat)
+			{
+			}
+
+			// Only usage and index take part in the comparison; centroid and flat are ignored.
+			bool operator==(const Semantic &semantic) const
+			{
+				if(usage != semantic.usage)
+				{
+					return false;
+				}
+
+				return index == semantic.index;
+			}
+
+			// True unless the usage still has the 0xFF "unused" value.
+			bool active() const
+			{
+				return !(usage == 0xFF);
+			}
+
+			unsigned char usage;
+			unsigned char index;
+			bool centroid;
+			bool flat;
+		};
+
+		void optimize();   // Runs optimizeLeave(), optimizeCall() and removeNull()
+
+		// FIXME: Private
+		unsigned int dirtyConstantsF;   // Highest float constant index written by DEF, plus one
+		unsigned int dirtyConstantsI;   // Highest integer constant index written by DEFI, plus one
+		unsigned int dirtyConstantsB;   // Highest boolean constant index written by DEFB, plus one
+
+		bool indirectAddressableTemporaries;   // Set by analyzeIndirectAddressing()
+		bool indirectAddressableInput;   // Set by analyzeIndirectAddressing()
+		bool indirectAddressableOutput;   // Set by analyzeIndirectAddressing()
+
+	protected:
+		void parse(const unsigned long *token);
+
+		void optimizeLeave();
+		void optimizeCall();
+		void removeNull();
+
+		void analyzeDirtyConstants();
+		void analyzeDynamicBranching();
+		void analyzeSamplers();
+		void analyzeCallSites();
+		void analyzeIndirectAddressing();
+		void markFunctionAnalysis(unsigned int functionLabel, Analysis flag);
+
+		ShaderType shaderType;
+
+		union   // Shader model as one 16-bit value or as separate version bytes
+		{
+			unsigned short shaderModel;
+
+			struct
+			{
+				unsigned char minorVersion;   // NOTE(review): lines up with the low byte of shaderModel only on little-endian targets - confirm
+				unsigned char majorVersion;
+			};
+		};
+
+		std::vector<Instruction*> instruction;   // Raw pointers; removeNull() deletes the instructions it drops
+
+		unsigned short usedSamplers;   // Bit flags
+
+	private:
+		const int serialID;
+		static volatile int serialCounter;
+
+		bool dynamicBranching;   // Set by analyzeDynamicBranching()
+		bool containsBreak;   // Set by analyzeDynamicBranching()
+		bool containsContinue;   // Set by analyzeDynamicBranching()
+		bool containsLeave;   // Set by analyzeDynamicBranching()
+		bool containsDefine;   // Set by analyzeDynamicBranching()
+	};
+}
+
+#endif   // sw_Shader_hpp
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
new file mode 100644
index 0000000..4ea3260
--- /dev/null
+++ b/src/Pipeline/ShaderCore.cpp
@@ -0,0 +1,2006 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ShaderCore.hpp"
+
+#include "Renderer/Renderer.hpp"
+#include "Common/Debug.hpp"
+
+#include <limits.h>
+
+namespace sw
+{
+	extern TranscendentalPrecision logPrecision;
+	extern TranscendentalPrecision expPrecision;
+	extern TranscendentalPrecision rcpPrecision;
+	extern TranscendentalPrecision rsqPrecision;
+
+	Vector4s::Vector4s()   // Default constructor: x, y, z and w are default-constructed
+	{
+	}
+
+	// Builds the vector from four scalars; each member is assigned a Short4
+	// constructed from the matching argument.
+	Vector4s::Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+	{
+		// The members are qualified because the parameters shadow them
+		Vector4s::x = Short4(x);
+		Vector4s::y = Short4(y);
+		Vector4s::z = Short4(z);
+		Vector4s::w = Short4(w);
+	}
+
+	// Copy constructor: assigns the four components of 'rhs' member by member.
+	Vector4s::Vector4s(const Vector4s &rhs)
+	{
+		this->x = rhs.x;
+		this->y = rhs.y;
+		this->z = rhs.z;
+		this->w = rhs.w;
+	}
+
+	// Copy assignment: assigns the four components of 'rhs' and returns *this.
+	Vector4s &Vector4s::operator=(const Vector4s &rhs)
+	{
+		this->x = rhs.x;
+		this->y = rhs.y;
+		this->z = rhs.z;
+		this->w = rhs.w;
+
+		Vector4s &self = *this;
+		return self;
+	}
+
+	// Component access by index: 0 to 3 select x, y, z and w. Any other index
+	// falls back to x.
+	Short4 &Vector4s::operator[](int i)
+	{
+		if(i == 1) return y;
+		if(i == 2) return z;
+		if(i == 3) return w;
+
+		// Index 0 and every out-of-range index
+		return x;
+	}
+
+	Vector4f::Vector4f()   // Default constructor: x, y, z and w are default-constructed
+	{
+	}
+
+	// Builds the vector from four scalars; each member is assigned a Float4
+	// constructed from the matching argument.
+	Vector4f::Vector4f(float x, float y, float z, float w)
+	{
+		// The members are qualified because the parameters shadow them
+		Vector4f::x = Float4(x);
+		Vector4f::y = Float4(y);
+		Vector4f::z = Float4(z);
+		Vector4f::w = Float4(w);
+	}
+
+	// Copy constructor: assigns the four components of 'rhs' member by member.
+	Vector4f::Vector4f(const Vector4f &rhs)
+	{
+		this->x = rhs.x;
+		this->y = rhs.y;
+		this->z = rhs.z;
+		this->w = rhs.w;
+	}
+
+	// Copy assignment: assigns the four components of 'rhs' and returns *this.
+	Vector4f &Vector4f::operator=(const Vector4f &rhs)
+	{
+		this->x = rhs.x;
+		this->y = rhs.y;
+		this->z = rhs.z;
+		this->w = rhs.w;
+
+		Vector4f &self = *this;
+		return self;
+	}
+
+	// Component access by index: 0 to 3 select x, y, z and w. Any other index
+	// falls back to x.
+	Float4 &Vector4f::operator[](int i)
+	{
+		if(i == 1) return y;
+		if(i == 2) return z;
+		if(i == 3) return w;
+
+		// Index 0 and every out-of-range index
+		return x;
+	}
+
+	// Approximates 2^x for each lane. The 'pp' argument is accepted but not
+	// consulted by this implementation.
+	Float4 exponential2(RValue<Float4> x, bool pp)
+	{
+		// 2^x is split as 2^(i + f) = 2^i * 2^f, with i the integer part of x
+		// and f the fraction.
+
+		// Clamp first, so that the exponent built below cannot overflow past
+		// the representation of infinity.
+		Float4 clamped = x;
+		clamped = Min(clamped, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
+		clamped = Max(clamped, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f
+
+		// 2^i: place the integer part directly in the exponent field of an
+		// IEEE-754 number, after adding the single-precision bias.
+		Int4 whole = RoundInt(clamped - Float4(0.5f));
+		Float4 powerOfTwo = As<Float4>((whole + Int4(127)) << 23);
+
+		// 2^f: polynomial approximation of 2^f over the 0 to 1 range.
+		Float4 fraction = clamped - Float4(whole);
+		Float4 polynomial = As<Float4>(Int4(0x3AF61905));     // 1.8775767e-3f
+		polynomial = polynomial * fraction + As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
+		polynomial = polynomial * fraction + As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
+		polynomial = polynomial * fraction + As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
+		polynomial = polynomial * fraction + As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
+		polynomial = polynomial * fraction + Float4(1.0f);
+
+		return powerOfTwo * polynomial;
+	}
+
+	// Approximates log2(x) for each lane. Lanes holding +infinity return the
+	// input unchanged. The 'absolute' and 'pp' arguments are accepted but not
+	// consulted by this implementation.
+	Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)
+	{
+		Float4 mantissa;
+		Float4 result;
+		Float4 numerator;
+		Float4 denominator;
+
+		mantissa = x;
+
+		// Isolate the exponent bits and turn them into a float
+		result = As<Float4>(As<Int4>(mantissa) & Int4(0x7F800000));
+		result = As<Float4>(As<UInt4>(result) >> 8);
+		result = As<Float4>(As<Int4>(result) | As<Int4>(Float4(1.0f)));
+		result = (result - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;
+
+		// Keep the mantissa bits and give them the exponent of 1.0f
+		mantissa = As<Float4>((As<Int4>(mantissa) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
+
+		// Rational approximation evaluated on the mantissa
+		numerator = (Float4(9.5428179e-2f) * mantissa + Float4(4.7779095e-1f)) * mantissa + Float4(1.9782813e-1f);
+		denominator = ((Float4(1.6618466e-2f) * mantissa + Float4(2.0350508e-1f)) * mantissa + Float4(2.7382900e-1f)) * mantissa + Float4(4.0496687e-2f);
+		numerator /= denominator;
+
+		result += (mantissa - Float4(1.0f)) * numerator;
+
+		// Select the input itself where it is +infinity, the approximation elsewhere
+		Int4 isPositiveInfinity = CmpEQ(As<Int4>(x), Int4(0x7F800000));
+		return As<Float4>((isPositiveInfinity & As<Int4>(x)) | (~isPositiveInfinity & As<Int4>(result)));
+	}
+
+	// Natural exponential, computed as 2^(x / ln(2)).
+	Float4 exponential(RValue<Float4> x, bool pp)
+	{
+		// FIXME: Propagate the constant
+		Float4 scaled = Float4(1.44269504f) * x;   // 1/ln(2)
+
+		return exponential2(scaled, pp);
+	}
+
+	// Natural logarithm, computed as ln(2) * log2(x).
+	Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)
+	{
+		// FIXME: Propagate the constant
+		const float ln2 = 6.93147181e-1f;   // ln(2)
+
+		return Float4(ln2) * logarithm2(x, absolute, pp);
+	}
+
+	// Computes x^y as 2^(y * log2(x)).
+	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
+	{
+		Float4 exponent = logarithm2(x, true, pp);
+		exponent *= y;
+
+		return exponential2(exponent, pp);
+	}
+
+	// Computes 1/x for each lane.
+	//   pp          - partial precision: skips the refinement of the approximation
+	//   finite      - clamps the result to at most the largest finite float
+	//   exactAtPow2 - forwarded to Rcp_pp()
+	// A full division is used when full precision is requested and
+	// rcpPrecision is at least WHQL.
+	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
+	{
+		Float4 rcp;
+
+		if(!pp && rcpPrecision >= WHQL)
+		{
+			rcp = Float4(1.0f) / x;
+		}
+		else
+		{
+			rcp = Rcp_pp(x, exactAtPow2);
+
+			if(!pp)
+			{
+				// One refinement step: r' = 2r - x * r^2
+				rcp = (rcp + rcp) - (x * rcp * rcp);
+			}
+		}
+
+		if(finite)
+		{
+			// 0x7F7FFFFF is the bit pattern of the largest finite float. It is
+			// built with the same As<Float4>(Int4()) idiom as the other bit
+			// pattern constants in this file, instead of reading an int
+			// through a float reference.
+			rcp = Min(rcp, As<Float4>(Int4(0x7F7FFFFF)));
+		}
+
+		return rcp;
+	}
+
+	// Computes 1/sqrt(x) for each lane, optionally on |x|.
+	//   absolute - take the absolute value of x first
+	//   pp       - partial precision: use the RcpSqrt_pp() approximation
+	//              instead of a square root and a division
+	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
+	{
+		Float4 abs = x;
+
+		if(absolute)
+		{
+			abs = Abs(abs);
+		}
+
+		Float4 rsq;
+
+		if(!pp)
+		{
+			rsq = Float4(1.0f) / Sqrt(abs);
+		}
+		else
+		{
+			// No refinement step in this path: it is only reached for partial
+			// precision, so a step guarded by !pp could never run here.
+			rsq = RcpSqrt_pp(abs);
+
+			// Force the result to zero where the input is +infinity (0x7F800000)
+			rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
+		}
+
+		return rsq;
+	}
+
+	Float4 modulo(RValue<Float4> x, RValue<Float4> y)
+	{
+		return x - y * Floor(x / y);   // Floored modulo: x minus floor(x / y) whole multiples of y
+	}
+
+	Float4 sine_pi(RValue<Float4> x, bool pp)
+	{
+		// Coefficients of the parabolic fit and of its refinement
+		const Float4 quadratic = Float4(-4.05284734e-1f);   // -4/pi^2
+		const Float4 linear = Float4(1.27323954e+0f);       // 4/pi
+		const Float4 blendLinear = Float4(7.75160950e-1f);
+		const Float4 blendQuadratic = Float4(2.24839049e-1f);
+
+		// First pass: parabola approximating sine
+		Float4 parabola = x * (Abs(x) * quadratic + linear);
+		Float4 magnitude = Abs(parabola);
+
+		// Second pass: improve precision from 0.06 to 0.001
+		Float4 refined = parabola * (magnitude * blendQuadratic + blendLinear);
+
+		// Note: pp is not consulted; the refinement is always applied.
+		return refined;
+	}
+
+	Float4 cosine_pi(RValue<Float4> x, bool pp)
+	{
+		Float4 shifted = x + Float4(1.57079632e+0f);   // cos(x) = sin(x + pi/2)
+
+		// Wrap around: lanes that are not less than pi have 2*pi subtracted.
+		// The comparison mask selects either the bits of 2*pi or of 0.0f.
+		Int4 wrap = CmpNLT(shifted, Float4(3.14159265e+0f));
+		shifted -= As<Float4>(wrap & As<Int4>(Float4(6.28318530e+0f)));
+		return sine_pi(shifted, pp);
+	}
+
+	Float4 sine(RValue<Float4> x, bool pp)
+	{
+		// Express the angle in turns and reduce it to the [-0.5, 0.5] range
+		Float4 turns = x * Float4(1.59154943e-1f);   // 1/2pi
+		turns = turns - Round(turns);
+
+		if(!pp)
+		{
+			// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
+			// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
+			// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
+			//  pp : 4 mul, 2 add, 2 abs
+
+			Float4 sq = turns * turns;
+			Float4 cosQuarter = sq * (sq * (sq * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f);
+			Float4 sinQuarter = turns * (sq * (sq * (sq * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f));
+			Float4 cosHalf = (cosQuarter * cosQuarter) - (sinQuarter * sinQuarter);
+			Float4 sinHalf = Float4(2.0f) * sinQuarter * cosQuarter;
+			return Float4(2.0f) * sinHalf * cosHalf * reciprocal(sinHalf * sinHalf + cosHalf * cosHalf, pp, true);
+		}
+
+		const Float4 quadratic = Float4(-16.0f);
+		const Float4 linear = Float4(8.0f);
+		const Float4 blendLinear = Float4(7.75160950e-1f);
+		const Float4 blendQuadratic = Float4(2.24839049e-1f);
+
+		// Parabola approximating sine
+		Float4 parabola = turns * (Abs(turns) * quadratic + linear);
+
+		// Improve precision from 0.06 to 0.001
+		Float4 magnitude = Abs(parabola);
+		Float4 refined = parabola * (magnitude * blendQuadratic + blendLinear);
+
+		// This is the partial-precision (pp) result: 4 mul, 2 add, 2 abs,
+		// as tallied in the comment of the full-precision branch.
+		return refined;
+	}
+
+	Float4 cosine(RValue<Float4> x, bool pp)
+	{
+		// A cosine is a sine that leads by a quarter period:
+		// cos(x) = sin(x + pi/2)
+		return sine(x + Float4(1.57079632e+0f), pp);
+	}
+
+	Float4 tangent(RValue<Float4> x, bool pp)
+	{
+		return sine(x, pp) / cosine(x, pp);   // tan(x) = sin(x) / cos(x), both evaluated with the same pp setting
+	}
+
+	Float4 arccos(RValue<Float4> x, bool pp)
+	{
+		const Float4 half_pi(1.57079632e+0f);
+		return half_pi - arcsin(x);   // acos(x) = pi/2 - asin(x); pp is not forwarded to arcsin
+	}
+
+	Float4 arcsin(RValue<Float4> x, bool pp)   // Approximates asin(x); pp is not used
+	{
+		if(false) // Simpler implementation fails even lowp precision tests
+		{
+			// x*(pi/2-sqrt(1-x*x)*pi/5)
+			return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
+		}
+		else
+		{
+			// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+			const Float4 half_pi(1.57079632f);
+			const Float4 a0(1.5707288f);
+			const Float4 a1(-0.2121144f);
+			const Float4 a2(0.0742610f);
+			const Float4 a3(-0.0187293f);
+			Float4 absx = Abs(x);   // The polynomial is evaluated on |x|
+			return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
+			       (As<Int4>(x) & Int4(0x80000000)));   // XOR with the sign bit of x so negative inputs give negated results
+		}
+	}
+
+	// Approximates atan(x) for x in [0..1]
+	Float4 arctan_01(Float4 x, bool pp)
+	{
+		if(!pp)
+		{
+			// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+			const Float4 c2(-0.3333314528f);
+			const Float4 c4(0.1999355085f);
+			const Float4 c6(-0.1420889944f);
+			const Float4 c8(0.1065626393f);
+			const Float4 c10(-0.0752896400f);
+			const Float4 c12(0.0429096138f);
+			const Float4 c14(-0.0161657367f);
+			const Float4 c16(0.0028662257f);
+			Float4 xx = x * x;
+			return (x + x * (xx * (c2 + xx * (c4 + xx * (c6 + xx * (c8 + xx * (c10 + xx * (c12 + xx * (c14 + xx * c16)))))))));
+		}
+		else
+		{
+			return x * (Float4(-0.27f) * x + Float4(1.05539816f));   // Quadratic fit used for partial precision
+		}
+	}
+
+	Float4 arctan(RValue<Float4> x, bool pp)
+	{
+		Float4 magnitude = Abs(x);
+		Int4 aboveOne = CmpNLT(magnitude, Float4(1.0f));   // Lanes where |x| is not less than 1
+		Float4 reduced = As<Float4>((aboveOne & As<Int4>(Float4(1.0f) / magnitude)) | (~aboveOne & As<Int4>(magnitude))); // FIXME: Vector select
+
+		const Float4 half_pi(1.57079632f);
+		Float4 angle = arctan_01(reduced, pp);   // atan of |x|, or of 1/|x| in the aboveOne lanes
+		return As<Float4>(((aboveOne & As<Int4>(half_pi - angle)) | (~aboveOne & As<Int4>(angle))) ^ // FIXME: Vector select
+		       (As<Int4>(x) & Int4(0x80000000)));   // atan(t) = pi/2 - atan(1/t); then the sign bit of x is applied
+	}
+
+	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)   // Two-argument arctangent of (y, x), built from octant reductions around arctan_01
+	{
+		const Float4 pi(3.14159265f);            // pi
+		const Float4 minus_pi(-3.14159265f);     // -pi
+		const Float4 half_pi(1.57079632f);       // pi/2
+		const Float4 quarter_pi(7.85398163e-1f); // pi/4
+
+		// Rotate to upper semicircle when in lower semicircle
+		Int4 S = CmpLT(y, Float4(0.0f));
+		Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
+		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
+		Float4 y0 = Abs(y);
+
+		// Rotate to right quadrant when in left quadrant
+		Int4 Q = CmpLT(x0, Float4(0.0f));
+		theta += As<Float4>(Q & As<Int4>(half_pi));
+		Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));  // FIXME: Vector select
+		Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select
+
+		// Mirror to first octant when in second octant
+		Int4 O = CmpNLT(y1, x1);
+		Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
+		Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select
+
+		// Approximation of atan in [0..1]
+		Int4 zero_x = CmpEQ(x2, Float4(0.0f));
+		Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
+		Float4 atan2_theta = arctan_01(y2 / x2, pp);
+		theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
+		                    (inf_y & As<Int4>(quarter_pi)));
+
+		// Recover loss of precision for tiny theta angles
+		Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
+		return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
+	}
+
+	Float4 sineh(RValue<Float4> x, bool pp)
+	{
+		return (exponential(x, pp) - exponential(-x, pp)) * Float4(0.5f);   // sinh(x) = (e^x - e^-x) / 2
+	}
+
+	Float4 cosineh(RValue<Float4> x, bool pp)
+	{
+		return (exponential(x, pp) + exponential(-x, pp)) * Float4(0.5f);   // cosh(x) = (e^x + e^-x) / 2
+	}
+
+	Float4 tangenth(RValue<Float4> x, bool pp)
+	{
+		Float4 grow = exponential(x, pp);     // e^x
+		Float4 decay = exponential(-x, pp);   // e^-x
+		return (grow - decay) / (grow + decay);   // tanh(x) = (e^x - e^-x) / (e^x + e^-x)
+	}
+
+	Float4 arccosh(RValue<Float4> x, bool pp)   // acosh(x) = ln(x + sqrt(x + 1) * sqrt(x - 1))
+	{
+		return logarithm(x + Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f)), false, pp);   // pp must go in logarithm's third (pp) slot, not its 'absolute' slot
+	}
+
+	Float4 arcsinh(RValue<Float4> x, bool pp)   // asinh(x) = ln(x + sqrt(x^2 + 1))
+	{
+		return logarithm(x + Sqrt(x * x + Float4(1.0f)), false, pp);   // pp must go in logarithm's third (pp) slot, not its 'absolute' slot
+	}
+
+	Float4 arctanh(RValue<Float4> x, bool pp)   // atanh(x) = ln((1 + x) / (1 - x)) / 2
+	{
+		return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), false, pp) * Float4(0.5f);   // pp must go in logarithm's third (pp) slot, not its 'absolute' slot
+	}
+
+	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
+	{
+		return v0.x * v1.x + v0.y * v1.y;   // Dot product over the x and y components only
+	}
+
+	Float4 dot3(const Vector4f &v0, const Vector4f &v1)
+	{
+		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;   // Dot product over the x, y and z components; w is ignored
+	}
+
+	Float4 dot4(const Vector4f &v0, const Vector4f &v1)
+	{
+		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;   // Dot product over all four components
+	}
+
+	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
+	{
+		Int2 lo01 = UnpackLow(row0, row1);
+		Int2 lo23 = UnpackLow(row2, row3);
+		Int2 hi01 = UnpackHigh(row0, row1);
+		Int2 hi23 = UnpackHigh(row2, row3);
+		// In-place transpose: a second unpack pass combines the row pairs
+		row0 = UnpackLow(lo01, lo23);
+		row1 = UnpackHigh(lo01, lo23);
+		row2 = UnpackLow(hi01, hi23);
+		row3 = UnpackHigh(hi01, hi23);
+	}
+
+	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
+	{
+		Int2 lo01 = UnpackLow(row0, row1);
+		Int2 lo23 = UnpackLow(row2, row3);
+		Int2 hi01 = UnpackHigh(row0, row1);
+		Int2 hi23 = UnpackHigh(row2, row3);
+		// Same as transpose4x4, except that row3 is read but not written
+		row0 = UnpackLow(lo01, lo23);
+		row1 = UnpackHigh(lo01, lo23);
+		row2 = UnpackLow(hi01, hi23);
+	}
+
+	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+	{
+		Float4 lo01 = UnpackLow(row0, row1);
+		Float4 lo23 = UnpackLow(row2, row3);
+		Float4 hi01 = UnpackHigh(row0, row1);
+		Float4 hi23 = UnpackHigh(row2, row3);
+		// Each output row pairs two elements from rows 0/1 with two from rows 2/3
+		row0 = Float4(lo01.xy, lo23.xy);
+		row1 = Float4(lo01.zw, lo23.zw);
+		row2 = Float4(hi01.xy, hi23.xy);
+		row3 = Float4(hi01.zw, hi23.zw);
+	}
+
+	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+	{
+		Float4 lo01 = UnpackLow(row0, row1);
+		Float4 lo23 = UnpackLow(row2, row3);
+		Float4 hi01 = UnpackHigh(row0, row1);
+		Float4 hi23 = UnpackHigh(row2, row3);
+		// Only the first three transposed rows are produced; row3 is left as it was
+		row0 = Float4(lo01.xy, lo23.xy);
+		row1 = Float4(lo01.zw, lo23.zw);
+		row2 = Float4(hi01.xy, hi23.xy);
+	}
+
+	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+	{
+		Float4 lo01 = UnpackLow(row0, row1);
+		Float4 lo23 = UnpackLow(row2, row3);
+		// Only the first two transposed rows are produced; row2 and row3 are left as they were
+		row0 = Float4(lo01.xy, lo23.xy);
+		row1 = Float4(lo01.zw, lo23.zw);
+	}
+
+	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+	{
+		Float4 lo01 = UnpackLow(row0, row1);
+		Float4 lo23 = UnpackLow(row2, row3);
+		// Only the first transposed row is produced; the other rows are left as they were
+		row0 = Float4(lo01.xy, lo23.xy);
+	}
+
+	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+	{
+		Float4 lo = UnpackLow(row0, row1);
+		Float4 hi = UnpackHigh(row0, row1);
+		// Only row0 and row1 supply new data; row1 and row3 keep their own zw halves
+		row0 = lo;
+		row1 = Float4(lo.zw, row1.zw);
+		row2 = hi;
+		row3 = Float4(hi.zw, row3.zw);
+	}
+
+	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)
+	{
+		// Dispatches on N to the matching transpose4x<N> helper, which writes
+		// the first N rows of the transposed result back into row0..row3.
+		// Any other value of N leaves the rows untouched.
+		if(N == 1)      { transpose4x1(row0, row1, row2, row3); }
+		else if(N == 2) { transpose4x2(row0, row1, row2, row3); }
+		else if(N == 3) { transpose4x3(row0, row1, row2, row3); }
+		else if(N == 4) { transpose4x4(row0, row1, row2, row3); }
+	}
+
+	const Vector4f RegisterFile::operator[](RValue<Int4> index)
+	{
+		ASSERT(indirectAddressable);
+		// One register index per lane
+		Int lane0 = Extract(index, 0);
+		Int lane1 = Extract(index, 1);
+		Int lane2 = Extract(index, 2);
+		Int lane3 = Extract(index, 3);
+
+		Vector4f gathered;
+		// Gather: lane i of every component is read from lane i of the register picked by lane i of index
+		gathered.x.x = Extract(x[0][lane0], 0);
+		gathered.x.y = Extract(x[0][lane1], 1);
+		gathered.x.z = Extract(x[0][lane2], 2);
+		gathered.x.w = Extract(x[0][lane3], 3);
+
+		gathered.y.x = Extract(y[0][lane0], 0);
+		gathered.y.y = Extract(y[0][lane1], 1);
+		gathered.y.z = Extract(y[0][lane2], 2);
+		gathered.y.w = Extract(y[0][lane3], 3);
+
+		gathered.z.x = Extract(z[0][lane0], 0);
+		gathered.z.y = Extract(z[0][lane1], 1);
+		gathered.z.z = Extract(z[0][lane2], 2);
+		gathered.z.w = Extract(z[0][lane3], 3);
+
+		gathered.w.x = Extract(w[0][lane0], 0);
+		gathered.w.y = Extract(w[0][lane1], 1);
+		gathered.w.z = Extract(w[0][lane2], 2);
+		gathered.w.w = Extract(w[0][lane3], 3);
+
+		return gathered;
+	}
+
+	void RegisterFile::scatter_x(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int lane0 = Extract(index, 0);
+		Int lane1 = Extract(index, 1);
+		Int lane2 = Extract(index, 2);
+		Int lane3 = Extract(index, 3);
+		// Scatter: lane i of r replaces lane i of the x register picked by lane i of index
+		x[0][lane0] = Insert(x[0][lane0], Extract(r, 0), 0);
+		x[0][lane1] = Insert(x[0][lane1], Extract(r, 1), 1);
+		x[0][lane2] = Insert(x[0][lane2], Extract(r, 2), 2);
+		x[0][lane3] = Insert(x[0][lane3], Extract(r, 3), 3);
+	}
+
+	void RegisterFile::scatter_y(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int lane0 = Extract(index, 0);
+		Int lane1 = Extract(index, 1);
+		Int lane2 = Extract(index, 2);
+		Int lane3 = Extract(index, 3);
+		// Scatter: lane i of r replaces lane i of the y register picked by lane i of index
+		y[0][lane0] = Insert(y[0][lane0], Extract(r, 0), 0);
+		y[0][lane1] = Insert(y[0][lane1], Extract(r, 1), 1);
+		y[0][lane2] = Insert(y[0][lane2], Extract(r, 2), 2);
+		y[0][lane3] = Insert(y[0][lane3], Extract(r, 3), 3);
+	}
+
+	void RegisterFile::scatter_z(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int lane0 = Extract(index, 0);
+		Int lane1 = Extract(index, 1);
+		Int lane2 = Extract(index, 2);
+		Int lane3 = Extract(index, 3);
+		// Scatter: lane i of r replaces lane i of the z register picked by lane i of index
+		z[0][lane0] = Insert(z[0][lane0], Extract(r, 0), 0);
+		z[0][lane1] = Insert(z[0][lane1], Extract(r, 1), 1);
+		z[0][lane2] = Insert(z[0][lane2], Extract(r, 2), 2);
+		z[0][lane3] = Insert(z[0][lane3], Extract(r, 3), 3);
+	}
+
+	void RegisterFile::scatter_w(Int4 index, RValue<Float4> r)
+	{
+		ASSERT(indirectAddressable);
+
+		Int lane0 = Extract(index, 0);
+		Int lane1 = Extract(index, 1);
+		Int lane2 = Extract(index, 2);
+		Int lane3 = Extract(index, 3);
+		// Scatter: lane i of r replaces lane i of the w register picked by lane i of index
+		w[0][lane0] = Insert(w[0][lane0], Extract(r, 0), 0);
+		w[0][lane1] = Insert(w[0][lane1], Extract(r, 1), 1);
+		w[0][lane2] = Insert(w[0][lane2], Extract(r, 2), 2);
+		w[0][lane3] = Insert(w[0][lane3], Extract(r, 3), 3);
+	}
+
+	void ShaderCore::mov(Vector4f &dst, const Vector4f &src, bool integerDestination)
+	{
+		if(!integerDestination)
+		{
+			dst = src;   // Plain copy of all four components
+		}
+		else
+		{
+			dst.x = As<Float4>(RoundInt(src.x));   // Round to integer and keep the integer bit pattern
+			dst.y = As<Float4>(RoundInt(src.y));
+			dst.z = As<Float4>(RoundInt(src.z));
+			dst.w = As<Float4>(RoundInt(src.w));
+		}
+	}
+
+	void ShaderCore::neg(Vector4f &dst, const Vector4f &src)   // dst = -src per component (floating-point negation)
+	{
+		dst.x = -src.x;
+		dst.y = -src.y;
+		dst.z = -src.z;
+		dst.w = -src.w;
+	}
+
+	void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)   // dst = -src per component, with the bits reinterpreted as signed integers
+	{
+		dst.x = As<Float4>(-As<Int4>(src.x));
+		dst.y = As<Float4>(-As<Int4>(src.y));
+		dst.z = As<Float4>(-As<Int4>(src.z));
+		dst.w = As<Float4>(-As<Int4>(src.w));
+	}
+
+	void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)   // Float to boolean: stores the comparison mask of src != 0.0f per component
+	{
+		dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));
+		dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f)));
+		dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f)));
+		dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f)));
+	}
+
+	void ShaderCore::b2f(Vector4f &dst, const Vector4f &src)   // Boolean to float: ANDs src with the bits of 1.0f, so an all-ones lane gives 1.0f and a zero lane gives 0.0f
+	{
+		dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f)));
+		dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f)));
+		dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f)));
+		dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
+	}
+
+	void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)   // Float to signed integer conversion; the integer bits are stored in dst
+	{
+		dst.x = As<Float4>(Int4(src.x));
+		dst.y = As<Float4>(Int4(src.y));
+		dst.z = As<Float4>(Int4(src.z));
+		dst.w = As<Float4>(Int4(src.w));
+	}
+
+	void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)   // Signed integer to float conversion; src holds integer bit patterns
+	{
+		dst.x = Float4(As<Int4>(src.x));
+		dst.y = Float4(As<Int4>(src.y));
+		dst.z = Float4(As<Int4>(src.z));
+		dst.w = Float4(As<Int4>(src.w));
+	}
+
+	void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)   // Float to unsigned integer conversion; the integer bits are stored in dst
+	{
+		dst.x = As<Float4>(UInt4(src.x));
+		dst.y = As<Float4>(UInt4(src.y));
+		dst.z = As<Float4>(UInt4(src.z));
+		dst.w = As<Float4>(UInt4(src.w));
+	}
+
+	void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)   // Unsigned integer to float conversion; src holds integer bit patterns
+	{
+		dst.x = Float4(As<UInt4>(src.x));
+		dst.y = Float4(As<UInt4>(src.y));
+		dst.z = Float4(As<UInt4>(src.z));
+		dst.w = Float4(As<UInt4>(src.w));
+	}
+
+	void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)   // Integer to boolean: stores the comparison mask of src != 0 per component
+	{
+		dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0)));
+		dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0)));
+		dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0)));
+		dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0)));
+	}
+
+	void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)   // Boolean to integer: keeps only bit 0 of src, so an all-ones lane gives integer 1
+	{
+		dst.x = As<Float4>(As<Int4>(src.x) & Int4(1));
+		dst.y = As<Float4>(As<Int4>(src.y) & Int4(1));
+		dst.z = As<Float4>(As<Int4>(src.z) & Int4(1));
+		dst.w = As<Float4>(As<Int4>(src.w) & Int4(1));
+	}
+
+	void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 + src1 per component (floating-point)
+	{
+		dst.x = src0.x + src1.x;
+		dst.y = src0.y + src1.y;
+		dst.z = src0.z + src1.z;
+		dst.w = src0.w + src1.w;
+	}
+
+	void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 + src1 per component, on integer bit patterns
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
+	}
+
+	void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 - src1 per component (floating-point)
+	{
+		dst.x = src0.x - src1.x;
+		dst.y = src0.y - src1.y;
+		dst.z = src0.z - src1.z;
+		dst.w = src0.w - src1.w;
+	}
+
+	void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 - src1 per component, on integer bit patterns
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
+	}
+
+	void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)   // Multiply-add: dst = src0 * src1 + src2 per component (floating-point)
+	{
+		dst.x = src0.x * src1.x + src2.x;
+		dst.y = src0.y * src1.y + src2.y;
+		dst.z = src0.z * src1.z + src2.z;
+		dst.w = src0.w * src1.w + src2.w;
+	}
+
+	void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)   // Multiply-add: dst = src0 * src1 + src2 per component, on integer bit patterns
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
+	}
+
+	void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 * src1 per component (floating-point)
+	{
+		dst.x = src0.x * src1.x;
+		dst.y = src0.y * src1.y;
+		dst.z = src0.z * src1.z;
+		dst.w = src0.w * src1.w;
+	}
+
+	void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 * src1 per component, on integer bit patterns
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
+	}
+
+	void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
+	{
+		// Scalar reciprocal of src.x (requested finite and exact at powers of two), replicated to all of dst
+		Float4 splat = reciprocal(src.x, pp, true, true);
+		dst.x = splat;
+		dst.y = splat;
+		dst.z = splat;
+		dst.w = splat;
+	}
+
+	void ShaderCore::div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 / src1 per component (floating-point)
+	{
+		dst.x = src0.x / src1.x;
+		dst.y = src0.y / src1.y;
+		dst.z = src0.z / src1.z;
+		dst.w = src0.w / src1.w;
+	}
+
+	void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Signed integer division of src0 by src1 per component, on integer bit patterns
+	{
+		Float4 intMax(As<Float4>(Int4(INT_MAX)));   // INT_MAX bit pattern, handed to cmp0i as the alternative divisor
+		cmp0i(dst.x, src1.x, intMax, src1.x);   // NOTE(review): presumably picks intMax where src1 is 0 so the division never sees a zero divisor - confirm against cmp0i. dst doubles as the divisor temporary.
+		dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
+		cmp0i(dst.y, src1.y, intMax, src1.y);
+		dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
+		cmp0i(dst.z, src1.z, intMax, src1.z);
+		dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
+		cmp0i(dst.w, src1.w, intMax, src1.w);
+		dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
+	}
+
+	void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Unsigned integer division of src0 by src1 per component, on integer bit patterns
+	{
+		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));   // UINT_MAX bit pattern, handed to cmp0i as the alternative divisor
+		cmp0i(dst.x, src1.x, uintMax, src1.x);   // NOTE(review): presumably picks uintMax where src1 is 0 so the division never sees a zero divisor - confirm against cmp0i. dst doubles as the divisor temporary.
+		dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
+		cmp0i(dst.y, src1.y, uintMax, src1.y);
+		dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
+		cmp0i(dst.z, src1.z, uintMax, src1.z);
+		dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
+		cmp0i(dst.w, src1.w, uintMax, src1.w);
+		dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
+	}
+
+	void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = modulo(src0, src1) per component (floating-point)
+	{
+		dst.x = modulo(src0.x, src1.x);
+		dst.y = modulo(src0.y, src1.y);
+		dst.z = modulo(src0.z, src1.z);
+		dst.w = modulo(src0.w, src1.w);
+	}
+
+	void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Signed integer remainder of src0 by src1 per component, on integer bit patterns
+	{
+		Float4 intMax(As<Float4>(Int4(INT_MAX)));   // INT_MAX bit pattern, handed to cmp0i as the alternative divisor
+		cmp0i(dst.x, src1.x, intMax, src1.x);   // NOTE(review): presumably picks intMax where src1 is 0 so the remainder never sees a zero divisor - confirm against cmp0i. dst doubles as the divisor temporary.
+		dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
+		cmp0i(dst.y, src1.y, intMax, src1.y);
+		dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
+		cmp0i(dst.z, src1.z, intMax, src1.z);
+		dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
+		cmp0i(dst.w, src1.w, intMax, src1.w);
+		dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
+	}
+
+	void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Unsigned integer remainder of src0 by src1 per component, on integer bit patterns
+	{
+		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));   // UINT_MAX bit pattern, handed to cmp0i as the alternative divisor
+		cmp0i(dst.x, src1.x, uintMax, src1.x);   // NOTE(review): presumably picks uintMax where src1 is 0 so the remainder never sees a zero divisor - confirm against cmp0i. dst doubles as the divisor temporary.
+		dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
+		cmp0i(dst.y, src1.y, uintMax, src1.y);
+		dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
+		cmp0i(dst.z, src1.z, uintMax, src1.z);
+		dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
+		cmp0i(dst.w, src1.w, uintMax, src1.w);
+		dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
+	}
+
+	void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 << src1 per component, on integer bit patterns with per-lane shift counts
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
+	}
+
+	void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 >> src1 per component, with both operands treated as signed integers (Int4)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
+	}
+
+	void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = src0 >> src1 per component, with both operands treated as unsigned integers (UInt4)
+	{
+		dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
+		dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
+		dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
+		dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
+	}
+
+	void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
+	{
+		// Scalar reciprocal square root of src.x (with 'absolute' set), replicated to all of dst
+		Float4 splat = reciprocalSquareRoot(src.x, true, pp);
+		dst.x = splat;
+		dst.y = splat;
+		dst.z = splat;
+		dst.w = splat;
+	}
+
+	void ShaderCore::sqrt(Vector4f &dst, const Vector4f &src, bool pp)   // dst = Sqrt(src) per component; pp is not used
+	{
+		dst.x = Sqrt(src.x);
+		dst.y = Sqrt(src.y);
+		dst.z = Sqrt(src.z);
+		dst.w = Sqrt(src.w);
+	}
+
+	void ShaderCore::rsq(Vector4f &dst, const Vector4f &src, bool pp)   // Reciprocal square root per component, with 'absolute' cleared (no |src| is taken)
+	{
+		dst.x = reciprocalSquareRoot(src.x, false, pp);
+		dst.y = reciprocalSquareRoot(src.y, false, pp);
+		dst.z = reciprocalSquareRoot(src.z, false, pp);
+		dst.w = reciprocalSquareRoot(src.w, false, pp);
+	}
+
+	void ShaderCore::len2(Float4 &dst, const Vector4f &src, bool pp)
+	{
+		dst = Sqrt(dot2(src, src));   // Length of the xy part of src; pp is not used
+	}
+
+	void ShaderCore::len3(Float4 &dst, const Vector4f &src, bool pp)
+	{
+		dst = Sqrt(dot3(src, src));   // Length of the xyz part of src; pp is not used
+	}
+
+	void ShaderCore::len4(Float4 &dst, const Vector4f &src, bool pp)
+	{
+		dst = Sqrt(dot4(src, src));   // Length of the full xyzw vector; pp is not used
+	}
+
+	void ShaderCore::dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+	{
+		dst = Abs(src0.x - src1.x);   // One-dimensional distance: |src0.x - src1.x|; pp is not used
+	}
+
+	void ShaderCore::dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+	{
+		Float4 deltaX = src0.x - src1.x;
+		Float4 deltaY = src0.y - src1.y;
+		Float4 squared = deltaX * deltaX + deltaY * deltaY;   // Squared length of the xy difference
+		dst = Sqrt(squared);   // Euclidean distance; pp is not used
+	}
+
+	void ShaderCore::dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+	{
+		Float4 deltaX = src0.x - src1.x;
+		Float4 deltaY = src0.y - src1.y;
+		Float4 deltaZ = src0.z - src1.z;
+		Float4 squared = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;   // Squared length of the xyz difference
+		dst = Sqrt(squared);   // Euclidean distance; pp is not used
+	}
+
+	void ShaderCore::dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp)
+	{
+		Float4 deltaX = src0.x - src1.x;
+		Float4 deltaY = src0.y - src1.y;
+		Float4 deltaZ = src0.z - src1.z;
+		Float4 deltaW = src0.w - src1.w;
+		Float4 squared = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + deltaW * deltaW;   // Squared length of the xyzw difference
+		dst = Sqrt(squared);   // Euclidean distance; pp is not used
+	}
+
+	void ShaderCore::dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		// One-component dot product (the product of the x components), replicated to all of dst
+		Float4 product = src0.x * src1.x;
+		dst.x = product;
+		dst.y = product;
+		dst.z = product;
+		dst.w = product;
+	}
+
+	void ShaderCore::dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		// Two-component dot product, replicated to all of dst
+		Float4 product = dot2(src0, src1);
+		dst.x = product;
+		dst.y = product;
+		dst.z = product;
+		dst.w = product;
+	}
+
+	void ShaderCore::dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+	{
+		// Two-component dot product plus src2.x, replicated to all of dst
+		Float4 sum = dot2(src0, src1) + src2.x;
+		dst.x = sum;
+		dst.y = sum;
+		dst.z = sum;
+		dst.w = sum;
+	}
+
+	void ShaderCore::dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		// Three-component dot product, replicated to all of dst
+		Float4 product = dot3(src0, src1);
+		dst.x = product;
+		dst.y = product;
+		dst.z = product;
+		dst.w = product;
+	}
+
+	void ShaderCore::dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		// Four-component dot product, replicated to all of dst
+		Float4 product = dot4(src0, src1);
+		dst.x = product;
+		dst.y = product;
+		dst.z = product;
+		dst.w = product;
+	}
+
+	void ShaderCore::min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = Min(src0, src1) per component (floating-point)
+	{
+		dst.x = Min(src0.x, src1.x);
+		dst.y = Min(src0.y, src1.y);
+		dst.z = Min(src0.z, src1.z);
+		dst.w = Min(src0.w, src1.w);
+	}
+
+	void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = Min(src0, src1) per component, compared as signed integers (Int4)
+	{
+		dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
+		dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
+		dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
+		dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
+	}
+
+	void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = Min(src0, src1) per component, compared as unsigned integers (UInt4)
+	{
+		dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+		dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+		dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+		dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+	}
+
+	void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = Max(src0, src1) per component (floating-point)
+	{
+		dst.x = Max(src0.x, src1.x);
+		dst.y = Max(src0.y, src1.y);
+		dst.z = Max(src0.z, src1.z);
+		dst.w = Max(src0.w, src1.w);
+	}
+
+	void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = Max(src0, src1) per component, compared as signed integers (Int4)
+	{
+		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
+		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
+		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
+		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
+	}
+
+	void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // dst = Max(src0, src1) per component, compared as unsigned integers (UInt4), mirroring umin
+	{
+		dst.x = As<Float4>(Max(As<UInt4>(src0.x), As<UInt4>(src1.x)));   // Unsigned compare: values with the top bit set must rank above those without
+		dst.y = As<Float4>(Max(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+		dst.z = As<Float4>(Max(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+		dst.w = As<Float4>(Max(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+	}
+
+	void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Set on less-than: ANDs the src0 < src1 mask with the bits of 1.0f, giving 1.0f or 0.0f per lane
+	{
+		dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f)));
+		dst.y = As<Float4>(As<Int4>(CmpLT(src0.y, src1.y)) & As<Int4>(Float4(1.0f)));
+		dst.z = As<Float4>(As<Int4>(CmpLT(src0.z, src1.z)) & As<Int4>(Float4(1.0f)));
+		dst.w = As<Float4>(As<Int4>(CmpLT(src0.w, src1.w)) & As<Int4>(Float4(1.0f)));
+	}
+
+	void ShaderCore::step(Vector4f &dst, const Vector4f &edge, const Vector4f &x)   // Step function: 1.0f in lanes where x is not less than edge, 0.0f elsewhere
+	{
+		dst.x = As<Float4>(CmpNLT(x.x, edge.x) & As<Int4>(Float4(1.0f)));
+		dst.y = As<Float4>(CmpNLT(x.y, edge.y) & As<Int4>(Float4(1.0f)));
+		dst.z = As<Float4>(CmpNLT(x.z, edge.z) & As<Int4>(Float4(1.0f)));
+		dst.w = As<Float4>(CmpNLT(x.w, edge.w) & As<Int4>(Float4(1.0f)));
+	}
+
+	void ShaderCore::exp2x(Vector4f &dst, const Vector4f &src, bool pp)
+	{
+		// Scalar exponential2 of src.x, replicated to all of dst
+		Float4 splat = exponential2(src.x, pp);
+		dst.x = splat;
+		dst.y = splat;
+		dst.z = splat;
+		dst.w = splat;
+	}
+
+	void ShaderCore::exp2(Vector4f &dst, const Vector4f &src, bool pp)   // dst = exponential2(src) per component
+	{
+		dst.x = exponential2(src.x, pp);
+		dst.y = exponential2(src.y, pp);
+		dst.z = exponential2(src.z, pp);
+		dst.w = exponential2(src.w, pp);
+	}
+
+	void ShaderCore::exp(Vector4f &dst, const Vector4f &src, bool pp)   // dst = exponential(src) per component (natural exponential)
+	{
+		dst.x = exponential(src.x, pp);
+		dst.y = exponential(src.y, pp);
+		dst.z = exponential(src.z, pp);
+		dst.w = exponential(src.w, pp);
+	}
+
+	void ShaderCore::log2x(Vector4f &dst, const Vector4f &src, bool pp)
+	{
+		// Scalar logarithm2 of src.x (with 'absolute' set), replicated to all of dst
+		Float4 splat = logarithm2(src.x, true, pp);
+		dst.x = splat;
+		dst.y = splat;
+		dst.z = splat;
+		dst.w = splat;
+	}
+
+	void ShaderCore::log2(Vector4f &dst, const Vector4f &src, bool pp)   // dst = logarithm2(src) per component, with 'absolute' cleared
+	{
+		dst.x = logarithm2(src.x, false, pp);
+		dst.y = logarithm2(src.y, false, pp);
+		dst.z = logarithm2(src.z, false, pp);
+		dst.w = logarithm2(src.w, false, pp);
+	}
+
+	void ShaderCore::log(Vector4f &dst, const Vector4f &src, bool pp)   // dst = logarithm(src) per component (natural logarithm), with 'absolute' cleared
+	{
+		dst.x = logarithm(src.x, false, pp);
+		dst.y = logarithm(src.y, false, pp);
+		dst.z = logarithm(src.z, false, pp);
+		dst.w = logarithm(src.w, false, pp);
+	}
+
+	void ShaderCore::lit(Vector4f &dst, const Vector4f &src)
+	{
+		// Lighting coefficients: (1, max(src.x, 0), power(src.y, src.w) or 0, 1)
+		dst.x = Float4(1.0f);
+		dst.y = Max(src.x, Float4(0.0f));
+
+		// The exponent is limited to the [-127.9961, 127.9961] range
+		Float4 exponent = Max(Min(src.w, Float4(127.9961f)), Float4(-127.9961f));
+
+		// The power term is zeroed in lanes where src.x or src.y is less than
+		// zero; CmpNLT yields the lane mask that keeps the other lanes.
+		Float4 specular = power(src.y, exponent);
+		specular = As<Float4>(As<Int4>(specular) & CmpNLT(src.x, Float4(0.0f)));
+		dst.z = As<Float4>(As<Int4>(specular) & CmpNLT(src.y, Float4(0.0f)));
+
+		dst.w = Float4(1.0f);
+	}
+
+	void ShaderCore::att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		// Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
+		dst.x = 1;   // Constant term
+		dst.y = src0.y * src1.y;   // d^2 * 1/d = d
+		dst.z = src0.z;   // d^2, taken from src0
+		dst.w = src1.w;   // 1/d, taken from src1
+	}
+
+	void ShaderCore::lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)   // Linear interpolation: dst = src2 + src0 * (src1 - src2), so src0 = 0 gives src2 and src0 = 1 gives src1
+	{
+		dst.x = src0.x * (src1.x - src2.x) + src2.x;
+		dst.y = src0.y * (src1.y - src2.y) + src2.y;
+		dst.z = src0.z * (src1.z - src2.z) + src2.z;
+		dst.w = src0.w * (src1.w - src2.w) + src2.w;
+	}
+
+	void ShaderCore::isinf(Vector4f &dst, const Vector4f &src)   // Stores the result of IsInf(src) per component as raw bits
+	{
+		dst.x = As<Float4>(IsInf(src.x));
+		dst.y = As<Float4>(IsInf(src.y));
+		dst.z = As<Float4>(IsInf(src.z));
+		dst.w = As<Float4>(IsInf(src.w));
+	}
+
+	void ShaderCore::isnan(Vector4f &dst, const Vector4f &src)   // Stores the result of IsNan(src) per component as raw bits
+	{
+		dst.x = As<Float4>(IsNan(src.x));
+		dst.y = As<Float4>(IsNan(src.y));
+		dst.z = As<Float4>(IsNan(src.z));
+		dst.w = As<Float4>(IsNan(src.w));
+	}
+
+	void ShaderCore::smooth(Vector4f &dst, const Vector4f &edge0, const Vector4f &edge1, const Vector4f &x)   // Smooth step per component: t = clamp((x - edge0) / (edge1 - edge0), 0, 1); dst = t * t * (3 - 2 * t)
+	{
+		Float4 tx = Min(Max((x.x - edge0.x) / (edge1.x - edge0.x), Float4(0.0f)), Float4(1.0f)); dst.x = tx * tx * (Float4(3.0f) - Float4(2.0f) * tx);
+		Float4 ty = Min(Max((x.y - edge0.y) / (edge1.y - edge0.y), Float4(0.0f)), Float4(1.0f)); dst.y = ty * ty * (Float4(3.0f) - Float4(2.0f) * ty);
+		Float4 tz = Min(Max((x.z - edge0.z) / (edge1.z - edge0.z), Float4(0.0f)), Float4(1.0f)); dst.z = tz * tz * (Float4(3.0f) - Float4(2.0f) * tz);
+		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
+	}
+
+	void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits)   // Converts 32-bit float bit patterns to 16-bit half-float bit patterns, per lane
+	{
+		static const uint32_t mask_sign = 0x80000000u;
+		static const uint32_t mask_round = ~0xfffu;
+		static const uint32_t c_f32infty = 255 << 23;
+		static const uint32_t c_magic = 15 << 23;
+		static const uint32_t c_nanbit = 0x200;
+		static const uint32_t c_infty_as_fp16 = 0x7c00;
+		static const uint32_t c_clamp = (31 << 23) - 0x1000;
+
+		UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits);   // Sign bit only
+		UInt4 absf = As<UInt4>(floatBits) ^ justsign;   // Input bits with the sign cleared
+		UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf);   // Lanes whose magnitude bits are below the infinity pattern, i.e. finite inputs
+
+		// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
+		//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
+		UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)),
+		                                 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) |
+		               ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) |
+		               UInt4(c_infty_as_fp16)));   // Non-finite lanes get the half infinity pattern, plus c_nanbit when above the float infinity pattern (NaN)
+
+		dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16));   // Either OR the half into the upper 16 bits of the existing dst, or overwrite dst with it in the lower 16 bits
+	}
+
+	void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits)   // Expands 16-bit half-float bit patterns, held in the low bits of halfBits, to 32-bit float bit patterns
+	{
+		static const uint32_t mask_nosign = 0x7FFF;
+		static const uint32_t magic = (254 - 15) << 23;
+		static const uint32_t was_infnan = 0x7BFF;
+		static const uint32_t exp_infnan = 255 << 23;
+
+		UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign);   // Exponent and mantissa, without the sign bit
+		dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
+		                 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) |
+		                 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan)));   // Rescaled magnitude | bits outside exponent/mantissa moved up by 16 | all-ones exponent for half inf/NaN inputs
+	}
+
+	void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0)   // Packs s0.x and s0.y as two half floats into d.x; only d.x is written.
+	{
+		// half2 | half1
+		floatToHalfBits(d.x, s0.x, false);   // Overwrites d.x with the half of s0.x in the lower bits.
+		floatToHalfBits(d.x, s0.y, true);   // ORs the half of s0.y into the upper 16 bits.
+	}
+
+	void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0)   // Expands the two 16-bit halves of s0.x into dst.x (low) and dst.y (high).
+	{
+		// half2 | half1
+		halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF)));   // Low 16 bits.
+		halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16));   // High 16 bits, moved down first.
+	}
+
+	void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)   // Packs s0.x (low 16 bits) and s0.y (high 16 bits) into d.x; only d.x is written.
+	{
+		// round(clamp(c, -1.0, 1.0) * 32767.0)
+		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) |
+		                ((Int4(Round(Min(Max(s0.y, Float4(-1.0f)), Float4(1.0f)) * Float4(32767.0f))) & Int4(0xFFFF)) << 16));
+	}
+
+	void ShaderCore::packUnorm2x16(Vector4f &d, const Vector4f &s0)   // Packs s0.x (low 16 bits) and s0.y (high 16 bits) into d.x; only d.x is written.
+	{
+		// round(clamp(c, 0.0, 1.0) * 65535.0)
+		d.x = As<Float4>((Int4(Round(Min(Max(s0.x, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) |
+		                ((Int4(Round(Min(Max(s0.y, Float4(0.0f)), Float4(1.0f)) * Float4(65535.0f))) & Int4(0xFFFF)) << 16));
+	}
+
+	void ShaderCore::unpackSnorm2x16(Vector4f &dst, const Vector4f &s0)   // Unpacks the low/high 16 bits of s0.x into dst.x/dst.y.
+	{
+		// clamp(f / 32767.0, -1.0, 1.0), evaluated with the 16-bit value held in the upper half of a signed 32-bit integer
+		dst.x = Min(Max(Float4(As<Int4>((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16)) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
+		dst.y = Min(Max(Float4(As<Int4>(As<UInt4>(s0.x) & UInt4(0xFFFF0000))) * Float4(1.0f / float(0x7FFF0000)), Float4(-1.0f)), Float4(1.0f));
+	}
+
+	void ShaderCore::unpackUnorm2x16(Vector4f &dst, const Vector4f &s0)   // Unpacks the low/high 16 bits of s0.x into dst.x/dst.y.
+	{
+		// f / 65535.0, evaluated with the 16-bit value held in the upper half of an unsigned 32-bit integer
+		dst.x = Float4((As<UInt4>(s0.x) & UInt4(0x0000FFFF)) << 16) * Float4(1.0f / float(0xFFFF0000));
+		dst.y = Float4(As<UInt4>(s0.x) & UInt4(0xFFFF0000)) * Float4(1.0f / float(0xFFFF0000));
+	}
+
+	void ShaderCore::det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // 2x2 determinant from the x/y components of src0 and src1.
+	{
+		dst.x = src0.x * src1.y - src0.y * src1.x;
+		dst.y = dst.z = dst.w = dst.x;   // The result is replicated to all four components.
+	}
+
+	void ShaderCore::det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)   // Evaluated as dp3(crs(src1, src2), src0).
+	{
+		crs(dst, src1, src2);   // dst holds the intermediate cross product, so src0 is read only after dst has been written.
+		dp3(dst, dst, src0);
+	}
+
+	void ShaderCore::det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3)   // 4x4 determinant of src0..src3, replicated to all components.
+	{
+		dst.x = src2.z * src3.w - src2.w * src3.z;   // dst.x, dst.y and dst.z first serve as scratch for three z/w
+		dst.y = src1.w * src3.z - src1.z * src3.w;   // sub-determinants that the first two terms below share; the sources
+		dst.z = src1.z * src2.w - src1.w * src2.z;   // are read again afterwards, so dst must not alias any of them.
+		dst.x = src0.x * (src1.y * dst.x + src2.y * dst.y + src3.y * dst.z) -
+		        src0.y * (src1.x * dst.x + src2.x * dst.y + src3.x * dst.z) +
+		        src0.z * (src1.x * (src2.y * src3.w - src2.w * src3.y) +
+		                  src2.x * (src1.w * src3.y - src1.y * src3.w) +
+		                  src3.x * (src1.y * src2.w - src1.w * src2.y)) +
+		        src0.w * (src1.x * (src2.z * src3.y - src2.y * src3.z) +
+		                  src2.x * (src1.y * src3.z - src1.z * src3.y) +
+		                  src3.x * (src1.z * src2.y - src1.y * src2.z));
+		dst.y = dst.z = dst.w = dst.x;   // Replicate the final value, overwriting the scratch values.
+	}
+
+	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)   // Component-wise Frac(): applied independently to x, y, z and w.
+	{
+		dst.x = Frac(src.x);
+		dst.y = Frac(src.y);
+		dst.z = Frac(src.z);
+		dst.w = Frac(src.w);
+	}
+
+	void ShaderCore::trunc(Vector4f &dst, const Vector4f &src)   // Component-wise Trunc(): applied independently to x, y, z and w.
+	{
+		dst.x = Trunc(src.x);
+		dst.y = Trunc(src.y);
+		dst.z = Trunc(src.z);
+		dst.w = Trunc(src.w);
+	}
+
+	void ShaderCore::floor(Vector4f &dst, const Vector4f &src)   // Component-wise Floor(): applied independently to x, y, z and w.
+	{
+		dst.x = Floor(src.x);
+		dst.y = Floor(src.y);
+		dst.z = Floor(src.z);
+		dst.w = Floor(src.w);
+	}
+
+	void ShaderCore::round(Vector4f &dst, const Vector4f &src)   // Component-wise Round(): applied independently to x, y, z and w.
+	{
+		dst.x = Round(src.x);
+		dst.y = Round(src.y);
+		dst.z = Round(src.z);
+		dst.w = Round(src.w);
+	}
+
+	void ShaderCore::roundEven(Vector4f &dst, const Vector4f &src)   // round() followed by a per-component +1/-1/0 correction for ties; reads src after writing dst.
+	{
+		// dst = round(src) + ((round(src) < src) * 2 - 1) * (fract(src) == 0.5) * isOdd(round(src));
+		// ex.: 1.5:  2 + (0 * 2 - 1) * 1 * 0 = 2
+		//      2.5:  3 + (0 * 2 - 1) * 1 * 1 = 2
+		//     -1.5: -2 + (1 * 2 - 1) * 1 * 0 = -2
+		//     -2.5: -3 + (1 * 2 - 1) * 1 * 1 = -2
+		// Even if the round implementation rounds the other way:
+		//      1.5:  1 + (1 * 2 - 1) * 1 * 1 = 2
+		//      2.5:  2 + (1 * 2 - 1) * 1 * 0 = 2
+		//     -1.5: -1 + (0 * 2 - 1) * 1 * 1 = -2
+		//     -2.5: -2 + (0 * 2 - 1) * 1 * 0 = -2
+		round(dst, src);
+		dst.x += ((Float4(CmpLT(dst.x, src.x) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.x), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.x) & Int4(1));
+		dst.y += ((Float4(CmpLT(dst.y, src.y) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.y), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.y) & Int4(1));
+		dst.z += ((Float4(CmpLT(dst.z, src.z) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.z), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.z) & Int4(1));
+		dst.w += ((Float4(CmpLT(dst.w, src.w) & Int4(1)) * Float4(2.0f)) - Float4(1.0f)) * Float4(CmpEQ(Frac(src.w), Float4(0.5f)) & Int4(1)) * Float4(Int4(dst.w) & Int4(1));
+	}
+
+	void ShaderCore::ceil(Vector4f &dst, const Vector4f &src)   // Component-wise Ceil(): applied independently to x, y, z and w.
+	{
+		dst.x = Ceil(src.x);
+		dst.y = Ceil(src.y);
+		dst.z = Ceil(src.z);
+		dst.w = Ceil(src.w);
+	}
+
+	void ShaderCore::powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)   // Scalar form: power() of the x components only.
+	{
+		Float4 result = power(src0.x, src1.x, pp);
+		// The single result is broadcast to every component of dst.
+		dst.x = result;
+		dst.y = result;
+		dst.z = result;
+		dst.w = result;
+	}
+
+	void ShaderCore::pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)   // Component-wise power(src0.c, src1.c, pp); pp is forwarded unchanged.
+	{
+		dst.x = power(src0.x, src1.x, pp);
+		dst.y = power(src0.y, src1.y, pp);
+		dst.z = power(src0.z, src1.z, pp);
+		dst.w = power(src0.w, src1.w, pp);
+	}
+
+	void ShaderCore::crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Cross product of the xyz parts; dst.w is left untouched.
+	{
+		dst.x = src0.y * src1.z - src0.z * src1.y;   // dst.x is written before src0.x / src1.x are read below,
+		dst.y = src0.z * src1.x - src0.x * src1.z;   // so dst must not be the same object as src0 or src1.
+		dst.z = src0.x * src1.y - src0.y * src1.x;
+	}
+
+	void ShaderCore::forward1(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)   // One-component variant: writes dst.x only.
+	{
+		Int4 signFlip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000);
+		// The sign bit of N is toggled in the lanes where Nref.x * I.x is not less than zero.
+		dst.x = As<Float4>(signFlip ^ As<Int4>(N.x));
+	}
+
+	void ShaderCore::forward2(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)   // Two-component variant: writes dst.x and dst.y.
+	{
+		Int4 signFlip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000);
+		// The sign bit of N is toggled in the lanes where dot2(Nref, I) is not less than zero.
+		dst.x = As<Float4>(signFlip ^ As<Int4>(N.x));
+		dst.y = As<Float4>(signFlip ^ As<Int4>(N.y));
+	}
+
+	void ShaderCore::forward3(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)   // Three-component variant: writes dst.x, dst.y and dst.z.
+	{
+		Int4 signFlip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000);
+		// The sign bit of N is toggled in the lanes where dot3(Nref, I) is not less than zero.
+		dst.x = As<Float4>(signFlip ^ As<Int4>(N.x));
+		dst.y = As<Float4>(signFlip ^ As<Int4>(N.y));
+		dst.z = As<Float4>(signFlip ^ As<Int4>(N.z));
+	}
+
+	void ShaderCore::forward4(Vector4f &dst, const Vector4f &N, const Vector4f &I, const Vector4f &Nref)   // Four-component variant: writes all of dst.
+	{
+		Int4 signFlip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000);
+		// The sign bit of N is toggled in the lanes where dot4(Nref, I) is not less than zero.
+		dst.x = As<Float4>(signFlip ^ As<Int4>(N.x));
+		dst.y = As<Float4>(signFlip ^ As<Int4>(N.y));
+		dst.z = As<Float4>(signFlip ^ As<Int4>(N.z));
+		dst.w = As<Float4>(signFlip ^ As<Int4>(N.w));
+	}
+
+	void ShaderCore::reflect1(Vector4f &dst, const Vector4f &I, const Vector4f &N)   // I - 2 * (N . I) * N for one component; writes dst.x only.
+	{
+		Float4 nDotI = N.x * I.x;
+		// The product is taken before dst is written, so it reflects the incoming values.
+		dst.x = I.x - Float4(2.0f) * nDotI * N.x;
+	}
+
+	void ShaderCore::reflect2(Vector4f &dst, const Vector4f &I, const Vector4f &N)   // I - 2 * dot2(N, I) * N; writes dst.x and dst.y.
+	{
+		Float4 nDotI = dot2(N, I);
+		// The dot product is taken once, before any component of dst is written.
+		dst.x = I.x - Float4(2.0f) * nDotI * N.x;
+		dst.y = I.y - Float4(2.0f) * nDotI * N.y;
+	}
+
+	void ShaderCore::reflect3(Vector4f &dst, const Vector4f &I, const Vector4f &N)   // I - 2 * dot3(N, I) * N; writes dst.x, dst.y and dst.z.
+	{
+		Float4 nDotI = dot3(N, I);
+		// The dot product is taken once, before any component of dst is written.
+		dst.x = I.x - Float4(2.0f) * nDotI * N.x;
+		dst.y = I.y - Float4(2.0f) * nDotI * N.y;
+		dst.z = I.z - Float4(2.0f) * nDotI * N.z;
+	}
+
+	void ShaderCore::reflect4(Vector4f &dst, const Vector4f &I, const Vector4f &N)   // I - 2 * dot4(N, I) * N; writes all of dst.
+	{
+		Float4 nDotI = dot4(N, I);
+		// The dot product is taken once, before any component of dst is written.
+		dst.x = I.x - Float4(2.0f) * nDotI * N.x;
+		dst.y = I.y - Float4(2.0f) * nDotI * N.y;
+		dst.z = I.z - Float4(2.0f) * nDotI * N.z;
+		dst.w = I.w - Float4(2.0f) * nDotI * N.w;
+	}
+
+	void ShaderCore::refract1(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)   // One-component variant: writes dst.x only.
+	{
+		Float4 nDotI = N.x * I.x;
+		Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - nDotI * nDotI);
+		Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+		Float4 normalScale = (eta * nDotI + Sqrt(discriminant));
+		// Lanes whose discriminant is less than zero are masked to all-zero bits.
+		dst.x = As<Float4>(keep & As<Int4>(eta * I.x - normalScale * N.x));
+	}
+
+	void ShaderCore::refract2(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)   // Two-component variant: writes dst.x and dst.y.
+	{
+		Float4 nDotI = dot2(N, I);
+		Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - nDotI * nDotI);
+		Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+		Float4 normalScale = (eta * nDotI + Sqrt(discriminant));
+		// Lanes whose discriminant is less than zero are masked to all-zero bits.
+		dst.x = As<Float4>(keep & As<Int4>(eta * I.x - normalScale * N.x));
+		dst.y = As<Float4>(keep & As<Int4>(eta * I.y - normalScale * N.y));
+	}
+
+	void ShaderCore::refract3(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)   // Three-component variant: writes dst.x, dst.y and dst.z.
+	{
+		Float4 nDotI = dot3(N, I);
+		Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - nDotI * nDotI);
+		Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+		Float4 normalScale = (eta * nDotI + Sqrt(discriminant));
+		// Lanes whose discriminant is less than zero are masked to all-zero bits.
+		dst.x = As<Float4>(keep & As<Int4>(eta * I.x - normalScale * N.x));
+		dst.y = As<Float4>(keep & As<Int4>(eta * I.y - normalScale * N.y));
+		dst.z = As<Float4>(keep & As<Int4>(eta * I.z - normalScale * N.z));
+	}
+
+	void ShaderCore::refract4(Vector4f &dst, const Vector4f &I, const Vector4f &N, const Float4 &eta)   // Four-component variant: writes all of dst.
+	{
+		Float4 nDotI = dot4(N, I);
+		Float4 discriminant = Float4(1.0f) - eta * eta * (Float4(1.0f) - nDotI * nDotI);
+		Int4 keep = CmpNLT(discriminant, Float4(0.0f));
+		Float4 normalScale = (eta * nDotI + Sqrt(discriminant));
+		// Lanes whose discriminant is less than zero are masked to all-zero bits.
+		dst.x = As<Float4>(keep & As<Int4>(eta * I.x - normalScale * N.x));
+		dst.y = As<Float4>(keep & As<Int4>(eta * I.y - normalScale * N.y));
+		dst.z = As<Float4>(keep & As<Int4>(eta * I.z - normalScale * N.z));
+		dst.w = As<Float4>(keep & As<Int4>(eta * I.w - normalScale * N.w));
+	}
+
+	void ShaderCore::sgn(Vector4f &dst, const Vector4f &src)   // Applies the Float4 overload of sgn() to each of x, y, z and w.
+	{
+		sgn(dst.x, src.x);
+		sgn(dst.y, src.y);
+		sgn(dst.z, src.z);
+		sgn(dst.w, src.w);
+	}
+
+	void ShaderCore::isgn(Vector4f &dst, const Vector4f &src)   // Applies the Float4 overload of isgn() to each of x, y, z and w.
+	{
+		isgn(dst.x, src.x);
+		isgn(dst.y, src.y);
+		isgn(dst.z, src.z);
+		isgn(dst.w, src.w);
+	}
+
+	void ShaderCore::abs(Vector4f &dst, const Vector4f &src)   // Component-wise floating-point Abs().
+	{
+		dst.x = Abs(src.x);
+		dst.y = Abs(src.y);
+		dst.z = Abs(src.z);
+		dst.w = Abs(src.w);
+	}
+
+	void ShaderCore::iabs(Vector4f &dst, const Vector4f &src)   // Component-wise integer Abs(): the bits are reinterpreted as Int4 and back, not converted.
+	{
+		dst.x = As<Float4>(Abs(As<Int4>(src.x)));
+		dst.y = As<Float4>(Abs(As<Int4>(src.y)));
+		dst.z = As<Float4>(Abs(As<Int4>(src.z)));
+		dst.w = As<Float4>(Abs(As<Int4>(src.w)));
+	}
+
+	void ShaderCore::nrm2(Vector4f &dst, const Vector4f &src, bool pp)   // Scales src by reciprocalSquareRoot(dot2(src, src)).
+	{
+		Float4 lengthSquared = dot2(src, src);
+		Float4 scale = reciprocalSquareRoot(lengthSquared, false, pp);
+		// All four components are scaled, including those that dot2() is not named for.
+		dst.x = src.x * scale;
+		dst.y = src.y * scale;
+		dst.z = src.z * scale;
+		dst.w = src.w * scale;
+	}
+
+	void ShaderCore::nrm3(Vector4f &dst, const Vector4f &src, bool pp)   // Scales src by reciprocalSquareRoot(dot3(src, src)).
+	{
+		Float4 lengthSquared = dot3(src, src);
+		Float4 scale = reciprocalSquareRoot(lengthSquared, false, pp);
+		// All four components are scaled, including w.
+		dst.x = src.x * scale;
+		dst.y = src.y * scale;
+		dst.z = src.z * scale;
+		dst.w = src.w * scale;
+	}
+
+	void ShaderCore::nrm4(Vector4f &dst, const Vector4f &src, bool pp)   // Scales src by reciprocalSquareRoot(dot4(src, src)).
+	{
+		Float4 lengthSquared = dot4(src, src);
+		Float4 scale = reciprocalSquareRoot(lengthSquared, false, pp);
+		// The scale is computed once, before any component of dst is written.
+		dst.x = src.x * scale;
+		dst.y = src.y * scale;
+		dst.z = src.z * scale;
+		dst.w = src.w * scale;
+	}
+
+	void ShaderCore::sincos(Vector4f &dst, const Vector4f &src, bool pp)   // Reads src.x only; dst.z and dst.w are not written.
+	{
+		dst.x = cosine_pi(src.x, pp);   // Cosine goes to x ...
+		dst.y = sine_pi(src.x, pp);   // ... and sine to y. The *_pi helpers are declared as limited to [-pi, pi].
+	}
+
+	void ShaderCore::cos(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise cosine(); pp is forwarded unchanged.
+	{
+		dst.x = cosine(src.x, pp);
+		dst.y = cosine(src.y, pp);
+		dst.z = cosine(src.z, pp);
+		dst.w = cosine(src.w, pp);
+	}
+
+	void ShaderCore::sin(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise sine(); pp is forwarded unchanged.
+	{
+		dst.x = sine(src.x, pp);
+		dst.y = sine(src.y, pp);
+		dst.z = sine(src.z, pp);
+		dst.w = sine(src.w, pp);
+	}
+
+	void ShaderCore::tan(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise tangent(); pp is forwarded unchanged.
+	{
+		dst.x = tangent(src.x, pp);
+		dst.y = tangent(src.y, pp);
+		dst.z = tangent(src.z, pp);
+		dst.w = tangent(src.w, pp);
+	}
+
+	void ShaderCore::acos(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise arccos(); pp is forwarded unchanged.
+	{
+		dst.x = arccos(src.x, pp);
+		dst.y = arccos(src.y, pp);
+		dst.z = arccos(src.z, pp);
+		dst.w = arccos(src.w, pp);
+	}
+
+	void ShaderCore::asin(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise arcsin(); pp is forwarded unchanged.
+	{
+		dst.x = arcsin(src.x, pp);
+		dst.y = arcsin(src.y, pp);
+		dst.z = arcsin(src.z, pp);
+		dst.w = arcsin(src.w, pp);
+	}
+
+	void ShaderCore::atan(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise single-argument arctan(); pp is forwarded unchanged.
+	{
+		dst.x = arctan(src.x, pp);
+		dst.y = arctan(src.y, pp);
+		dst.z = arctan(src.z, pp);
+		dst.w = arctan(src.w, pp);
+	}
+
+	void ShaderCore::atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp)   // Component-wise two-argument arctan(y, x), with src0 passed as y and src1 as x.
+	{
+		dst.x = arctan(src0.x, src1.x, pp);
+		dst.y = arctan(src0.y, src1.y, pp);
+		dst.z = arctan(src0.z, src1.z, pp);
+		dst.w = arctan(src0.w, src1.w, pp);
+	}
+
+	void ShaderCore::cosh(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise cosineh(); pp is forwarded unchanged.
+	{
+		dst.x = cosineh(src.x, pp);
+		dst.y = cosineh(src.y, pp);
+		dst.z = cosineh(src.z, pp);
+		dst.w = cosineh(src.w, pp);
+	}
+
+	void ShaderCore::sinh(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise sineh(); pp is forwarded unchanged.
+	{
+		dst.x = sineh(src.x, pp);
+		dst.y = sineh(src.y, pp);
+		dst.z = sineh(src.z, pp);
+		dst.w = sineh(src.w, pp);
+	}
+
+	void ShaderCore::tanh(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise tangenth(); pp is forwarded unchanged.
+	{
+		dst.x = tangenth(src.x, pp);
+		dst.y = tangenth(src.y, pp);
+		dst.z = tangenth(src.z, pp);
+		dst.w = tangenth(src.w, pp);
+	}
+
+	void ShaderCore::acosh(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise arccosh(), which is declared as limited to x >= 1.
+	{
+		dst.x = arccosh(src.x, pp);
+		dst.y = arccosh(src.y, pp);
+		dst.z = arccosh(src.z, pp);
+		dst.w = arccosh(src.w, pp);
+	}
+
+	void ShaderCore::asinh(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise arcsinh(); pp is forwarded unchanged.
+	{
+		dst.x = arcsinh(src.x, pp);
+		dst.y = arcsinh(src.y, pp);
+		dst.z = arcsinh(src.z, pp);
+		dst.w = arcsinh(src.w, pp);
+	}
+
+	void ShaderCore::atanh(Vector4f &dst, const Vector4f &src, bool pp)   // Component-wise arctanh(), which is declared as limited to the ]-1, 1[ range.
+	{
+		dst.x = arctanh(src.x, pp);
+		dst.y = arctanh(src.y, pp);
+		dst.z = arctanh(src.z, pp);
+		dst.w = arctanh(src.w, pp);
+	}
+
+	void ShaderCore::expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)   // Base-2 exponential whose result layout depends on shaderModel.
+	{
+		if(shaderModel >= 0x0200)
+		{
+			exp2x(dst, src, true);   // FIXME: 10-bit precision suffices
+		}
+		else   // Version < 2.0: every result is derived from src.x
+		{
+			Float4 fraction = Frac(src.x);
+			Float4 whole = src.x - fraction;
+
+			dst.x = exponential2(whole, true);
+			dst.y = fraction;
+			dst.z = exponential2(src.x, true);
+			dst.w = Float4(1.0f);
+		}
+	}
+
+	void ShaderCore::logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel)
+	{
+		if(shaderModel < 0x0200)
+		{
+			// Version < 2.0: every result is derived from src.x, and the four
+			// components of dst each receive a different quantity.
+			Float4 magnitude = Abs(src.x);
+
+			// X component: the exponent field of |src.x| with the bias of 127
+			// removed, converted to floating point.
+			Int4 exponent = As<Int4>(As<UInt4>(magnitude) >> 23) - Int4(127);
+			dst.x = Float4(exponent);
+
+			// Y component: the 23 mantissa bits of |src.x| combined with the
+			// bit pattern of 1.0f, which supplies the exponent field.
+			dst.y = As<Float4>((As<Int4>(magnitude) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
+
+			// Z component: logarithm2() with both its abs and pp flags set.
+			dst.z = logarithm2(src.x, true, true);
+
+			// W component: constant one, spelled the same way as in expp().
+			dst.w = Float4(1.0f);
+			// No other temporaries are needed.
+		}
+		else   // Version >= 2.0
+		{
+			log2x(dst, src, true);
+		}
+	}
+
+
+	void ShaderCore::cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)   // Applies the Float4 overload of cmp0() to each of x, y, z and w.
+	{
+		cmp0(dst.x, src0.x, src1.x, src2.x);
+		cmp0(dst.y, src0.y, src1.y, src2.y);
+		cmp0(dst.z, src0.z, src1.z, src2.z);
+		cmp0(dst.w, src0.w, src1.w, src2.w);
+	}
+
+	void ShaderCore::select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)   // Per-component bit select; the bits of src0 are used as the mask.
+	{
+		select(dst.x, As<Int4>(src0.x), src1.x, src2.x);
+		select(dst.y, As<Int4>(src0.y), src1.y, src2.y);
+		select(dst.z, As<Int4>(src0.z), src1.z, src2.z);
+		select(dst.w, As<Int4>(src0.w), src1.w, src2.w);
+	}
+
+	void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)   // Per lane, picks the component of src0 indexed by the integer bits of src1.
+	{
+		select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x);   // Index 1 gives y; any index other than 1, 2 or 3 ends up with x.
+		select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst);   // Index 2 gives z, otherwise the previous choice is kept.
+		select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst);   // Index 3 gives w, otherwise the previous choice is kept.
+	}
+
+	void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)   // Copies src to dst, replacing the component whose number equals the integer bits of index with element.
+	{
+		select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x);
+		select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y);
+		select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z);
+		select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w);
+	}
+
+	void ShaderCore::sgn(Float4 &dst, const Float4 &src)   // Lanes below zero get -1.0f, lanes above zero get 1.0f, all others get zero bits.
+	{
+		Int4 belowZero = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f));
+		Int4 aboveZero = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f));
+		dst = As<Float4>(belowZero | aboveZero);
+	}
+
+	void ShaderCore::isgn(Float4 &dst, const Float4 &src)   // Integer sign of the bits of src: -1, 1 or 0, stored as integer bits in dst.
+	{
+		Int4 belowZero = CmpLT(As<Int4>(src), Int4(0)) & Int4(-1);
+		Int4 aboveZero = CmpNLE(As<Int4>(src), Int4(0)) & Int4(1);
+		dst = As<Float4>(belowZero | aboveZero);
+	}
+
+	void ShaderCore::cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)   // Per lane: src1 where 0 <= src0, src2 elsewhere.
+	{
+		Int4 nonNegative = CmpLE(Float4(0.0f), src0);
+		select(dst, nonNegative, src1, src2);
+	}
+
+	void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)   // Per lane: src1 where the integer bits of src0 equal 0, src2 elsewhere.
+	{
+		Int4 isZero = CmpEQ(Int4(0), As<Int4>(src0));
+		select(dst, isZero, src1, src2);
+	}
+
+	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)   // Bitwise blend: bits set in src0 take src1, cleared bits take src2.
+	{
+		// FIXME: LLVM vector select
+		dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2)));
+	}
+
+	void ShaderCore::cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)   // Component-wise floating-point comparison; the comparison result bits are stored in dst.
+	{
+		switch(control)
+		{
+		case Shader::CONTROL_GT:   // Expressed as "not less-or-equal". NOTE(review): may differ from "greater than" for NaN operands - confirm against Reactor.
+			dst.x = As<Float4>(CmpNLE(src0.x, src1.x));
+			dst.y = As<Float4>(CmpNLE(src0.y, src1.y));
+			dst.z = As<Float4>(CmpNLE(src0.z, src1.z));
+			dst.w = As<Float4>(CmpNLE(src0.w, src1.w));
+			break;
+		case Shader::CONTROL_EQ:
+			dst.x = As<Float4>(CmpEQ(src0.x, src1.x));
+			dst.y = As<Float4>(CmpEQ(src0.y, src1.y));
+			dst.z = As<Float4>(CmpEQ(src0.z, src1.z));
+			dst.w = As<Float4>(CmpEQ(src0.w, src1.w));
+			break;
+		case Shader::CONTROL_GE:   // Expressed as "not less-than".
+			dst.x = As<Float4>(CmpNLT(src0.x, src1.x));
+			dst.y = As<Float4>(CmpNLT(src0.y, src1.y));
+			dst.z = As<Float4>(CmpNLT(src0.z, src1.z));
+			dst.w = As<Float4>(CmpNLT(src0.w, src1.w));
+			break;
+		case Shader::CONTROL_LT:
+			dst.x = As<Float4>(CmpLT(src0.x, src1.x));
+			dst.y = As<Float4>(CmpLT(src0.y, src1.y));
+			dst.z = As<Float4>(CmpLT(src0.z, src1.z));
+			dst.w = As<Float4>(CmpLT(src0.w, src1.w));
+			break;
+		case Shader::CONTROL_NE:
+			dst.x = As<Float4>(CmpNEQ(src0.x, src1.x));
+			dst.y = As<Float4>(CmpNEQ(src0.y, src1.y));
+			dst.z = As<Float4>(CmpNEQ(src0.z, src1.z));
+			dst.w = As<Float4>(CmpNEQ(src0.w, src1.w));
+			break;
+		case Shader::CONTROL_LE:
+			dst.x = As<Float4>(CmpLE(src0.x, src1.x));
+			dst.y = As<Float4>(CmpLE(src0.y, src1.y));
+			dst.z = As<Float4>(CmpLE(src0.z, src1.z));
+			dst.w = As<Float4>(CmpLE(src0.w, src1.w));
+			break;
+		default:   // Any other control value is a programming error; dst is left unwritten.
+			ASSERT(false);
+		}
+	}
+
+	void ShaderCore::icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)   // Component-wise comparison of the operand bits reinterpreted as Int4; the result bits are stored in dst.
+	{
+		switch(control)
+		{
+		case Shader::CONTROL_GT:   // Expressed as "not less-or-equal".
+			dst.x = As<Float4>(CmpNLE(As<Int4>(src0.x), As<Int4>(src1.x)));
+			dst.y = As<Float4>(CmpNLE(As<Int4>(src0.y), As<Int4>(src1.y)));
+			dst.z = As<Float4>(CmpNLE(As<Int4>(src0.z), As<Int4>(src1.z)));
+			dst.w = As<Float4>(CmpNLE(As<Int4>(src0.w), As<Int4>(src1.w)));
+			break;
+		case Shader::CONTROL_EQ:
+			dst.x = As<Float4>(CmpEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
+			dst.y = As<Float4>(CmpEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
+			dst.z = As<Float4>(CmpEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
+			dst.w = As<Float4>(CmpEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
+			break;
+		case Shader::CONTROL_GE:   // Expressed as "not less-than".
+			dst.x = As<Float4>(CmpNLT(As<Int4>(src0.x), As<Int4>(src1.x)));
+			dst.y = As<Float4>(CmpNLT(As<Int4>(src0.y), As<Int4>(src1.y)));
+			dst.z = As<Float4>(CmpNLT(As<Int4>(src0.z), As<Int4>(src1.z)));
+			dst.w = As<Float4>(CmpNLT(As<Int4>(src0.w), As<Int4>(src1.w)));
+			break;
+		case Shader::CONTROL_LT:
+			dst.x = As<Float4>(CmpLT(As<Int4>(src0.x), As<Int4>(src1.x)));
+			dst.y = As<Float4>(CmpLT(As<Int4>(src0.y), As<Int4>(src1.y)));
+			dst.z = As<Float4>(CmpLT(As<Int4>(src0.z), As<Int4>(src1.z)));
+			dst.w = As<Float4>(CmpLT(As<Int4>(src0.w), As<Int4>(src1.w)));
+			break;
+		case Shader::CONTROL_NE:
+			dst.x = As<Float4>(CmpNEQ(As<Int4>(src0.x), As<Int4>(src1.x)));
+			dst.y = As<Float4>(CmpNEQ(As<Int4>(src0.y), As<Int4>(src1.y)));
+			dst.z = As<Float4>(CmpNEQ(As<Int4>(src0.z), As<Int4>(src1.z)));
+			dst.w = As<Float4>(CmpNEQ(As<Int4>(src0.w), As<Int4>(src1.w)));
+			break;
+		case Shader::CONTROL_LE:
+			dst.x = As<Float4>(CmpLE(As<Int4>(src0.x), As<Int4>(src1.x)));
+			dst.y = As<Float4>(CmpLE(As<Int4>(src0.y), As<Int4>(src1.y)));
+			dst.z = As<Float4>(CmpLE(As<Int4>(src0.z), As<Int4>(src1.z)));
+			dst.w = As<Float4>(CmpLE(As<Int4>(src0.w), As<Int4>(src1.w)));
+			break;
+		default:   // Any other control value is a programming error; dst is left unwritten.
+			ASSERT(false);
+		}
+	}
+
+	void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)   // Component-wise comparison of the operand bits reinterpreted as UInt4; the result bits are stored in dst.
+	{
+		switch(control)
+		{
+		case Shader::CONTROL_GT:   // Expressed as "not less-or-equal".
+			dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_EQ:
+			dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_GE:   // Expressed as "not less-than".
+			dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_LT:
+			dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_NE:
+			dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_LE:
+			dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		default:   // Any other control value is a programming error; dst is left unwritten.
+			ASSERT(false);
+		}
+	}
+
+	void ShaderCore::all(Float4 &dst, const Vector4f &src)   // Bitwise AND of the four components of src, lane by lane.
+	{
+		dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
+	}
+
+	void ShaderCore::any(Float4 &dst, const Vector4f &src)   // Bitwise OR of the four components of src, lane by lane.
+	{
+		dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w));
+	}
+
+	void ShaderCore::bitwise_not(Vector4f &dst, const Vector4f &src)   // Component-wise bit inversion, done as XOR with all ones.
+	{
+		dst.x = As<Float4>(As<Int4>(src.x) ^ Int4(0xFFFFFFFF));
+		dst.y = As<Float4>(As<Int4>(src.y) ^ Int4(0xFFFFFFFF));
+		dst.z = As<Float4>(As<Int4>(src.z) ^ Int4(0xFFFFFFFF));
+		dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF));
+	}
+
+	void ShaderCore::bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Component-wise bitwise OR of the raw bits.
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w));
+	}
+
+	void ShaderCore::bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Component-wise bitwise XOR of the raw bits.
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
+	}
+
+	void ShaderCore::bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Component-wise bitwise AND of the raw bits.
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
+	}
+
+	void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Whole-vector equality: AND of the four per-component results.
+	{
+		dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &   // Components are compared as UInt4, i.e. by bit pattern
+		                   CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) &   // rather than by floating-point value.
+		                   CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
+		                   CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+		dst.y = dst.x;   // The combined result is replicated to the other components.
+		dst.z = dst.x;
+		dst.w = dst.x;
+	}
+
+	void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)   // Whole-vector inequality: OR of the four per-component results.
+	{
+		dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) |   // Components are compared as UInt4, i.e. by bit pattern
+		                   CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) |   // rather than by floating-point value.
+		                   CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) |
+		                   CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+		dst.y = dst.x;   // The combined result is replicated to the other components.
+		dst.z = dst.x;
+		dst.w = dst.x;
+	}
+}
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
new file mode 100644
index 0000000..4dc109f
--- /dev/null
+++ b/src/Pipeline/ShaderCore.hpp
@@ -0,0 +1,382 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_ShaderCore_hpp
+#define sw_ShaderCore_hpp
+
+#include "Shader.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	class Vector4s   // Four named Short4 values (x, y, z, w).
+	{
+	public:
+		Vector4s();
+		Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);   // One scalar per component; defined outside this header.
+		Vector4s(const Vector4s &rhs);
+
+		Short4 &operator[](int i);   // Non-const access to a component by number; defined outside this header.
+		Vector4s &operator=(const Vector4s &rhs);
+
+		Short4 x;
+		Short4 y;
+		Short4 z;
+		Short4 w;
+	};
+
+	class Vector4f   // Four named Float4 values (x, y, z, w).
+	{
+	public:
+		Vector4f();
+		Vector4f(float x, float y, float z, float w);   // One scalar per component; defined outside this header.
+		Vector4f(const Vector4f &rhs);
+
+		Float4 &operator[](int i);   // Non-const only, so it cannot be used on a const Vector4f.
+		Vector4f &operator=(const Vector4f &rhs);
+
+		Float4 x;
+		Float4 y;
+		Float4 z;
+		Float4 w;
+	};
+
+	Float4 exponential2(RValue<Float4> x, bool pp = false);
+	Float4 logarithm2(RValue<Float4> x, bool abs, bool pp = false);
+	Float4 exponential(RValue<Float4> x, bool pp = false);
+	Float4 logarithm(RValue<Float4> x, bool abs, bool pp = false);
+	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
+	Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
+	Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
+	Float4 modulo(RValue<Float4> x, RValue<Float4> y);
+	Float4 sine_pi(RValue<Float4> x, bool pp = false);     // limited to [-pi, pi] range
+	Float4 cosine_pi(RValue<Float4> x, bool pp = false);   // limited to [-pi, pi] range
+	Float4 sine(RValue<Float4> x, bool pp = false);
+	Float4 cosine(RValue<Float4> x, bool pp = false);
+	Float4 tangent(RValue<Float4> x, bool pp = false);
+	Float4 arccos(RValue<Float4> x, bool pp = false);
+	Float4 arcsin(RValue<Float4> x, bool pp = false);
+	Float4 arctan(RValue<Float4> x, bool pp = false);
+	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
+	Float4 sineh(RValue<Float4> x, bool pp = false);
+	Float4 cosineh(RValue<Float4> x, bool pp = false);
+	Float4 tangenth(RValue<Float4> x, bool pp = false);
+	Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
+	Float4 arcsinh(RValue<Float4> x, bool pp = false);
+	Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range
+
+	Float4 dot2(const Vector4f &v0, const Vector4f &v1);
+	Float4 dot3(const Vector4f &v0, const Vector4f &v1);
+	Float4 dot4(const Vector4f &v0, const Vector4f &v1);
+
+	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+	void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
+	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
+
+	class Register   // Four Reference<Float4> handles (x, y, z, w) that can be assigned from, and converted to, a Vector4f.
+	{
+	public:
+		Register(const Reference<Float4> &x, const Reference<Float4> &y, const Reference<Float4> &z, const Reference<Float4> &w) : x(x), y(y), z(z), w(w)
+		{
+		}
+
+		Reference<Float4> &operator[](int i)   // Component by number.
+		{
+			switch(i)
+			{
+			default:   // Any index outside 1..3 falls through to x.
+			case 0: return x;
+			case 1: return y;
+			case 2: return z;
+			case 3: return w;
+			}
+		}
+
+		Register &operator=(const Register &rhs)   // Assigns each of the four references from rhs.
+		{
+			x = rhs.x;
+			y = rhs.y;
+			z = rhs.z;
+			w = rhs.w;
+
+			return *this;
+		}
+
+		Register &operator=(const Vector4f &rhs)   // Assigns each of the four references from the components of rhs.
+		{
+			x = rhs.x;
+			y = rhs.y;
+			z = rhs.z;
+			w = rhs.w;
+
+			return *this;
+		}
+
+		operator Vector4f()   // Copies the four referenced values into a new Vector4f. Not const-qualified.
+		{
+			Vector4f v;
+
+			v.x = x;
+			v.y = y;
+			v.z = z;
+			v.w = w;
+
+			return v;
+		}
+
+		Reference<Float4> x;
+		Reference<Float4> y;
+		Reference<Float4> z;
+		Reference<Float4> w;
+	};
+
+	class RegisterFile   // Owns four heap-allocated Array<Float4> sets (x, y, z, w) in one of two layouts chosen at construction.
+	{
+	public:
+		RegisterFile(int size, bool indirectAddressable) : size(size), indirectAddressable(indirectAddressable)
+		{
+			if(indirectAddressable)   // One Array<Float4> of 'size' elements per component.
+			{
+				x = new Array<Float4>(size);
+				y = new Array<Float4>(size);
+				z = new Array<Float4>(size);
+				w = new Array<Float4>(size);
+			}
+			else   // 'size' default-constructed Array<Float4> objects per component.
+			{
+				x = new Array<Float4>[size];
+				y = new Array<Float4>[size];
+				z = new Array<Float4>[size];
+				w = new Array<Float4>[size];
+			}
+		}
+
+		~RegisterFile()   // Must mirror the constructor: delete for the single object, delete[] for the array.
+		{
+			if(indirectAddressable)
+			{
+				delete x;
+				delete y;
+				delete z;
+				delete w;
+			}
+			else
+			{
+				delete[] x;
+				delete[] y;
+				delete[] z;
+				delete[] w;
+			}
+		}
+		// NOTE(review): copy construction is not disabled here; a copy would share x/y/z/w and both destructors would delete them.
+		Register operator[](int i)   // Constant index: valid in both layouts. No range check against 'size' is made here.
+		{
+			if(indirectAddressable)
+			{
+				return Register(x[0][i], y[0][i], z[0][i], w[0][i]);   // Element i of the single array.
+			}
+			else
+			{
+				return Register(x[i][0], y[i][0], z[i][0], w[i][0]);   // Element 0 of the i-th array.
+			}
+		}
+
+		Register operator[](RValue<Int> i)   // Reactor (run-time) index: only valid for the indirectly addressable layout.
+		{
+			ASSERT(indirectAddressable);
+
+			return Register(x[0][i], y[0][i], z[0][i], w[0][i]);
+		}
+
+		const Vector4f operator[](RValue<Int4> i);   // Gather operation (read only).
+		// The scatter_* members take an Int4 index and a value for one component; they are defined outside this header.
+		void scatter_x(Int4 i, RValue<Float4> r);
+		void scatter_y(Int4 i, RValue<Float4> r);
+		void scatter_z(Int4 i, RValue<Float4> r);
+		void scatter_w(Int4 i, RValue<Float4> r);
+
+	protected:
+		const int size;
+		const bool indirectAddressable;
+		Array<Float4> *x;   // Each pointer refers either to one Array<Float4> or to an array of them,
+		Array<Float4> *y;   // depending on indirectAddressable (see the constructor).
+		Array<Float4> *z;
+		Array<Float4> *w;
+	};
+
+	// RegisterFile whose register count is fixed by the template argument S.
+	// The template argument I only supplies the default value for the
+	// constructor's indirectAddressable flag; callers may still override it.
+	template<int S, bool I = false>
+	class RegisterArray : public RegisterFile
+	{
+	public:
+		RegisterArray(bool indirectAddressable = I)
+			: RegisterFile(S, indirectAddressable)
+		{}
+	};
+
+	// Collection of member functions operating on Vector4f / Float4 values.
+	//
+	// Every public member takes a non-const destination reference ('dst') as
+	// its first parameter, followed by one or more const source operands.
+	// Only the declarations appear here; see the corresponding definitions
+	// for the exact semantics of each operation.
+	//
+	// NOTE(review): the 'pp' flag taken by several members presumably requests
+	// a partial (reduced) precision variant - confirm against the definitions.
+	class ShaderCore
+	{
+		typedef Shader::Control Control;
+
+	public:
+		void mov(Vector4f &dst, const Vector4f &src, bool integerDestination = false);
+		void neg(Vector4f &dst, const Vector4f &src);
+		void ineg(Vector4f &dst, const Vector4f &src);
+		void f2b(Vector4f &dst, const Vector4f &src);
+		void b2f(Vector4f &dst, const Vector4f &src);
+		void f2i(Vector4f &dst, const Vector4f &src);
+		void i2f(Vector4f &dst, const Vector4f &src);
+		void f2u(Vector4f &dst, const Vector4f &src);
+		void u2f(Vector4f &dst, const Vector4f &src);
+		void i2b(Vector4f &dst, const Vector4f &src);
+		void b2i(Vector4f &dst, const Vector4f &src);
+		void add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void rcpx(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void rsqx(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void sqrt(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void rsq(Vector4f &dst, const Vector4f &src, bool pp = false);
+		// The len* and dist* members write a single Float4 rather than a Vector4f.
+		void len2(Float4 &dst, const Vector4f &src, bool pp = false);
+		void len3(Float4 &dst, const Vector4f &src, bool pp = false);
+		void len4(Float4 &dst, const Vector4f &src, bool pp = false);
+		void dist1(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+		void dist2(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+		void dist3(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+		void dist4(Float4 &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+		void dp1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void dp2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void dp2add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void det2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void det3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void det4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2, const Vector4f &src3);
+		void min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void step(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void exp2x(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void exp2(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void exp(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void log2x(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void log2(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void log(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void lit(Vector4f &dst, const Vector4f &src);
+		void att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void isinf(Vector4f &dst, const Vector4f &src);
+		void isnan(Vector4f &dst, const Vector4f &src);
+		void smooth(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void packHalf2x16(Vector4f &dst, const Vector4f &src);
+		void unpackHalf2x16(Vector4f &dst, const Vector4f &src);
+		void packSnorm2x16(Vector4f &dst, const Vector4f &src);
+		void packUnorm2x16(Vector4f &dst, const Vector4f &src);
+		void unpackSnorm2x16(Vector4f &dst, const Vector4f &src);
+		void unpackUnorm2x16(Vector4f &dst, const Vector4f &src);
+		void frc(Vector4f &dst, const Vector4f &src);
+		void trunc(Vector4f &dst, const Vector4f &src);
+		void floor(Vector4f &dst, const Vector4f &src);
+		void round(Vector4f &dst, const Vector4f &src);
+		void roundEven(Vector4f &dst, const Vector4f &src);
+		void ceil(Vector4f &dst, const Vector4f &src);
+		void powx(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+		void pow(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+		void crs(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void forward1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void forward2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void forward3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void forward4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void reflect1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void reflect2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void reflect3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void reflect4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		// The refract* members take their third source as a single Float4.
+		void refract1(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2);
+		void refract2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2);
+		void refract3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2);
+		void refract4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Float4 &src2);
+		void sgn(Vector4f &dst, const Vector4f &src);
+		void isgn(Vector4f &dst, const Vector4f &src);
+		void abs(Vector4f &dst, const Vector4f &src);
+		void iabs(Vector4f &dst, const Vector4f &src);
+		void nrm2(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void nrm3(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void nrm4(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void sincos(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void cos(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void sin(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void tan(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void acos(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void asin(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void atan(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void atan2(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, bool pp = false);
+		void cosh(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void sinh(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void tanh(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void acosh(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void asinh(Vector4f &dst, const Vector4f &src, bool pp = false);
+		void atanh(Vector4f &dst, const Vector4f &src, bool pp = false);
+		// expp/logp additionally take the shader model as a plain integer.
+		void expp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel);
+		void logp(Vector4f &dst, const Vector4f &src, unsigned short shaderModel);
+		void cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		// The comparison members take a Shader::Control value (see the typedef above).
+		void cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
+		void icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
+		void ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
+		void select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void extract(Float4 &dst, const Vector4f &src0, const Float4 &src1);
+		void insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index);
+		void all(Float4 &dst, const Vector4f &src);
+		void any(Float4 &dst, const Vector4f &src);
+		void bitwise_not(Vector4f &dst, const Vector4f &src);
+		void bitwise_or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void bitwise_xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void bitwise_and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+
+	private:
+		// Float4 overloads and helpers that are not part of the public interface.
+		void sgn(Float4 &dst, const Float4 &src);
+		void isgn(Float4 &dst, const Float4 &src);
+		void cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
+		void cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
+		void select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2);
+		void floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits);
+		void halfToFloatBits(Float4& dst, const Float4& halfBits);
+	};
+}
+
+#endif   // sw_ShaderCore_hpp
diff --git a/src/Pipeline/VertexPipeline.cpp b/src/Pipeline/VertexPipeline.cpp
new file mode 100644
index 0000000..129d8a8
--- /dev/null
+++ b/src/Pipeline/VertexPipeline.cpp
@@ -0,0 +1,953 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexPipeline.hpp"
+
+#include "Renderer/Vertex.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#undef max
+#undef min
+
+namespace sw
+{
+	extern bool secondaryColor;
+
+	// Forwards 'state' to the VertexRoutine base class. The second base
+	// argument is 0.
+	// NOTE(review): presumably this means "no shader" - confirm against the
+	// VertexRoutine constructor.
+	VertexPipeline::VertexPipeline(const VertexProcessor::State &state)
+		: VertexRoutine(state, 0)
+	{}
+
+	// Nothing to release here; the body is intentionally empty.
+	VertexPipeline::~VertexPipeline()
+	{}
+
+	// Transforms 'src' using the matrix data at 'matrix', optionally blending
+	// the results of several matrices.
+	//
+	// - state.vertexBlendMatrixCount == 0: a single, non-indexed transform().
+	// - 1: a single indexed transform() using the first set of offsets.
+	// - 2..4: one indexed transform() per matrix, combined as a weighted sum.
+	//   All but the last weight are read from v[BlendWeight]; the last weight
+	//   is one minus the sum of the others.
+	//
+	// 'homogeneous' is forwarded unchanged to every transform() call.
+	Vector4f VertexPipeline::transformBlend(const Register &src, const Pointer<Byte> &matrix, bool homogeneous)
+	{
+		Vector4f dst;
+
+		if(state.vertexBlendMatrixCount == 0)
+		{
+			dst = transform(src, matrix, homogeneous);
+
+			return dst;
+		}
+
+		// Per-lane offsets (one entry per vertex processed in parallel) for
+		// each of the up to four matrices. All offsets are multiples of 64.
+		UInt index0[4];
+		UInt index1[4];
+		UInt index2[4];
+		UInt index3[4];
+
+		if(!state.indexedVertexBlendEnable)
+		{
+			// Fixed palette entries 0..3 for every lane.
+			for(int lane = 0; lane < 4; lane++)
+			{
+				index0[lane] = 0 * 64;
+				index1[lane] = 1 * 64;
+				index2[lane] = 2 * 64;
+				index3[lane] = 3 * 64;
+			}
+		}
+		else
+		{
+			for(int lane = 0; lane < 4; lane++)
+			{
+				Float4 B = v[BlendIndices].x;
+				UInt packed;   // Four 8-bit matrix indices for this lane
+
+				switch(lane)
+				{
+				case 0: packed = As<UInt>(Float(B.x)); break;
+				case 1: packed = As<UInt>(Float(B.y)); break;
+				case 2: packed = As<UInt>(Float(B.z)); break;
+				case 3: packed = As<UInt>(Float(B.w)); break;
+				}
+
+				// Isolate each byte and scale it by 64 with a single shift.
+				index0[lane] = (packed & 0x000000FF) << 6;
+				index1[lane] = (packed & 0x0000FF00) >> 2;
+				index2[lane] = (packed & 0x00FF0000) >> 10;
+				index3[lane] = (packed & 0xFF000000) >> 18;
+			}
+		}
+
+		Float4 weight0;
+		Float4 weight1;
+		Float4 weight2;
+		Float4 weight3;
+
+		// Load the explicitly supplied weights. The cases fall through on
+		// purpose: N matrices need the first N - 1 weights.
+		switch(state.vertexBlendMatrixCount)
+		{
+		case 4: weight2 = v[BlendWeight].z;   // fall through
+		case 3: weight1 = v[BlendWeight].y;   // fall through
+		case 2: weight0 = v[BlendWeight].x;   // fall through
+		case 1:
+			break;
+		}
+
+		switch(state.vertexBlendMatrixCount)
+		{
+		case 1:
+			dst = transform(src, matrix, index0, homogeneous);
+			break;
+		case 2:
+			{
+				weight1 = Float4(1.0f) - weight0;
+
+				Vector4f pos0 = transform(src, matrix, index0, homogeneous);
+				Vector4f pos1 = transform(src, matrix, index1, homogeneous);
+
+				dst.x = pos0.x * weight0 + pos1.x * weight1;   // FIXME: Vector4f operators
+				dst.y = pos0.y * weight0 + pos1.y * weight1;
+				dst.z = pos0.z * weight0 + pos1.z * weight1;
+				dst.w = pos0.w * weight0 + pos1.w * weight1;
+			}
+			break;
+		case 3:
+			{
+				weight2 = Float4(1.0f) - (weight0 + weight1);
+
+				Vector4f pos0 = transform(src, matrix, index0, homogeneous);
+				Vector4f pos1 = transform(src, matrix, index1, homogeneous);
+				Vector4f pos2 = transform(src, matrix, index2, homogeneous);
+
+				dst.x = pos0.x * weight0 + pos1.x * weight1 + pos2.x * weight2;
+				dst.y = pos0.y * weight0 + pos1.y * weight1 + pos2.y * weight2;
+				dst.z = pos0.z * weight0 + pos1.z * weight1 + pos2.z * weight2;
+				dst.w = pos0.w * weight0 + pos1.w * weight1 + pos2.w * weight2;
+			}
+			break;
+		case 4:
+			{
+				weight3 = Float4(1.0f) - (weight0 + weight1 + weight2);
+
+				Vector4f pos0 = transform(src, matrix, index0, homogeneous);
+				Vector4f pos1 = transform(src, matrix, index1, homogeneous);
+				Vector4f pos2 = transform(src, matrix, index2, homogeneous);
+				Vector4f pos3 = transform(src, matrix, index3, homogeneous);
+
+				dst.x = pos0.x * weight0 + pos1.x * weight1 + pos2.x * weight2 + pos3.x * weight3;
+				dst.y = pos0.y * weight0 + pos1.y * weight1 + pos2.y * weight2 + pos3.y * weight3;
+				dst.z = pos0.z * weight0 + pos1.z * weight1 + pos2.z * weight2 + pos3.z * weight3;
+				dst.w = pos0.w * weight0 + pos1.w * weight1 + pos2.w * weight2 + pos3.w * weight3;
+			}
+			break;
+		}
+
+		return dst;
+	}
+
+	// Emits the fixed-function vertex processing for one batch of vertices.
+	//
+	// In order, this writes:
+	// - o[Pos]: the input position transformed by ff.transformT, or v[PositionT]
+	//   unchanged when state.preTransformed is set.
+	// - o[C0] / o[C1]: either the input vertex colors (or constant defaults)
+	//   when vertex lighting is inactive, or the result of accumulating
+	//   ambient, diffuse and specular contributions of up to 8 lights.
+	// - o[Fog].x: the fog factor, when state.fogActive is set.
+	// - o[T0 + stage] for 8 texture stages, via processTextureCoordinate().
+	// - the point size, via processPointSize().
+	//
+	// The 'index' parameter is not referenced by this function body.
+	void VertexPipeline::pipeline(UInt &index)
+	{
+		Vector4f position;
+		Vector4f normal;
+
+		if(!state.preTransformed)
+		{
+			position = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.transformT)), true);
+		}
+		else
+		{
+			position = v[PositionT];
+		}
+
+		o[Pos].x = position.x;
+		o[Pos].y = position.y;
+		o[Pos].z = position.z;
+		o[Pos].w = position.w;
+
+		// Position transformed by the camera matrix; used for lighting and fog below.
+		Vector4f vertexPosition = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);
+
+		if(state.vertexNormalActive)
+		{
+			normal = transformBlend(v[Normal], Pointer<Byte>(data + OFFSET(DrawData,ff.normalTransformT)), false);
+
+			if(state.normalizeNormals)
+			{
+				normal = normalize(normal);
+			}
+		}
+
+		if(!state.vertexLightingActive)
+		{
+			// No lighting: pass the vertex colors through, defaulting to
+			// opaque white (diffuse) and opaque black (specular).
+
+			// FIXME: Don't process if not used at all
+			if(state.diffuseActive && state.input[Color0])
+			{
+				Vector4f diffuse = v[Color0];
+
+				o[C0].x = diffuse.x;
+				o[C0].y = diffuse.y;
+				o[C0].z = diffuse.z;
+				o[C0].w = diffuse.w;
+			}
+			else
+			{
+				o[C0].x = Float4(1.0f);
+				o[C0].y = Float4(1.0f);
+				o[C0].z = Float4(1.0f);
+				o[C0].w = Float4(1.0f);
+			}
+
+			// FIXME: Don't process if not used at all
+			if(state.specularActive && state.input[Color1])
+			{
+				Vector4f specular = v[Color1];
+
+				o[C1].x = specular.x;
+				o[C1].y = specular.y;
+				o[C1].z = specular.z;
+				o[C1].w = specular.w;
+			}
+			else
+			{
+				o[C1].x = Float4(0.0f);
+				o[C1].y = Float4(0.0f);
+				o[C1].z = Float4(0.0f);
+				o[C1].w = Float4(1.0f);
+			}
+		}
+		else
+		{
+			// Lighting: start both colors at zero and accumulate per light.
+			o[C0].x = Float4(0.0f);
+			o[C0].y = Float4(0.0f);
+			o[C0].z = Float4(0.0f);
+			o[C0].w = Float4(0.0f);
+
+			o[C1].x = Float4(0.0f);
+			o[C1].y = Float4(0.0f);
+			o[C1].z = Float4(0.0f);
+			o[C1].w = Float4(0.0f);
+
+			Vector4f ambient;
+			Float4 globalAmbient = *Pointer<Float4>(data + OFFSET(DrawData,ff.globalAmbient));   // FIXME: Unpack
+
+			ambient.x = globalAmbient.x;
+			ambient.y = globalAmbient.y;
+			ambient.z = globalAmbient.z;
+
+			for(int i = 0; i < 8; i++)
+			{
+				// state.vertexLightActive is a bit mask with one bit per light.
+				if(!(state.vertexLightActive & (1 << i)))
+				{
+					continue;
+				}
+
+				Vector4f L;    // Light vector
+				Float4 att;   // Attenuation
+
+				// Attenuation
+				{
+					Float4 d;   // Distance
+
+					L.x = L.y = L.z = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightPosition[i]));   // FIXME: Unpack
+					L.x = L.x.xxxx;
+					L.y = L.y.yyyy;
+					L.z = L.z.zzzz;
+
+					L.x -= vertexPosition.x;
+					L.y -= vertexPosition.y;
+					L.z -= vertexPosition.z;
+					d = dot3(L, L);
+					d = RcpSqrt_pp(d);     // FIXME: Sufficient precision?
+					L.x *= d;              // Normalize L by multiplying with 1 / |L|
+					L.y *= d;
+					L.z *= d;
+					d = Rcp_pp(d);       // FIXME: Sufficient precision?
+
+					Float4 q = *Pointer<Float4>(data + OFFSET(DrawData,ff.attenuationQuadratic[i]));
+					Float4 l = *Pointer<Float4>(data + OFFSET(DrawData,ff.attenuationLinear[i]));
+					Float4 c = *Pointer<Float4>(data + OFFSET(DrawData,ff.attenuationConstant[i]));
+
+					// att = 1 / (q * d^2 + l * d + c), evaluated in Horner form.
+					att = Rcp_pp((q * d + l) * d + c);
+				}
+
+				// Ambient per light
+				{
+					Float4 lightAmbient = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightAmbient[i]));   // FIXME: Unpack
+
+					ambient.x = ambient.x + lightAmbient.x * att;
+					ambient.y = ambient.y + lightAmbient.y * att;
+					ambient.z = ambient.z + lightAmbient.z * att;
+				}
+
+				// Diffuse
+				if(state.vertexNormalActive)
+				{
+					Float4 dot;
+
+					dot = dot3(L, normal);
+					dot = Max(dot, Float4(0.0f));
+					dot *= att;
+
+					Vector4f diff;
+
+					if(state.vertexDiffuseMaterialSourceActive == MATERIAL_MATERIAL)
+					{
+						diff.x = diff.y = diff.z = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialDiffuse));   // FIXME: Unpack
+						diff.x = diff.x.xxxx;
+						diff.y = diff.y.yyyy;
+						diff.z = diff.z.zzzz;
+					}
+					else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR1)
+					{
+						diff = v[Color0];
+					}
+					else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR2)
+					{
+						diff = v[Color1];
+					}
+					else ASSERT(false);
+
+					Float4 lightDiffuse = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightDiffuse[i]));
+
+					o[C0].x = o[C0].x + diff.x * dot * lightDiffuse.x;   // FIXME: Clamp first?
+					o[C0].y = o[C0].y + diff.y * dot * lightDiffuse.y;   // FIXME: Clamp first?
+					o[C0].z = o[C0].z + diff.z * dot * lightDiffuse.z;   // FIXME: Clamp first?
+				}
+
+				// Specular
+				if(state.vertexSpecularActive)
+				{
+					Vector4f S;
+					Vector4f C;   // Camera vector
+					Float4 pow;
+
+					pow = *Pointer<Float>(data + OFFSET(DrawData,ff.materialShininess));
+
+					S.x = Float4(0.0f) - vertexPosition.x;
+					S.y = Float4(0.0f) - vertexPosition.y;
+					S.z = Float4(0.0f) - vertexPosition.z;
+					C = normalize(S);
+
+					// Normalized sum of the light vector and the camera vector.
+					S.x = L.x + C.x;
+					S.y = L.y + C.y;
+					S.z = L.z + C.z;
+					C = normalize(S);
+
+					Float4 dot = Max(dot3(C, normal), Float4(0.0f));   // FIXME: max(dot3(C, normal), 0)
+
+					Float4 P = power(dot, pow);
+					P *= att;
+
+					Vector4f spec;
+
+					if(state.vertexSpecularMaterialSourceActive == MATERIAL_MATERIAL)
+					{
+						Float4 materialSpecular = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialSpecular));   // FIXME: Unpack
+
+						spec.x = materialSpecular.x;
+						spec.y = materialSpecular.y;
+						spec.z = materialSpecular.z;
+					}
+					else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR1)
+					{
+						spec = v[Color0];
+					}
+					else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR2)
+					{
+						spec = v[Color1];
+					}
+					else ASSERT(false);
+
+					Float4 lightSpecular = *Pointer<Float4>(data + OFFSET(DrawData,ff.lightSpecular[i]));
+
+					spec.x *= lightSpecular.x;
+					spec.y *= lightSpecular.y;
+					spec.z *= lightSpecular.z;
+
+					spec.x *= P;
+					spec.y *= P;
+					spec.z *= P;
+
+					spec.x = Max(spec.x, Float4(0.0f));
+					spec.y = Max(spec.y, Float4(0.0f));
+					spec.z = Max(spec.z, Float4(0.0f));
+
+					// The specular term goes to the secondary color when the
+					// global 'secondaryColor' flag is set, otherwise it is
+					// added to the primary color.
+					if(secondaryColor)
+					{
+						o[C1].x = o[C1].x + spec.x;
+						o[C1].y = o[C1].y + spec.y;
+						o[C1].z = o[C1].z + spec.z;
+					}
+					else
+					{
+						o[C0].x = o[C0].x + spec.x;
+						o[C0].y = o[C0].y + spec.y;
+						o[C0].z = o[C0].z + spec.z;
+					}
+				}
+			}
+
+			// Modulate the accumulated ambient light by the ambient material.
+			if(state.vertexAmbientMaterialSourceActive == MATERIAL_MATERIAL)
+			{
+				Float4 materialAmbient = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialAmbient));   // FIXME: Unpack
+
+				ambient.x = ambient.x * materialAmbient.x;
+				ambient.y = ambient.y * materialAmbient.y;
+				ambient.z = ambient.z * materialAmbient.z;
+			}
+			else if(state.vertexAmbientMaterialSourceActive == MATERIAL_COLOR1)
+			{
+				Vector4f materialDiffuse = v[Color0];
+
+				ambient.x = ambient.x * materialDiffuse.x;
+				ambient.y = ambient.y * materialDiffuse.y;
+				ambient.z = ambient.z * materialDiffuse.z;
+			}
+			else if(state.vertexAmbientMaterialSourceActive == MATERIAL_COLOR2)
+			{
+				Vector4f materialSpecular = v[Color1];
+
+				ambient.x = ambient.x * materialSpecular.x;
+				ambient.y = ambient.y * materialSpecular.y;
+				ambient.z = ambient.z * materialSpecular.z;
+			}
+			else ASSERT(false);
+
+			o[C0].x = o[C0].x + ambient.x;
+			o[C0].y = o[C0].y + ambient.y;
+			o[C0].z = o[C0].z + ambient.z;
+
+			// Emissive
+			if(state.vertexEmissiveMaterialSourceActive == MATERIAL_MATERIAL)
+			{
+				Float4 materialEmission = *Pointer<Float4>(data + OFFSET(DrawData,ff.materialEmission));   // FIXME: Unpack
+
+				o[C0].x = o[C0].x + materialEmission.x;
+				o[C0].y = o[C0].y + materialEmission.y;
+				o[C0].z = o[C0].z + materialEmission.z;
+			}
+			else if(state.vertexEmissiveMaterialSourceActive == MATERIAL_COLOR1)
+			{
+				Vector4f materialSpecular = v[Color0];
+
+				o[C0].x = o[C0].x + materialSpecular.x;
+				o[C0].y = o[C0].y + materialSpecular.y;
+				o[C0].z = o[C0].z + materialSpecular.z;
+			}
+			else if(state.vertexEmissiveMaterialSourceActive == MATERIAL_COLOR2)
+			{
+				Vector4f materialSpecular = v[Color1];
+
+				o[C0].x = o[C0].x + materialSpecular.x;
+				o[C0].y = o[C0].y + materialSpecular.y;
+				o[C0].z = o[C0].z + materialSpecular.z;
+			}
+			else ASSERT(false);
+
+			// Diffuse alpha component
+			if(state.vertexDiffuseMaterialSourceActive == MATERIAL_MATERIAL)
+			{
+				o[C0].w = Float4(*Pointer<Float4>(data + OFFSET(DrawData,ff.materialDiffuse[0]))).wwww;   // FIXME: Unpack
+			}
+			else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR1)
+			{
+				Vector4f alpha = v[Color0];
+				o[C0].w = alpha.w;
+			}
+			else if(state.vertexDiffuseMaterialSourceActive == MATERIAL_COLOR2)
+			{
+				Vector4f alpha = v[Color1];
+				o[C0].w = alpha.w;
+			}
+			else ASSERT(false);
+
+			if(state.vertexSpecularActive)
+			{
+				// Specular alpha component
+				if(state.vertexSpecularMaterialSourceActive == MATERIAL_MATERIAL)
+				{
+					// Load the material specular color from its first element
+					// and broadcast w, exactly as done for the diffuse alpha
+					// above. Starting the four-float load at element [3] and
+					// then taking .wwww would select a value located after
+					// materialSpecular (which is read as a four-float color
+					// in the per-light specular code).
+					o[C1].w = Float4(*Pointer<Float4>(data + OFFSET(DrawData,ff.materialSpecular[0]))).wwww;   // FIXME: Unpack
+				}
+				else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR1)
+				{
+					Vector4f alpha = v[Color0];
+					o[C1].w = alpha.w;
+				}
+				else if(state.vertexSpecularMaterialSourceActive == MATERIAL_COLOR2)
+				{
+					Vector4f alpha = v[Color1];
+					o[C1].w = alpha.w;
+				}
+				else ASSERT(false);
+			}
+		}
+
+		if(state.fogActive)
+		{
+			Float4 f;   // Fog distance
+
+			if(!state.rangeFogActive)
+			{
+				f = Abs(vertexPosition.z);
+			}
+			else
+			{
+				f = Sqrt(dot3(vertexPosition, vertexPosition));   // FIXME: f = length(vertexPosition);
+			}
+
+			switch(state.vertexFogMode)
+			{
+			case FOG_NONE:
+				// No vertex fog mode: forward the specular alpha (or zero).
+				if(state.specularActive)
+				{
+					o[Fog].x = o[C1].w;
+				}
+				else
+				{
+					o[Fog].x = Float4(0.0f);
+				}
+				break;
+			case FOG_LINEAR:
+				o[Fog].x = f * *Pointer<Float4>(data + OFFSET(DrawData,fog.scale)) + *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
+				break;
+			case FOG_EXP:
+				o[Fog].x = exponential2(f * *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE)), true);
+				break;
+			case FOG_EXP2:
+				o[Fog].x = exponential2((f * f) * *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E)), true);
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+
+		for(int stage = 0; stage < 8; stage++)
+		{
+			processTextureCoordinate(stage, normal, position);
+		}
+
+		processPointSize();
+	}
+
+	// Generates the output texture coordinate o[T0 + stage] for one texture
+	// stage, then applies the stage's texture transform.
+	//
+	// Nothing is emitted unless state.output[T0 + stage].write is set.
+	//
+	// stage:    texture stage to process.
+	// normal:   the transformed normal computed by the caller; only used when
+	//           state.vertexNormalActive is set.
+	// position: not referenced by this function body; the texgen modes that
+	//           need a position recompute it from v[Position].
+	void VertexPipeline::processTextureCoordinate(int stage, Vector4f &normal, Vector4f &position)
+	{
+		if(state.output[T0 + stage].write)
+		{
+			// Index of the input texture coordinate set used by this stage.
+			int i = state.textureState[stage].texCoordIndexActive;
+
+			switch(state.textureState[stage].texGenActive)
+			{
+			case TEXGEN_NONE:
+				// Copy the input coordinate unchanged.
+				{
+					Vector4f &&varying = v[TexCoord0 + i];
+
+					o[T0 + stage].x = varying.x;
+					o[T0 + stage].y = varying.y;
+					o[T0 + stage].z = varying.z;
+					o[T0 + stage].w = varying.w;
+				}
+				break;
+			case TEXGEN_PASSTHRU:
+				// Copy the input coordinate, then overwrite the components
+				// beyond the input's component count: the first missing
+				// component becomes 1 and any remaining ones become 0.
+				{
+					Vector4f &&varying = v[TexCoord0 + i];
+
+					o[T0 + stage].x = varying.x;
+					o[T0 + stage].y = varying.y;
+					o[T0 + stage].z = varying.z;
+					o[T0 + stage].w = varying.w;
+
+					if(state.input[TexCoord0 + i])
+					{
+						switch(state.input[TexCoord0 + i].count)
+						{
+						case 1:
+							o[T0 + stage].y = Float4(1.0f);
+							o[T0 + stage].z = Float4(0.0f);
+							o[T0 + stage].w = Float4(0.0f);
+							break;
+						case 2:
+							o[T0 + stage].z = Float4(1.0f);
+							o[T0 + stage].w = Float4(0.0f);
+							break;
+						case 3:
+							o[T0 + stage].w = Float4(1.0f);
+							break;
+						case 4:
+							break;
+						default:
+							ASSERT(false);
+						}
+					}
+				}
+				break;
+			case TEXGEN_NORMAL:
+				// Output the normal (or zero when no normal is active), w = 1.
+				{
+					Vector4f Nc;   // Normal vector in camera space
+
+					if(state.vertexNormalActive)
+					{
+						Nc = normal;
+					}
+					else
+					{
+						Nc.x = Float4(0.0f);
+						Nc.y = Float4(0.0f);
+						Nc.z = Float4(0.0f);
+					}
+
+					Nc.w = Float4(1.0f);
+
+					o[T0 + stage].x = Nc.x;
+					o[T0 + stage].y = Nc.y;
+					o[T0 + stage].z = Nc.z;
+					o[T0 + stage].w = Nc.w;
+				}
+				break;
+			case TEXGEN_POSITION:
+				// Output the position transformed by the camera matrix, w = 1.
+				{
+					Vector4f Pn = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);   // Position in camera space
+
+					Pn.w = Float4(1.0f);
+
+					o[T0 + stage].x = Pn.x;
+					o[T0 + stage].y = Pn.y;
+					o[T0 + stage].z = Pn.z;
+					o[T0 + stage].w = Pn.w;
+				}
+				break;
+			case TEXGEN_REFLECTION:
+				// Output the reflection vector (zero when no normal is active), w = 1.
+				{
+					Vector4f R;   // Reflection vector
+
+					if(state.vertexNormalActive)
+					{
+						Vector4f Nc;   // Normal vector in camera space
+
+						Nc = normal;
+
+						if(state.localViewerActive)
+						{
+							Vector4f Ec;   // Eye vector in camera space
+							Vector4f N2;   // Declared but not used below
+
+							Ec = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);
+							Ec = normalize(Ec);
+
+							// R = E - 2 * N * (E . N)
+							Float4 dot = Float4(2.0f) * dot3(Ec, Nc);
+
+							R.x = Ec.x - Nc.x * dot;
+							R.y = Ec.y - Nc.y * dot;
+							R.z = Ec.z - Nc.z * dot;
+						}
+						else
+						{
+							// u = -2 * Nz * Nx
+							// v = -2 * Nz * Ny
+							// w = 1 - 2 * Nz * Nz
+
+							R.x = -Float4(2.0f) * Nc.z * Nc.x;
+							R.y = -Float4(2.0f) * Nc.z * Nc.y;
+							R.z = Float4(1.0f) - Float4(2.0f) * Nc.z * Nc.z;
+						}
+					}
+					else
+					{
+						R.x = Float4(0.0f);
+						R.y = Float4(0.0f);
+						R.z = Float4(0.0f);
+					}
+
+					R.w = Float4(1.0f);
+
+					o[T0 + stage].x = R.x;
+					o[T0 + stage].y = R.y;
+					o[T0 + stage].z = R.z;
+					o[T0 + stage].w = R.w;
+				}
+				break;
+			case TEXGEN_SPHEREMAP:
+				// Same reflection vector as TEXGEN_REFLECTION, followed by a
+				// remapping of x and y to 0.5 * component + 0.5.
+				{
+					Vector4f R;   // Reflection vector
+
+					if(state.vertexNormalActive)
+					{
+						Vector4f Nc;   // Normal vector in camera space
+
+						Nc = normal;
+
+						if(state.localViewerActive)
+						{
+							Vector4f Ec;   // Eye vector in camera space
+							Vector4f N2;   // Declared but not used below
+
+							Ec = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);
+							Ec = normalize(Ec);
+
+							// R = E - 2 * N * (E . N)
+							Float4 dot = Float4(2.0f) * dot3(Ec, Nc);
+
+							R.x = Ec.x - Nc.x * dot;
+							R.y = Ec.y - Nc.y * dot;
+							R.z = Ec.z - Nc.z * dot;
+						}
+						else
+						{
+							// u = -2 * Nz * Nx
+							// v = -2 * Nz * Ny
+							// w = 1 - 2 * Nz * Nz
+
+							R.x = -Float4(2.0f) * Nc.z * Nc.x;
+							R.y = -Float4(2.0f) * Nc.z * Nc.y;
+							R.z = Float4(1.0f) - Float4(2.0f) * Nc.z * Nc.z;
+						}
+					}
+					else
+					{
+						R.x = Float4(0.0f);
+						R.y = Float4(0.0f);
+						R.z = Float4(0.0f);
+					}
+
+					// Offset z by -1, normalize, then map x and y to 0.5 * c + 0.5.
+					R.z -= Float4(1.0f);
+					R = normalize(R);
+					R.x = Float4(0.5f) * R.x + Float4(0.5f);
+					R.y = Float4(0.5f) * R.y + Float4(0.5f);
+
+					R.z = Float4(1.0f);
+					R.w = Float4(0.0f);
+
+					o[T0 + stage].x = R.x;
+					o[T0 + stage].y = R.y;
+					o[T0 + stage].z = R.z;
+					o[T0 + stage].w = R.w;
+				}
+				break;
+			default:
+				ASSERT(false);
+			}
+
+			Vector4f texTrans0;
+			Vector4f texTrans1;
+			Vector4f texTrans2;
+			Vector4f texTrans3;
+
+			Vector4f T;   // Generated coordinate, before the texture transform
+			Vector4f t;   // Transformed coordinate
+
+			T.x = o[T0 + stage].x;
+			T.y = o[T0 + stage].y;
+			T.z = o[T0 + stage].z;
+			T.w = o[T0 + stage].w;
+
+			// Texture transform. The cases fall through on purpose: a count of
+			// N computes components N-1 down to 0, each as the dot product of
+			// T with one row of ff.textureTransform[stage]. Components of 't'
+			// above the count are never assigned, yet all four components of
+			// 't' are written to the output in case 1.
+			switch(state.textureState[stage].textureTransformCountActive)
+			{
+			case 4:
+				texTrans3.x = texTrans3.y = texTrans3.z = texTrans3.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][3]));   // FIXME: Unpack
+				texTrans3.x = texTrans3.x.xxxx;
+				texTrans3.y = texTrans3.y.yyyy;
+				texTrans3.z = texTrans3.z.zzzz;
+				texTrans3.w = texTrans3.w.wwww;
+				t.w = dot4(T, texTrans3);
+				// Fall through
+			case 3:
+				texTrans2.x = texTrans2.y = texTrans2.z = texTrans2.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][2]));   // FIXME: Unpack
+				texTrans2.x = texTrans2.x.xxxx;
+				texTrans2.y = texTrans2.y.yyyy;
+				texTrans2.z = texTrans2.z.zzzz;
+				texTrans2.w = texTrans2.w.wwww;
+				t.z = dot4(T, texTrans2);
+				// Fall through
+			case 2:
+				texTrans1.x = texTrans1.y = texTrans1.z = texTrans1.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][1]));   // FIXME: Unpack
+				texTrans1.x = texTrans1.x.xxxx;
+				texTrans1.y = texTrans1.y.yyyy;
+				texTrans1.z = texTrans1.z.zzzz;
+				texTrans1.w = texTrans1.w.wwww;
+				t.y = dot4(T, texTrans1);
+				// Fall through
+			case 1:
+				texTrans0.x = texTrans0.y = texTrans0.z = texTrans0.w = *Pointer<Float4>(data + OFFSET(DrawData,ff.textureTransform[stage][0]));   // FIXME: Unpack
+				texTrans0.x = texTrans0.x.xxxx;
+				texTrans0.y = texTrans0.y.yyyy;
+				texTrans0.z = texTrans0.z.zzzz;
+				texTrans0.w = texTrans0.w.wwww;
+				t.x = dot4(T, texTrans0);
+
+				o[T0 + stage].x = t.x;
+				o[T0 + stage].y = t.y;
+				o[T0 + stage].z = t.z;
+				o[T0 + stage].w = t.w;
+				// Fall through
+			case 0:
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+	}
+
+	// Writes the per-vertex point size to o[Pts].y.
+	//
+	// Nothing is written unless state.pointSizeActive is set. The size comes
+	// from v[PointSize].x when that input is present, otherwise from
+	// DrawData::point.pointSize. When point scaling is active and the vertex
+	// is not pre-transformed, the size is further multiplied by the viewport
+	// height and by 1 / sqrt(A + B * d + C * d^2), where d is the length of the
+	// position transformed by the camera matrix.
+	void VertexPipeline::processPointSize()
+	{
+		if(!state.pointSizeActive)
+		{
+			return;   // Use global pointsize
+		}
+
+		if(state.input[PointSize])
+		{
+			o[Pts].y = v[PointSize].x;
+		}
+		else
+		{
+			o[Pts].y = *Pointer<Float4>(data + OFFSET(DrawData,point.pointSize));
+		}
+
+		if(!state.pointScaleActive || state.preTransformed)
+		{
+			return;   // No distance-based scaling
+		}
+
+		Vector4f eye = transformBlend(v[Position], Pointer<Byte>(data + OFFSET(DrawData,ff.cameraTransformT)), true);
+
+		Float4 distance = Sqrt(dot3(eye, eye));   // FIXME: length(p);
+
+		Float4 A = *Pointer<Float>(data + OFFSET(DrawData,point.pointScaleA));   // FIXME: Unpack
+		Float4 B = *Pointer<Float>(data + OFFSET(DrawData,point.pointScaleB));   // FIXME: Unpack
+		Float4 C = *Pointer<Float>(data + OFFSET(DrawData,point.pointScaleC));   // FIXME: Unpack
+
+		// Horner form of A + B * d + C * d^2.
+		Float4 scale = RcpSqrt_pp(A + distance * (B + distance * C));
+
+		o[Pts].y = o[Pts].y * Float4(*Pointer<Float>(data + OFFSET(DrawData,viewportHeight))) * scale;   // FIXME: Unpack
+	}
+
+	Vector4f VertexPipeline::transform(const Register &src, const Pointer<Byte> &matrix, bool homogeneous)   // Multiplies src by the float matrix at 'matrix'; the same matrix is used for all four lanes
+	{
+		Vector4f dst;
+
+		if(homogeneous)   // Full 4x4 transform producing x, y, z and w
+		{
+			Float4 m[4][4];
+
+			for(int j = 0; j < 4; j++)
+			{
+				for(int i = 0; i < 4; i++)
+				{
+					m[j][i].x = *Pointer<Float>(matrix + 16 * i + 4 * j);   // m[j][i] is the float at byte offset 16 * i + 4 * j, copied into every lane
+					m[j][i].y = *Pointer<Float>(matrix + 16 * i + 4 * j);
+					m[j][i].z = *Pointer<Float>(matrix + 16 * i + 4 * j);
+					m[j][i].w = *Pointer<Float>(matrix + 16 * i + 4 * j);
+				}
+			}
+
+			dst.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2] + src.w * m[0][3];
+			dst.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2] + src.w * m[1][3];
+			dst.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2] + src.w * m[2][3];
+			dst.w = src.x * m[3][0] + src.y * m[3][1] + src.z * m[3][2] + src.w * m[3][3];
+		}
+		else   // Only a 3x3 part of the matrix is applied; src.w is ignored and dst.w is left unassigned
+		{
+			Float4 m[3][3];
+
+			for(int j = 0; j < 3; j++)
+			{
+				for(int i = 0; i < 3; i++)
+				{
+					m[j][i].x = *Pointer<Float>(matrix + 16 * i + 4 * j);   // Same addressing as the 4x4 case
+					m[j][i].y = *Pointer<Float>(matrix + 16 * i + 4 * j);
+					m[j][i].z = *Pointer<Float>(matrix + 16 * i + 4 * j);
+					m[j][i].w = *Pointer<Float>(matrix + 16 * i + 4 * j);
+				}
+			}
+
+			dst.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2];
+			dst.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2];
+			dst.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2];
+		}
+
+		return dst;
+	}
+
+	Vector4f VertexPipeline::transform(const Register &src, const Pointer<Byte> &matrix, UInt index[4], bool homogeneous)   // Per-lane variant: lane n reads its matrix at byte offset index[n] from 'matrix'
+	{
+		Vector4f dst;
+
+		if(homogeneous)   // Full 4x4 transform producing x, y, z and w
+		{
+			Float4 m[4][4];
+
+			for(int j = 0; j < 4; j++)
+			{
+				for(int i = 0; i < 4; i++)
+				{
+					m[j][i].x = *Pointer<Float>(matrix + 16 * i + 4 * j + index[0]);   // Lane x: element of the matrix selected by index[0]
+					m[j][i].y = *Pointer<Float>(matrix + 16 * i + 4 * j + index[1]);   // Lane y: index[1]
+					m[j][i].z = *Pointer<Float>(matrix + 16 * i + 4 * j + index[2]);   // Lane z: index[2]
+					m[j][i].w = *Pointer<Float>(matrix + 16 * i + 4 * j + index[3]);   // Lane w: index[3]
+				}
+			}
+
+			dst.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2] + src.w * m[0][3];   // Fourth column is scaled by src.w, as in the non-indexed transform()
+			dst.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2] + src.w * m[1][3];
+			dst.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2] + src.w * m[2][3];
+			dst.w = src.x * m[3][0] + src.y * m[3][1] + src.z * m[3][2] + src.w * m[3][3];
+		}
+		else   // Only a 3x3 part of each matrix is applied; src.w is ignored and dst.w is left unassigned
+		{
+			Float4 m[3][3];
+
+			for(int j = 0; j < 3; j++)
+			{
+				for(int i = 0; i < 3; i++)
+				{
+					m[j][i].x = *Pointer<Float>(matrix + 16 * i + 4 * j + index[0]);   // Same per-lane addressing as the 4x4 case
+					m[j][i].y = *Pointer<Float>(matrix + 16 * i + 4 * j + index[1]);
+					m[j][i].z = *Pointer<Float>(matrix + 16 * i + 4 * j + index[2]);
+					m[j][i].w = *Pointer<Float>(matrix + 16 * i + 4 * j + index[3]);
+				}
+			}
+
+			dst.x = src.x * m[0][0] + src.y * m[0][1] + src.z * m[0][2];
+			dst.y = src.x * m[1][0] + src.y * m[1][1] + src.z * m[1][2];
+			dst.z = src.x * m[2][0] + src.y * m[2][1] + src.z * m[2][2];
+		}
+
+		return dst;
+	}
+
+	Vector4f VertexPipeline::normalize(Vector4f &src)   // Scales src.xyz by the reciprocal of its 3-component length
+	{
+		Float4 invLength = RcpSqrt_pp(dot3(src, src));   // 1 / sqrt(x^2 + y^2 + z^2); _pp presumably means partial precision
+
+		Vector4f unit;
+
+		unit.x = invLength * src.x;
+		unit.y = invLength * src.y;
+		unit.z = invLength * src.z;
+
+		return unit;   // unit.w is never assigned
+	}
+
+	Float4 VertexPipeline::power(Float4 &src0, Float4 &src1)   // NOTE(review): looks like a fast approximation of pow(src0, src1) built on float bit patterns - confirm
+	{
+		Float4 dst = src0;
+
+		dst = dst * dst;   // src0^2
+		dst = dst * dst;   // src0^4
+		dst = Float4(As<Int4>(dst) - As<Int4>(Float4(1.0f)));   // Integer difference between the bit patterns of src0^4 and 1.0f, converted to float
+
+		dst *= src1;   // Scale that difference by src1
+
+		dst = As<Float4>(Int4(dst) + As<Int4>(Float4(1.0f)));   // Convert to integer, add the 1.0f bit pattern back, reinterpret the bits as float
+		dst = RcpSqrt_pp(dst);   // x^(-1/2)
+		dst = RcpSqrt_pp(dst);   // (x^(-1/2))^(-1/2) = x^(1/4): undoes the fourth power taken above
+
+		return dst;
+	}
+}
diff --git a/src/Pipeline/VertexPipeline.hpp b/src/Pipeline/VertexPipeline.hpp
new file mode 100644
index 0000000..0736afb
--- /dev/null
+++ b/src/Pipeline/VertexPipeline.hpp
@@ -0,0 +1,45 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexPipeline_hpp
+#define sw_VertexPipeline_hpp
+
+#include "VertexRoutine.hpp"
+
+#include "Renderer/Context.hpp"
+#include "Renderer/VertexProcessor.hpp"
+
+namespace sw
+{
+	class VertexPipeline : public VertexRoutine   // Vertex routine whose pipeline() is built from the helper stages below
+	{
+	public:
+		VertexPipeline(const VertexProcessor::State &state);
+
+		virtual ~VertexPipeline();
+
+	private:
+		void pipeline(UInt &index) override;
+		void processTextureCoordinate(int stage, Vector4f &normal, Vector4f &position);
+		void processPointSize();   // Writes the per-vertex point size to o[Pts].y
+
+		Vector4f transformBlend(const Register &src, const Pointer<Byte> &matrix, bool homogeneous);
+		Vector4f transform(const Register &src, const Pointer<Byte> &matrix, bool homogeneous);   // homogeneous: 4x4 transform, otherwise 3x3
+		Vector4f transform(const Register &src, const Pointer<Byte> &matrix, UInt index[4], bool homogeneous);   // index[n]: byte offset of lane n's matrix
+		Vector4f normalize(Vector4f &src);   // Normalizes the xyz components
+		Float4 power(Float4 &src0, Float4 &src1);
+	};
+}
+
+#endif   // sw_VertexPipeline_hpp
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
new file mode 100644
index 0000000..ad4e37b
--- /dev/null
+++ b/src/Pipeline/VertexProgram.cpp
@@ -0,0 +1,1650 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexProgram.hpp"
+
+#include "VertexShader.hpp"
+#include "SamplerCore.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Renderer/Vertex.hpp"
+#include "Common/Half.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	VertexProgram::VertexProgram(const VertexProcessor::State &state, const VertexShader *shader)   // Resets control-flow bookkeeping and the execution masks
+		: VertexRoutine(state, shader), shader(shader), r(shader->indirectAddressableTemporaries)   // NOTE(review): shader is dereferenced unconditionally here, yet passThrough() tests it for null
+	{
+		ifDepth = 0;
+		loopRepDepth = 0;
+		currentLabel = -1;   // program() compares currentLabel against -1 when it finishes
+		whileTest = false;
+
+		for(int i = 0; i < 2048; i++)   // Clear all 2048 labelBlock entries
+		{
+			labelBlock[i] = 0;
+		}
+
+		loopDepth = -1;
+		enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // All bits set in every lane
+
+		if(shader->containsBreakInstruction())   // enableBreak is only initialized for shaders that contain a break
+		{
+			enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+		}
+
+		if(shader->containsContinueInstruction())   // Likewise for continue
+		{
+			enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+		}
+
+		if(shader->isInstanceIdDeclared())   // Instance ID is read from DrawData::instanceID only when the shader declares it
+		{
+			instanceID = *Pointer<Int>(data + OFFSET(DrawData,instanceID));
+		}
+	}
+
+	VertexProgram::~VertexProgram()   // Empty destructor body
+	{
+	}
+
+	void VertexProgram::pipeline(UInt &index)   // Chooses between running the shader and copying inputs through
+	{
+		if(state.preTransformed)   // Pre-transformed vertices bypass the shader program
+		{
+			passThrough();
+		}
+		else
+		{
+			program(index);
+		}
+	}
+
+	void VertexProgram::program(UInt &index)   // Walks the shader's instructions in order: fetch sources, run the opcode, then mask and write the result
+	{
+	//	shader->print("VertexShader-%0.8X.txt", state.shaderID);
+
+		unsigned short shaderModel = shader->getShaderModel();
+
+		enableIndex = 0;
+		stackIndex = 0;
+
+		if(shader->containsLeaveInstruction())
+		{
+			enableLeave = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+		}
+
+		if(shader->isVertexIdDeclared())
+		{
+			if(state.textureSampling)   // Same index in all four lanes
+			{
+				vertexID = Int4(index);
+			}
+			else   // Lanes hold index, index + 1, index + 2, index + 3
+			{
+				vertexID = Insert(vertexID, As<Int>(index), 0);
+				vertexID = Insert(vertexID, As<Int>(index + 1), 1);
+				vertexID = Insert(vertexID, As<Int>(index + 2), 2);
+				vertexID = Insert(vertexID, As<Int>(index + 3), 3);
+			}
+		}
+
+		// Create all call site return blocks up front
+		for(size_t i = 0; i < shader->getLength(); i++)
+		{
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
+
+			if(opcode == Shader::OPCODE_CALL || opcode == Shader::OPCODE_CALLNZ)
+			{
+				const Dst &dst = instruction->dst;
+
+				ASSERT(callRetBlock[dst.label].size() == dst.callSite);   // Call sites of a label are expected in increasing callSite order
+				callRetBlock[dst.label].push_back(Nucleus::createBasicBlock());
+			}
+		}
+
+		for(size_t i = 0; i < shader->getLength(); i++)
+		{
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
+
+			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
+			{
+				continue;   // DCL/DEF/DEFI/DEFB are skipped before any operand is fetched
+			}
+
+			Dst dst = instruction->dst;
+			Src src0 = instruction->src[0];
+			Src src1 = instruction->src[1];
+			Src src2 = instruction->src[2];
+			Src src3 = instruction->src[3];
+			Src src4 = instruction->src[4];
+
+			bool predicate = instruction->predicate;
+			Control control = instruction->control;
+			bool integer = dst.type == Shader::PARAMETER_ADDR;   // True when the destination is the address register
+			bool pp = dst.partialPrecision;
+
+			Vector4f d;
+			Vector4f s0;
+			Vector4f s1;
+			Vector4f s2;
+			Vector4f s3;
+			Vector4f s4;
+
+			if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
+			if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
+			if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
+			if(src3.type != Shader::PARAMETER_VOID) s3 = fetchRegister(src3);
+			if(src4.type != Shader::PARAMETER_VOID) s4 = fetchRegister(src4);
+
+			switch(opcode)
+			{
+			case Shader::OPCODE_VS_1_0:                                     break;
+			case Shader::OPCODE_VS_1_1:                                     break;
+			case Shader::OPCODE_VS_2_0:                                     break;
+			case Shader::OPCODE_VS_2_x:                                     break;
+			case Shader::OPCODE_VS_2_sw:                                    break;
+			case Shader::OPCODE_VS_3_0:                                     break;
+			case Shader::OPCODE_VS_3_sw:                                    break;
+			case Shader::OPCODE_DCL:                                        break;
+			case Shader::OPCODE_DEF:                                        break;
+			case Shader::OPCODE_DEFI:                                       break;
+			case Shader::OPCODE_DEFB:                                       break;
+			case Shader::OPCODE_NOP:                                        break;
+			case Shader::OPCODE_ABS:        abs(d, s0);                     break;
+			case Shader::OPCODE_IABS:       iabs(d, s0);                    break;
+			case Shader::OPCODE_ADD:        add(d, s0, s1);                 break;
+			case Shader::OPCODE_IADD:       iadd(d, s0, s1);                break;
+			case Shader::OPCODE_CRS:        crs(d, s0, s1);                 break;
+			case Shader::OPCODE_FORWARD1:   forward1(d, s0, s1, s2);        break;
+			case Shader::OPCODE_FORWARD2:   forward2(d, s0, s1, s2);        break;
+			case Shader::OPCODE_FORWARD3:   forward3(d, s0, s1, s2);        break;
+			case Shader::OPCODE_FORWARD4:   forward4(d, s0, s1, s2);        break;
+			case Shader::OPCODE_REFLECT1:   reflect1(d, s0, s1);            break;
+			case Shader::OPCODE_REFLECT2:   reflect2(d, s0, s1);            break;
+			case Shader::OPCODE_REFLECT3:   reflect3(d, s0, s1);            break;
+			case Shader::OPCODE_REFLECT4:   reflect4(d, s0, s1);            break;
+			case Shader::OPCODE_REFRACT1:   refract1(d, s0, s1, s2.x);      break;
+			case Shader::OPCODE_REFRACT2:   refract2(d, s0, s1, s2.x);      break;
+			case Shader::OPCODE_REFRACT3:   refract3(d, s0, s1, s2.x);      break;
+			case Shader::OPCODE_REFRACT4:   refract4(d, s0, s1, s2.x);      break;
+			case Shader::OPCODE_DP1:        dp1(d, s0, s1);                 break;
+			case Shader::OPCODE_DP2:        dp2(d, s0, s1);                 break;
+			case Shader::OPCODE_DP3:        dp3(d, s0, s1);                 break;
+			case Shader::OPCODE_DP4:        dp4(d, s0, s1);                 break;
+			case Shader::OPCODE_DET2:       det2(d, s0, s1);                break;
+			case Shader::OPCODE_DET3:       det3(d, s0, s1, s2);            break;
+			case Shader::OPCODE_DET4:       det4(d, s0, s1, s2, s3);        break;
+			case Shader::OPCODE_ATT:        att(d, s0, s1);                 break;
+			case Shader::OPCODE_EXP2X:      exp2x(d, s0, pp);               break;
+			case Shader::OPCODE_EXP2:       exp2(d, s0, pp);                break;
+			case Shader::OPCODE_EXPP:       expp(d, s0, shaderModel);       break;
+			case Shader::OPCODE_EXP:        exp(d, s0, pp);                 break;
+			case Shader::OPCODE_FRC:        frc(d, s0);                     break;
+			case Shader::OPCODE_TRUNC:      trunc(d, s0);                   break;
+			case Shader::OPCODE_FLOOR:      floor(d, s0);                   break;
+			case Shader::OPCODE_ROUND:      round(d, s0);                   break;
+			case Shader::OPCODE_ROUNDEVEN:  roundEven(d, s0);               break;
+			case Shader::OPCODE_CEIL:       ceil(d, s0);                    break;
+			case Shader::OPCODE_LIT:        lit(d, s0);                     break;
+			case Shader::OPCODE_LOG2X:      log2x(d, s0, pp);               break;
+			case Shader::OPCODE_LOG2:       log2(d, s0, pp);                break;
+			case Shader::OPCODE_LOGP:       logp(d, s0, shaderModel);       break;
+			case Shader::OPCODE_LOG:        log(d, s0, pp);                 break;
+			case Shader::OPCODE_LRP:        lrp(d, s0, s1, s2);             break;
+			case Shader::OPCODE_STEP:       step(d, s0, s1);                break;
+			case Shader::OPCODE_SMOOTH:     smooth(d, s0, s1, s2);          break;
+			case Shader::OPCODE_ISINF:      isinf(d, s0);                   break;
+			case Shader::OPCODE_ISNAN:      isnan(d, s0);                   break;
+			case Shader::OPCODE_FLOATBITSTOINT:
+			case Shader::OPCODE_FLOATBITSTOUINT:
+			case Shader::OPCODE_INTBITSTOFLOAT:
+			case Shader::OPCODE_UINTBITSTOFLOAT: d = s0;                    break;
+			case Shader::OPCODE_PACKSNORM2x16:   packSnorm2x16(d, s0);      break;
+			case Shader::OPCODE_PACKUNORM2x16:   packUnorm2x16(d, s0);      break;
+			case Shader::OPCODE_PACKHALF2x16:    packHalf2x16(d, s0);       break;
+			case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0);    break;
+			case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0);    break;
+			case Shader::OPCODE_UNPACKHALF2x16:  unpackHalf2x16(d, s0);     break;
+			case Shader::OPCODE_M3X2:       M3X2(d, s0, src1);              break;
+			case Shader::OPCODE_M3X3:       M3X3(d, s0, src1);              break;
+			case Shader::OPCODE_M3X4:       M3X4(d, s0, src1);              break;
+			case Shader::OPCODE_M4X3:       M4X3(d, s0, src1);              break;
+			case Shader::OPCODE_M4X4:       M4X4(d, s0, src1);              break;
+			case Shader::OPCODE_MAD:        mad(d, s0, s1, s2);             break;
+			case Shader::OPCODE_IMAD:       imad(d, s0, s1, s2);            break;
+			case Shader::OPCODE_MAX:        max(d, s0, s1);                 break;
+			case Shader::OPCODE_IMAX:       imax(d, s0, s1);                break;
+			case Shader::OPCODE_UMAX:       umax(d, s0, s1);                break;
+			case Shader::OPCODE_MIN:        min(d, s0, s1);                 break;
+			case Shader::OPCODE_IMIN:       imin(d, s0, s1);                break;
+			case Shader::OPCODE_UMIN:       umin(d, s0, s1);                break;
+			case Shader::OPCODE_MOV:        mov(d, s0, integer);            break;
+			case Shader::OPCODE_MOVA:       mov(d, s0, true);               break;
+			case Shader::OPCODE_NEG:        neg(d, s0);                     break;
+			case Shader::OPCODE_INEG:       ineg(d, s0);                    break;
+			case Shader::OPCODE_F2B:        f2b(d, s0);                     break;
+			case Shader::OPCODE_B2F:        b2f(d, s0);                     break;
+			case Shader::OPCODE_F2I:        f2i(d, s0);                     break;
+			case Shader::OPCODE_I2F:        i2f(d, s0);                     break;
+			case Shader::OPCODE_F2U:        f2u(d, s0);                     break;
+			case Shader::OPCODE_U2F:        u2f(d, s0);                     break;
+			case Shader::OPCODE_I2B:        i2b(d, s0);                     break;
+			case Shader::OPCODE_B2I:        b2i(d, s0);                     break;
+			case Shader::OPCODE_MUL:        mul(d, s0, s1);                 break;
+			case Shader::OPCODE_IMUL:       imul(d, s0, s1);                break;
+			case Shader::OPCODE_NRM2:       nrm2(d, s0, pp);                break;
+			case Shader::OPCODE_NRM3:       nrm3(d, s0, pp);                break;
+			case Shader::OPCODE_NRM4:       nrm4(d, s0, pp);                break;
+			case Shader::OPCODE_POWX:       powx(d, s0, s1, pp);            break;
+			case Shader::OPCODE_POW:        pow(d, s0, s1, pp);             break;
+			case Shader::OPCODE_RCPX:       rcpx(d, s0, pp);                break;
+			case Shader::OPCODE_DIV:        div(d, s0, s1);                 break;
+			case Shader::OPCODE_IDIV:       idiv(d, s0, s1);                break;
+			case Shader::OPCODE_UDIV:       udiv(d, s0, s1);                break;
+			case Shader::OPCODE_MOD:        mod(d, s0, s1);                 break;
+			case Shader::OPCODE_IMOD:       imod(d, s0, s1);                break;
+			case Shader::OPCODE_UMOD:       umod(d, s0, s1);                break;
+			case Shader::OPCODE_SHL:        shl(d, s0, s1);                 break;
+			case Shader::OPCODE_ISHR:       ishr(d, s0, s1);                break;
+			case Shader::OPCODE_USHR:       ushr(d, s0, s1);                break;
+			case Shader::OPCODE_RSQX:       rsqx(d, s0, pp);                break;
+			case Shader::OPCODE_SQRT:       sqrt(d, s0, pp);                break;
+			case Shader::OPCODE_RSQ:        rsq(d, s0, pp);                 break;
+			case Shader::OPCODE_LEN2:       len2(d.x, s0, pp);              break;
+			case Shader::OPCODE_LEN3:       len3(d.x, s0, pp);              break;
+			case Shader::OPCODE_LEN4:       len4(d.x, s0, pp);              break;
+			case Shader::OPCODE_DIST1:      dist1(d.x, s0, s1, pp);         break;
+			case Shader::OPCODE_DIST2:      dist2(d.x, s0, s1, pp);         break;
+			case Shader::OPCODE_DIST3:      dist3(d.x, s0, s1, pp);         break;
+			case Shader::OPCODE_DIST4:      dist4(d.x, s0, s1, pp);         break;
+			case Shader::OPCODE_SGE:        step(d, s1, s0);                break;
+			case Shader::OPCODE_SGN:        sgn(d, s0);                     break;
+			case Shader::OPCODE_ISGN:       isgn(d, s0);                    break;
+			case Shader::OPCODE_SINCOS:     sincos(d, s0, pp);              break;
+			case Shader::OPCODE_COS:        cos(d, s0, pp);                 break;
+			case Shader::OPCODE_SIN:        sin(d, s0, pp);                 break;
+			case Shader::OPCODE_TAN:        tan(d, s0);                     break;
+			case Shader::OPCODE_ACOS:       acos(d, s0);                    break;
+			case Shader::OPCODE_ASIN:       asin(d, s0);                    break;
+			case Shader::OPCODE_ATAN:       atan(d, s0);                    break;
+			case Shader::OPCODE_ATAN2:      atan2(d, s0, s1);               break;
+			case Shader::OPCODE_COSH:       cosh(d, s0, pp);                break;
+			case Shader::OPCODE_SINH:       sinh(d, s0, pp);                break;
+			case Shader::OPCODE_TANH:       tanh(d, s0, pp);                break;
+			case Shader::OPCODE_ACOSH:      acosh(d, s0, pp);               break;
+			case Shader::OPCODE_ASINH:      asinh(d, s0, pp);               break;
+			case Shader::OPCODE_ATANH:      atanh(d, s0, pp);               break;
+			case Shader::OPCODE_SLT:        slt(d, s0, s1);                 break;
+			case Shader::OPCODE_SUB:        sub(d, s0, s1);                 break;
+			case Shader::OPCODE_ISUB:       isub(d, s0, s1);                break;
+			case Shader::OPCODE_BREAK:      BREAK();                        break;
+			case Shader::OPCODE_BREAKC:     BREAKC(s0, s1, control);        break;
+			case Shader::OPCODE_BREAKP:     BREAKP(src0);                   break;
+			case Shader::OPCODE_CONTINUE:   CONTINUE();                     break;
+			case Shader::OPCODE_TEST:       TEST();                         break;
+			case Shader::OPCODE_CALL:       CALL(dst.label, dst.callSite);  break;
+			case Shader::OPCODE_CALLNZ:     CALLNZ(dst.label, dst.callSite, src0); break;
+			case Shader::OPCODE_ELSE:       ELSE();                         break;
+			case Shader::OPCODE_ENDIF:      ENDIF();                        break;
+			case Shader::OPCODE_ENDLOOP:    ENDLOOP();                      break;
+			case Shader::OPCODE_ENDREP:     ENDREP();                       break;
+			case Shader::OPCODE_ENDWHILE:   ENDWHILE();                     break;
+			case Shader::OPCODE_ENDSWITCH:  ENDSWITCH();                    break;
+			case Shader::OPCODE_IF:         IF(src0);                       break;
+			case Shader::OPCODE_IFC:        IFC(s0, s1, control);           break;
+			case Shader::OPCODE_LABEL:      LABEL(dst.index);               break;
+			case Shader::OPCODE_LOOP:       LOOP(src1);                     break;
+			case Shader::OPCODE_REP:        REP(src0);                      break;
+			case Shader::OPCODE_WHILE:      WHILE(src0);                    break;
+			case Shader::OPCODE_SWITCH:     SWITCH();                       break;
+			case Shader::OPCODE_RET:        RET();                          break;
+			case Shader::OPCODE_LEAVE:      LEAVE();                        break;
+			case Shader::OPCODE_CMP:        cmp(d, s0, s1, control);        break;
+			case Shader::OPCODE_ICMP:       icmp(d, s0, s1, control);       break;
+			case Shader::OPCODE_UCMP:       ucmp(d, s0, s1, control);       break;
+			case Shader::OPCODE_SELECT:     select(d, s0, s1, s2);          break;
+			case Shader::OPCODE_EXTRACT:    extract(d.x, s0, s1.x);         break;
+			case Shader::OPCODE_INSERT:     insert(d, s0, s1.x, s2.x);      break;
+			case Shader::OPCODE_ALL:        all(d.x, s0);                   break;
+			case Shader::OPCODE_ANY:        any(d.x, s0);                   break;
+			case Shader::OPCODE_NOT:        bitwise_not(d, s0);             break;
+			case Shader::OPCODE_OR:         bitwise_or(d, s0, s1);          break;
+			case Shader::OPCODE_XOR:        bitwise_xor(d, s0, s1);         break;
+			case Shader::OPCODE_AND:        bitwise_and(d, s0, s1);         break;
+			case Shader::OPCODE_EQ:         equal(d, s0, s1);               break;
+			case Shader::OPCODE_NE:         notEqual(d, s0, s1);            break;
+			case Shader::OPCODE_TEXLDL:     TEXLOD(d, s0, src1, s0.w);      break;
+			case Shader::OPCODE_TEXLOD:     TEXLOD(d, s0, src1, s2.x);      break;
+			case Shader::OPCODE_TEX:        TEX(d, s0, src1);               break;
+			case Shader::OPCODE_TEXOFFSET:  TEXOFFSET(d, s0, src1, s2);     break;
+			case Shader::OPCODE_TEXLODOFFSET: TEXLODOFFSET(d, s0, src1, s2, s3.x); break;
+			case Shader::OPCODE_TEXELFETCH: TEXELFETCH(d, s0, src1, s2.x);  break;
+			case Shader::OPCODE_TEXELFETCHOFFSET: TEXELFETCHOFFSET(d, s0, src1, s2, s3.x); break;
+			case Shader::OPCODE_TEXGRAD:    TEXGRAD(d, s0, src1, s2, s3);   break;
+			case Shader::OPCODE_TEXGRADOFFSET: TEXGRADOFFSET(d, s0, src1, s2, s3, s4); break;
+			case Shader::OPCODE_TEXSIZE:    TEXSIZE(d, s0.x, src1);         break;
+			case Shader::OPCODE_END:                                        break;
+			default:
+				ASSERT(false);
+			}
+
+			if(dst.type != Shader::PARAMETER_VOID && dst.type != Shader::PARAMETER_LABEL && opcode != Shader::OPCODE_NOP)
+			{
+				if(dst.saturate)   // Clamp the written components to [0, 1]
+				{
+					if(dst.x) d.x = Max(d.x, Float4(0.0f));
+					if(dst.y) d.y = Max(d.y, Float4(0.0f));
+					if(dst.z) d.z = Max(d.z, Float4(0.0f));
+					if(dst.w) d.w = Max(d.w, Float4(0.0f));
+
+					if(dst.x) d.x = Min(d.x, Float4(1.0f));
+					if(dst.y) d.y = Min(d.y, Float4(1.0f));
+					if(dst.z) d.z = Min(d.z, Float4(1.0f));
+					if(dst.w) d.w = Min(d.w, Float4(1.0f));
+				}
+
+				if(instruction->isPredicated())   // Masked write: read the old destination value first, then blend per lane
+				{
+					Vector4f pDst;   // FIXME: Rename
+
+					switch(dst.type)
+					{
+					case Shader::PARAMETER_VOID: break;
+					case Shader::PARAMETER_TEMP:
+						if(dst.rel.type == Shader::PARAMETER_VOID)
+						{
+							if(dst.x) pDst.x = r[dst.index].x;
+							if(dst.y) pDst.y = r[dst.index].y;
+							if(dst.z) pDst.z = r[dst.index].z;
+							if(dst.w) pDst.w = r[dst.index].w;
+						}
+						else if(!dst.rel.dynamic)
+						{
+							Int a = dst.index + relativeAddress(dst.rel);
+
+							if(dst.x) pDst.x = r[a].x;
+							if(dst.y) pDst.y = r[a].y;
+							if(dst.z) pDst.z = r[a].z;
+							if(dst.w) pDst.w = r[a].w;
+						}
+						else
+						{
+							Int4 a = dst.index + dynamicAddress(dst.rel);
+
+							if(dst.x) pDst.x = r[a].x;
+							if(dst.y) pDst.y = r[a].y;
+							if(dst.z) pDst.z = r[a].z;
+							if(dst.w) pDst.w = r[a].w;
+						}
+						break;
+					case Shader::PARAMETER_ADDR: pDst = a0; break;
+					case Shader::PARAMETER_RASTOUT:
+						switch(dst.index)
+						{
+						case 0:
+							if(dst.x) pDst.x = o[Pos].x;
+							if(dst.y) pDst.y = o[Pos].y;
+							if(dst.z) pDst.z = o[Pos].z;
+							if(dst.w) pDst.w = o[Pos].w;
+							break;
+						case 1:
+							pDst.x = o[Fog].x;
+							break;
+						case 2:
+							pDst.x = o[Pts].y;
+							break;
+						default:
+							ASSERT(false);
+						}
+						break;
+					case Shader::PARAMETER_ATTROUT:
+						if(dst.x) pDst.x = o[C0 + dst.index].x;
+						if(dst.y) pDst.y = o[C0 + dst.index].y;
+						if(dst.z) pDst.z = o[C0 + dst.index].z;
+						if(dst.w) pDst.w = o[C0 + dst.index].w;
+						break;
+					case Shader::PARAMETER_TEXCRDOUT:
+				//	case Shader::PARAMETER_OUTPUT:
+						if(shaderModel < 0x0300)
+						{
+							if(dst.x) pDst.x = o[T0 + dst.index].x;
+							if(dst.y) pDst.y = o[T0 + dst.index].y;
+							if(dst.z) pDst.z = o[T0 + dst.index].z;
+							if(dst.w) pDst.w = o[T0 + dst.index].w;
+						}
+						else if(dst.rel.type == Shader::PARAMETER_VOID)   // Not relative
+						{
+							if(dst.x) pDst.x = o[dst.index].x;
+							if(dst.y) pDst.y = o[dst.index].y;
+							if(dst.z) pDst.z = o[dst.index].z;
+							if(dst.w) pDst.w = o[dst.index].w;
+						}
+						else if(!dst.rel.dynamic)
+						{
+							Int a = dst.index + relativeAddress(dst.rel);
+
+							if(dst.x) pDst.x = o[a].x;
+							if(dst.y) pDst.y = o[a].y;
+							if(dst.z) pDst.z = o[a].z;
+							if(dst.w) pDst.w = o[a].w;
+						}
+						else
+						{
+							Int4 a = dst.index + dynamicAddress(dst.rel);
+
+							if(dst.x) pDst.x = o[a].x;
+							if(dst.y) pDst.y = o[a].y;
+							if(dst.z) pDst.z = o[a].z;
+							if(dst.w) pDst.w = o[a].w;
+						}
+						break;
+					case Shader::PARAMETER_LABEL:                break;
+					case Shader::PARAMETER_PREDICATE: pDst = p0; break;
+					case Shader::PARAMETER_INPUT:                break;
+					default:
+						ASSERT(false);
+					}
+
+					Int4 enable = enableMask(instruction);
+
+					Int4 xEnable = enable;
+					Int4 yEnable = enable;
+					Int4 zEnable = enable;
+					Int4 wEnable = enable;
+
+					if(predicate)   // Further restrict each component's mask by the swizzled predicate register p0
+					{
+						unsigned char pSwizzle = instruction->predicateSwizzle;
+
+						Float4 xPredicate = p0[(pSwizzle >> 0) & 0x03];
+						Float4 yPredicate = p0[(pSwizzle >> 2) & 0x03];
+						Float4 zPredicate = p0[(pSwizzle >> 4) & 0x03];
+						Float4 wPredicate = p0[(pSwizzle >> 6) & 0x03];
+
+						if(!instruction->predicateNot)
+						{
+							if(dst.x) xEnable = xEnable & As<Int4>(xPredicate);
+							if(dst.y) yEnable = yEnable & As<Int4>(yPredicate);
+							if(dst.z) zEnable = zEnable & As<Int4>(zPredicate);
+							if(dst.w) wEnable = wEnable & As<Int4>(wPredicate);
+						}
+						else
+						{
+							if(dst.x) xEnable = xEnable & ~As<Int4>(xPredicate);
+							if(dst.y) yEnable = yEnable & ~As<Int4>(yPredicate);
+							if(dst.z) zEnable = zEnable & ~As<Int4>(zPredicate);
+							if(dst.w) wEnable = wEnable & ~As<Int4>(wPredicate);
+						}
+					}
+
+					if(dst.x) d.x = As<Float4>(As<Int4>(d.x) & xEnable);   // Keep the new value only where the mask is set...
+					if(dst.y) d.y = As<Float4>(As<Int4>(d.y) & yEnable);
+					if(dst.z) d.z = As<Float4>(As<Int4>(d.z) & zEnable);
+					if(dst.w) d.w = As<Float4>(As<Int4>(d.w) & wEnable);
+
+					if(dst.x) d.x = As<Float4>(As<Int4>(d.x) | (As<Int4>(pDst.x) & ~xEnable));   // ...and the previous destination value everywhere else
+					if(dst.y) d.y = As<Float4>(As<Int4>(d.y) | (As<Int4>(pDst.y) & ~yEnable));
+					if(dst.z) d.z = As<Float4>(As<Int4>(d.z) | (As<Int4>(pDst.z) & ~zEnable));
+					if(dst.w) d.w = As<Float4>(As<Int4>(d.w) | (As<Int4>(pDst.w) & ~wEnable));
+				}
+
+				switch(dst.type)   // Store d into the destination register, honoring the component write mask
+				{
+				case Shader::PARAMETER_VOID:
+					break;
+				case Shader::PARAMETER_TEMP:
+					if(dst.rel.type == Shader::PARAMETER_VOID)
+					{
+						if(dst.x) r[dst.index].x = d.x;
+						if(dst.y) r[dst.index].y = d.y;
+						if(dst.z) r[dst.index].z = d.z;
+						if(dst.w) r[dst.index].w = d.w;
+					}
+					else if(!dst.rel.dynamic)
+					{
+						Int a = dst.index + relativeAddress(dst.rel);
+
+						if(dst.x) r[a].x = d.x;
+						if(dst.y) r[a].y = d.y;
+						if(dst.z) r[a].z = d.z;
+						if(dst.w) r[a].w = d.w;
+					}
+					else
+					{
+						Int4 a = dst.index + dynamicAddress(dst.rel);
+
+						if(dst.x) r.scatter_x(a, d.x);
+						if(dst.y) r.scatter_y(a, d.y);
+						if(dst.z) r.scatter_z(a, d.z);
+						if(dst.w) r.scatter_w(a, d.w);
+					}
+					break;
+				case Shader::PARAMETER_ADDR:
+					if(dst.x) a0.x = d.x;
+					if(dst.y) a0.y = d.y;
+					if(dst.z) a0.z = d.z;
+					if(dst.w) a0.w = d.w;
+					break;
+				case Shader::PARAMETER_RASTOUT:
+					switch(dst.index)
+					{
+					case 0:
+						if(dst.x) o[Pos].x = d.x;
+						if(dst.y) o[Pos].y = d.y;
+						if(dst.z) o[Pos].z = d.z;
+						if(dst.w) o[Pos].w = d.w;
+						break;
+					case 1:
+						o[Fog].x = d.x;
+						break;
+					case 2:
+						o[Pts].y = d.x;
+						break;
+					default:	ASSERT(false);
+					}
+					break;
+				case Shader::PARAMETER_ATTROUT:
+					if(dst.x) o[C0 + dst.index].x = d.x;
+					if(dst.y) o[C0 + dst.index].y = d.y;
+					if(dst.z) o[C0 + dst.index].z = d.z;
+					if(dst.w) o[C0 + dst.index].w = d.w;
+					break;
+				case Shader::PARAMETER_TEXCRDOUT:
+			//	case Shader::PARAMETER_OUTPUT:
+					if(shaderModel < 0x0300)
+					{
+						if(dst.x) o[T0 + dst.index].x = d.x;
+						if(dst.y) o[T0 + dst.index].y = d.y;
+						if(dst.z) o[T0 + dst.index].z = d.z;
+						if(dst.w) o[T0 + dst.index].w = d.w;
+					}
+					else if(dst.rel.type == Shader::PARAMETER_VOID)   // Not relative
+					{
+						if(dst.x) o[dst.index].x = d.x;
+						if(dst.y) o[dst.index].y = d.y;
+						if(dst.z) o[dst.index].z = d.z;
+						if(dst.w) o[dst.index].w = d.w;
+					}
+					else if(!dst.rel.dynamic)
+					{
+						Int a = dst.index + relativeAddress(dst.rel);
+
+						if(dst.x) o[a].x = d.x;
+						if(dst.y) o[a].y = d.y;
+						if(dst.z) o[a].z = d.z;
+						if(dst.w) o[a].w = d.w;
+					}
+					else
+					{
+						Int4 a = dst.index + dynamicAddress(dst.rel);
+
+						if(dst.x) o.scatter_x(a, d.x);
+						if(dst.y) o.scatter_y(a, d.y);
+						if(dst.z) o.scatter_z(a, d.z);
+						if(dst.w) o.scatter_w(a, d.w);
+					}
+					break;
+				case Shader::PARAMETER_LABEL:             break;
+				case Shader::PARAMETER_PREDICATE: p0 = d; break;
+				case Shader::PARAMETER_INPUT:             break;
+				default:
+					ASSERT(false);
+				}
+			}
+		}
+
+		if(currentLabel != -1)   // A label was entered at some point: continue emitting code in returnBlock
+		{
+			Nucleus::setInsertBlock(returnBlock);
+		}
+	}
+
+	// Forwards vertex inputs to the outputs without executing shader instructions.
+	// With a shader bound, each output slot is copied from the input slot of the same
+	// index according to the output's declared usage. Without one, the fixed position,
+	// color, texture coordinate and point size inputs are forwarded.
+	void VertexProgram::passThrough()
+	{
+		if(!shader)
+		{
+			o[Pos].x = v[PositionT].x;
+			o[Pos].y = v[PositionT].y;
+			o[Pos].z = v[PositionT].z;
+			o[Pos].w = v[PositionT].w;
+
+			for(int colorIndex = 0; colorIndex < 2; colorIndex++)
+			{
+				o[C0 + colorIndex].x = v[Color0 + colorIndex].x;
+				o[C0 + colorIndex].y = v[Color0 + colorIndex].y;
+				o[C0 + colorIndex].z = v[Color0 + colorIndex].z;
+				o[C0 + colorIndex].w = v[Color0 + colorIndex].w;
+			}
+
+			for(int texIndex = 0; texIndex < 8; texIndex++)
+			{
+				o[T0 + texIndex].x = v[TexCoord0 + texIndex].x;
+				o[T0 + texIndex].y = v[TexCoord0 + texIndex].y;
+				o[T0 + texIndex].z = v[TexCoord0 + texIndex].z;
+				o[T0 + texIndex].w = v[TexCoord0 + texIndex].w;
+			}
+
+			o[Pts].y = v[PointSize].x;
+
+			return;
+		}
+
+		for(int slot = 0; slot < MAX_VERTEX_OUTPUTS; slot++)
+		{
+			const unsigned char outputUsage = shader->getOutput(slot, 0).usage;
+
+			switch(outputUsage)
+			{
+			case 0xFF:
+				break;   // Nothing is copied for this slot
+			case Shader::USAGE_PSIZE:
+				o[slot].y = v[slot].x;   // Written to the y component, like o[Pts].y in the shader-less path
+				break;
+			case Shader::USAGE_FOG:
+				o[slot].x = v[slot].x;
+				break;
+			case Shader::USAGE_TEXCOORD:
+			case Shader::USAGE_POSITION:
+			case Shader::USAGE_COLOR:
+				// All four components are forwarded unchanged.
+				o[slot].x = v[slot].x;
+				o[slot].y = v[slot].y;
+				o[slot].z = v[slot].z;
+				o[slot].w = v[slot].w;
+				break;
+			default:
+				ASSERT(false);
+			}
+		}
+	}
+
+	// Reads source operand 'src', with its register index advanced by 'offset', and returns
+	// it with the operand's swizzle and modifier applied. Operand types that return early
+	// inside the first switch bypass the swizzle/modifier stage.
+	Vector4f VertexProgram::fetchRegister(const Src &src, unsigned int offset)
+	{
+		Vector4f value;
+		unsigned int regIndex = src.index + offset;
+
+		switch(src.type)
+		{
+		case Shader::PARAMETER_VOID:
+		case Shader::PARAMETER_CONSTBOOL:
+		case Shader::PARAMETER_CONSTINT:
+		case Shader::PARAMETER_LOOP:
+		case Shader::PARAMETER_PREDICATE:
+			return r[0];   // Dummy
+		case Shader::PARAMETER_TEMP:
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				value = r[regIndex];
+			}
+			else if(!src.rel.dynamic)
+			{
+				value = r[regIndex + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else
+			{
+				value = r[regIndex + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_CONST:
+			value = readConstant(src, offset);
+			break;
+		case Shader::PARAMETER_INPUT:
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				value = v[regIndex];
+			}
+			else if(!src.rel.dynamic)
+			{
+				value = v[regIndex + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else
+			{
+				value = v[regIndex + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_OUTPUT:
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				value = o[regIndex];
+			}
+			else if(!src.rel.dynamic)
+			{
+				value = o[regIndex + relativeAddress(src.rel, src.bufferIndex)];
+			}
+			else
+			{
+				value = o[regIndex + dynamicAddress(src.rel)];
+			}
+			break;
+		case Shader::PARAMETER_FLOAT4LITERAL:
+			value.x = Float4(src.value[0]);
+			value.y = Float4(src.value[1]);
+			value.z = Float4(src.value[2]);
+			value.w = Float4(src.value[3]);
+			break;
+		case Shader::PARAMETER_ADDR:
+			value = a0;
+			break;
+		case Shader::PARAMETER_SAMPLER:
+			// Only x is written: the sampler index, optionally offset by a temporary register.
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				value.x = As<Float4>(Int4(regIndex));
+			}
+			else if(src.rel.type == Shader::PARAMETER_TEMP)
+			{
+				value.x = As<Float4>(Int4(regIndex) + As<Int4>(r[src.rel.index].x));
+			}
+			return value;
+		case Shader::PARAMETER_MISCTYPE:
+			// Only x is written, with the bit pattern of the instance or vertex ID.
+			if(src.index == Shader::InstanceIDIndex)
+			{
+				value.x = As<Float>(instanceID);
+			}
+			else if(src.index == Shader::VertexIDIndex)
+			{
+				value.x = As<Float4>(vertexID);
+			}
+			else ASSERT(false);
+			return value;
+		default:
+			ASSERT(false);
+		}
+
+		// Each 2-bit field of the swizzle picks the source component for x, y, z and w.
+		const Float4 &sx = value[(src.swizzle >> 0) & 0x3];
+		const Float4 &sy = value[(src.swizzle >> 2) & 0x3];
+		const Float4 &sz = value[(src.swizzle >> 4) & 0x3];
+		const Float4 &sw = value[(src.swizzle >> 6) & 0x3];
+
+		Vector4f result;
+
+		switch(src.modifier)
+		{
+		case Shader::MODIFIER_NONE:
+			result.x = sx;
+			result.y = sy;
+			result.z = sz;
+			result.w = sw;
+			break;
+		case Shader::MODIFIER_NEGATE:
+			result.x = -sx;
+			result.y = -sy;
+			result.z = -sz;
+			result.w = -sw;
+			break;
+		case Shader::MODIFIER_ABS:
+			result.x = Abs(sx);
+			result.y = Abs(sy);
+			result.z = Abs(sz);
+			result.w = Abs(sw);
+			break;
+		case Shader::MODIFIER_ABS_NEGATE:
+			result.x = -Abs(sx);
+			result.y = -Abs(sy);
+			result.z = -Abs(sz);
+			result.w = -Abs(sw);
+			break;
+		case Shader::MODIFIER_NOT:
+			// Bitwise complement of the raw bits.
+			result.x = As<Float4>(As<Int4>(sx) ^ Int4(0xFFFFFFFF));
+			result.y = As<Float4>(As<Int4>(sy) ^ Int4(0xFFFFFFFF));
+			result.z = As<Float4>(As<Int4>(sz) ^ Int4(0xFFFFFFFF));
+			result.w = As<Float4>(As<Int4>(sw) ^ Int4(0xFFFFFFFF));
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		return result;
+	}
+
+	// Returns the address of uniform 'index'. A bufferIndex of -1 addresses DrawData's
+	// embedded vs.c array; any other value loads the pointer stored in vs.u[bufferIndex]
+	// and adds 'index' to it.
+	RValue<Pointer<Byte>> VertexProgram::uniformAddress(int bufferIndex, unsigned int index)
+	{
+		if(bufferIndex != -1)
+		{
+			return *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.u[bufferIndex])) + index;
+		}
+
+		return data + OFFSET(DrawData, vs.c[index]);
+	}
+
+	// Overload that further advances the address returned by the two-argument version by
+	// 'offset' entries of sizeof(float4) bytes each. 'offset' is a run-time (Reactor) value.
+	RValue<Pointer<Byte>> VertexProgram::uniformAddress(int bufferIndex, unsigned int index, Int &offset)
+	{
+		return uniformAddress(bufferIndex, index) + offset * sizeof(float4);
+	}
+
+	// Reads constant operand 'src' (register index advanced by 'offset'). Three addressing
+	// modes are handled: absolute, relative with a single shared offset, and dynamic with a
+	// separate index per lane.
+	Vector4f VertexProgram::readConstant(const Src &src, unsigned int offset)
+	{
+		Vector4f c;
+		unsigned int i = src.index + offset;
+
+		if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
+		{
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i));
+
+			// Replicate one component of the loaded constant across each result vector.
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+
+			if(shader->containsDefineInstruction())   // Constant may be known at compile time
+			{
+				// Use the literal values of the first DEF instruction targeting this index, if any.
+				for(size_t j = 0; j < shader->getLength(); j++)
+				{
+					const Shader::Instruction &instruction = *shader->getInstruction(j);
+
+					if(instruction.opcode == Shader::OPCODE_DEF)
+					{
+						if(instruction.dst.index == i)
+						{
+							c.x = Float4(instruction.src[0].value[0]);
+							c.y = Float4(instruction.src[0].value[1]);
+							c.z = Float4(instruction.src[0].value[2]);
+							c.w = Float4(instruction.src[0].value[3]);
+
+							break;
+						}
+					}
+				}
+			}
+		}
+		else if(!src.rel.dynamic || src.rel.type == Shader::PARAMETER_LOOP)
+		{
+			// One offset shared by all lanes.
+			Int a = relativeAddress(src.rel, src.bufferIndex);
+
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, i, a));
+
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+		}
+		else
+		{
+			// Per-lane index: 'a' receives the selected component of the relative register.
+			int component = src.rel.swizzle & 0x03;
+			Float4 a;
+
+			switch(src.rel.type)
+			{
+			case Shader::PARAMETER_ADDR:     a = a0[component]; break;
+			case Shader::PARAMETER_TEMP:     a = r[src.rel.index][component]; break;
+			case Shader::PARAMETER_INPUT:    a = v[src.rel.index][component]; break;
+			case Shader::PARAMETER_OUTPUT:   a = o[src.rel.index][component]; break;
+			case Shader::PARAMETER_CONST:    a = *Pointer<Float>(uniformAddress(src.bufferIndex, src.rel.index) + component * sizeof(float)); break;
+			case Shader::PARAMETER_MISCTYPE:
+				switch(src.rel.index)
+				{
+				case Shader::InstanceIDIndex: a = As<Float4>(Int4(instanceID)); break;
+				case Shader::VertexIDIndex:   a = As<Float4>(vertexID);         break;
+				default: ASSERT(false);
+				}
+				break;
+			default: ASSERT(false);
+			}
+
+			// The bits of 'a' are interpreted as integers, scaled and added to the base index.
+			Int4 index = Int4(i) + As<Int4>(a) * Int4(src.rel.scale);
+
+			index = Min(As<UInt4>(index), UInt4(VERTEX_UNIFORM_VECTORS));   // Clamp to constant register range, c[VERTEX_UNIFORM_VECTORS] = {0, 0, 0, 0}
+
+			Int index0 = Extract(index, 0);
+			Int index1 = Extract(index, 1);
+			Int index2 = Extract(index, 2);
+			Int index3 = Extract(index, 3);
+
+			// 'index' already includes i, so the base uniform index passed here is 0.
+			c.x = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index0), 16);
+			c.y = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index1), 16);
+			c.z = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index2), 16);
+			c.w = *Pointer<Float4>(uniformAddress(src.bufferIndex, 0, index3), 16);
+
+			// c.x..c.w each hold one lane's constant at this point; presumably transpose4x4
+			// regroups them by component - confirm against its definition.
+			transpose4x4(c.x, c.y, c.z, c.w);
+		}
+
+		return c;
+	}
+
+	// Computes the scalar offset for a non-dynamic relative operand. For register and
+	// constant sources the integer bits of the first x value are multiplied by rel.scale;
+	// for PARAMETER_LOOP the current aL value is returned unscaled.
+	Int VertexProgram::relativeAddress(const Shader::Relative &rel, int bufferIndex)
+	{
+		ASSERT(!rel.dynamic);
+
+		switch(rel.type)
+		{
+		case Shader::PARAMETER_TEMP:
+			return As<Int>(Extract(r[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_INPUT:
+			return As<Int>(Extract(v[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_OUTPUT:
+			return As<Int>(Extract(o[rel.index].x, 0)) * rel.scale;
+		case Shader::PARAMETER_CONST:
+			return *Pointer<Int>(uniformAddress(bufferIndex, rel.index)) * rel.scale;
+		case Shader::PARAMETER_LOOP:
+			return aL[loopDepth];
+		default:
+			ASSERT(false);
+		}
+
+		return 0;
+	}
+
+	// Computes a per-lane offset for a dynamic relative operand: the component selected by
+	// the low two swizzle bits is read from the relative register, its bits are
+	// reinterpreted as integers and multiplied by rel.scale.
+	Int4 VertexProgram::dynamicAddress(const Shader::Relative &rel)
+	{
+		int selector = rel.swizzle & 0x03;
+		Float4 base;
+
+		switch(rel.type)
+		{
+		case Shader::PARAMETER_ADDR:
+			base = a0[selector];
+			break;
+		case Shader::PARAMETER_TEMP:
+			base = r[rel.index][selector];
+			break;
+		case Shader::PARAMETER_INPUT:
+			base = v[rel.index][selector];
+			break;
+		case Shader::PARAMETER_OUTPUT:
+			base = o[rel.index][selector];
+			break;
+		case Shader::PARAMETER_MISCTYPE:
+			if(rel.index == Shader::InstanceIDIndex)
+			{
+				base = As<Float>(instanceID);
+			}
+			else if(rel.index == Shader::VertexIDIndex)
+			{
+				base = As<Float4>(vertexID);
+			}
+			else ASSERT(false);
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		return As<Int4>(base) * Int4(rel.scale);
+	}
+
+	// Builds the lane mask under which 'instruction' executes. It starts from the current
+	// enableStack entry (or all ones when the instruction is not flagged analysisBranch)
+	// and, outside of a while-test, is narrowed by the break/continue/leave masks that the
+	// shader and the instruction's analysis flags call for.
+	Int4 VertexProgram::enableMask(const Shader::Instruction *instruction)
+	{
+		Int4 mask = instruction->analysisBranch ? Int4(enableStack[enableIndex]) : Int4(0xFFFFFFFF);
+
+		if(whileTest)
+		{
+			return mask;
+		}
+
+		if(shader->containsBreakInstruction() && instruction->analysisBreak)
+		{
+			mask &= enableBreak;
+		}
+
+		if(shader->containsContinueInstruction() && instruction->analysisContinue)
+		{
+			mask &= enableContinue;
+		}
+
+		if(shader->containsLeaveInstruction() && instruction->analysisLeave)
+		{
+			mask &= enableLeave;
+		}
+
+		return mask;
+	}
+
+	// Writes dst.x and dst.y with dot3 of src0 against the two consecutive registers
+	// starting at src1. dst.z and dst.w are left untouched.
+	void VertexProgram::M3X2(Vector4f &dst, Vector4f &src0, Src &src1)
+	{
+		Vector4f matrixRow0 = fetchRegister(src1, 0);
+		Vector4f matrixRow1 = fetchRegister(src1, 1);
+
+		dst.x = dot3(src0, matrixRow0);
+		dst.y = dot3(src0, matrixRow1);
+	}
+
+	// Writes dst.x, dst.y and dst.z with dot3 of src0 against the three consecutive
+	// registers starting at src1. dst.w is left untouched.
+	void VertexProgram::M3X3(Vector4f &dst, Vector4f &src0, Src &src1)
+	{
+		Vector4f matrixRow0 = fetchRegister(src1, 0);
+		Vector4f matrixRow1 = fetchRegister(src1, 1);
+		Vector4f matrixRow2 = fetchRegister(src1, 2);
+
+		dst.x = dot3(src0, matrixRow0);
+		dst.y = dot3(src0, matrixRow1);
+		dst.z = dot3(src0, matrixRow2);
+	}
+
+	// Writes all four components of dst with dot3 of src0 against the four consecutive
+	// registers starting at src1.
+	void VertexProgram::M3X4(Vector4f &dst, Vector4f &src0, Src &src1)
+	{
+		Vector4f matrixRow0 = fetchRegister(src1, 0);
+		Vector4f matrixRow1 = fetchRegister(src1, 1);
+		Vector4f matrixRow2 = fetchRegister(src1, 2);
+		Vector4f matrixRow3 = fetchRegister(src1, 3);
+
+		dst.x = dot3(src0, matrixRow0);
+		dst.y = dot3(src0, matrixRow1);
+		dst.z = dot3(src0, matrixRow2);
+		dst.w = dot3(src0, matrixRow3);
+	}
+
+	// Writes dst.x, dst.y and dst.z with dot4 of src0 against the three consecutive
+	// registers starting at src1. dst.w is left untouched.
+	void VertexProgram::M4X3(Vector4f &dst, Vector4f &src0, Src &src1)
+	{
+		Vector4f matrixRow0 = fetchRegister(src1, 0);
+		Vector4f matrixRow1 = fetchRegister(src1, 1);
+		Vector4f matrixRow2 = fetchRegister(src1, 2);
+
+		dst.x = dot4(src0, matrixRow0);
+		dst.y = dot4(src0, matrixRow1);
+		dst.z = dot4(src0, matrixRow2);
+	}
+
+	// Writes all four components of dst with dot4 of src0 against the four consecutive
+	// registers starting at src1.
+	void VertexProgram::M4X4(Vector4f &dst, Vector4f &src0, Src &src1)
+	{
+		Vector4f matrixRow0 = fetchRegister(src1, 0);
+		Vector4f matrixRow1 = fetchRegister(src1, 1);
+		Vector4f matrixRow2 = fetchRegister(src1, 2);
+		Vector4f matrixRow3 = fetchRegister(src1, 3);
+
+		dst.x = dot4(src0, matrixRow0);
+		dst.y = dot4(src0, matrixRow1);
+		dst.z = dot4(src0, matrixRow2);
+		dst.w = dot4(src0, matrixRow3);
+	}
+
+	// Unconditional break: clears from enableBreak every lane that is set in the current
+	// enableStack entry.
+	void VertexProgram::BREAK()
+	{
+		enableBreak = enableBreak & ~enableStack[enableIndex];
+	}
+
+	// Conditional break: compares the x components of src0 and src1 with the relation
+	// selected by 'control' and passes the resulting lane mask to BREAK(Int4&).
+	void VertexProgram::BREAKC(Vector4f &src0, Vector4f &src1, Control control)
+	{
+		Int4 laneMask;
+
+		switch(control)
+		{
+		case Shader::CONTROL_EQ: laneMask = CmpEQ(src0.x, src1.x);  break;
+		case Shader::CONTROL_NE: laneMask = CmpNEQ(src0.x, src1.x); break;
+		case Shader::CONTROL_LT: laneMask = CmpLT(src0.x, src1.x);  break;
+		case Shader::CONTROL_LE: laneMask = CmpLE(src0.x, src1.x);  break;
+		case Shader::CONTROL_GT: laneMask = CmpNLE(src0.x, src1.x); break;
+		case Shader::CONTROL_GE: laneMask = CmpNLT(src0.x, src1.x); break;
+		default:
+			ASSERT(false);
+		}
+
+		BREAK(laneMask);
+	}
+
+	// Predicated break: takes the p0 component selected by the low two swizzle bits,
+	// complements it when the operand carries MODIFIER_NOT, and passes it to BREAK(Int4&).
+	void VertexProgram::BREAKP(const Src &predicateRegister)   // FIXME: Factor out parts common with BREAKC
+	{
+		Int4 laneMask = As<Int4>(p0[predicateRegister.swizzle & 0x3]);
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			laneMask = ~laneMask;
+		}
+
+		BREAK(laneMask);
+	}
+
+	// Masked break: restricts 'condition' to the lanes set in the current enableStack entry
+	// (modifying the caller's variable) and clears those lanes from enableBreak.
+	void VertexProgram::BREAK(Int4 &condition)
+	{
+		condition &= enableStack[enableIndex];
+
+		enableBreak = enableBreak & ~condition;
+	}
+
+	// Clears from enableContinue every lane that is set in the current enableStack entry.
+	void VertexProgram::CONTINUE()
+	{
+		enableContinue = enableContinue & ~enableStack[enableIndex];
+	}
+
+	// Sets the whileTest flag. While it is set, enableMask() skips the break/continue/leave
+	// masks; ENDWHILE() clears it again.
+	void VertexProgram::TEST()
+	{
+		whileTest = true;
+	}
+
+	// Emits an unconditional call to the function at 'labelIndex' from call site
+	// 'callSiteIndex', then continues emitting code in that call site's return block.
+	void VertexProgram::CALL(int labelIndex, int callSiteIndex)
+	{
+		// The label's block is created on first reference.
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		// The call site index is only pushed when the label has more than one return
+		// destination; RET() pops it under the same condition.
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		Int4 restoreLeave = enableLeave;
+
+		Nucleus::createBr(labelBlock[labelIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		// Emitted in the return block: enableLeave is reset to its pre-call value.
+		enableLeave = restoreLeave;
+	}
+
+	// Conditional call dispatcher: forwards to the boolean-constant or predicate variant
+	// depending on the type of the condition operand. Other operand types are not expected.
+	void VertexProgram::CALLNZ(int labelIndex, int callSiteIndex, const Src &src)
+	{
+		switch(src.type)
+		{
+		case Shader::PARAMETER_CONSTBOOL:
+			CALLNZb(labelIndex, callSiteIndex, src);
+			break;
+		case Shader::PARAMETER_PREDICATE:
+			CALLNZp(labelIndex, callSiteIndex, src);
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
+	// Conditional call on a boolean constant: the function at 'labelIndex' is entered when
+	// byte vs.b[boolRegister.index] in DrawData is non-zero (inverted for MODIFIER_NOT);
+	// otherwise control goes straight to this call site's return block.
+	void VertexProgram::CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister)
+	{
+		Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData,vs.b[boolRegister.index])) != Byte(0));   // FIXME
+
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = !condition;
+		}
+
+		// The label's block is created on first reference.
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		// Pushed only when the label has more than one return destination; RET() pops it
+		// under the same condition.
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		Int4 restoreLeave = enableLeave;
+
+		branch(condition, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		// Emitted in the return block: enableLeave is reset to its pre-call value.
+		enableLeave = restoreLeave;
+	}
+
+	// Conditional call on the predicate register: the function at 'labelIndex' is entered
+	// when SignMask of the (optionally complemented) predicate, restricted to the currently
+	// enabled lanes, is non-zero.
+	void VertexProgram::CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister)
+	{
+		Int4 condition = As<Int4>(p0[predicateRegister.swizzle & 0x3]);
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = ~condition;
+		}
+
+		condition &= enableStack[enableIndex];
+
+		// The label's block is created on first reference.
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
+		// Pushed only when the label has more than one return destination; RET() pops it
+		// under the same condition.
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			callStack[stackIndex++] = UInt(callSiteIndex);
+		}
+
+		// The condition becomes the enable mask for the duration of the call.
+		enableIndex++;
+		enableStack[enableIndex] = condition;
+		Int4 restoreLeave = enableLeave;
+
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		// Emitted in the return block: pop the enable mask and reset enableLeave.
+		enableIndex--;
+		enableLeave = restoreLeave;
+	}
+
+	// Starts the else part of the innermost open if. ifDepth is decremented to address that
+	// if's entry and incremented again at the end, so the nesting depth is unchanged.
+	void VertexProgram::ELSE()
+	{
+		ifDepth--;
+
+		BasicBlock *falseBlock = ifFalseBlock[ifDepth];
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		if(isConditionalIf[ifDepth])
+		{
+			// Lanes for the else part: enabled in the enclosing scope but not in the if part.
+			Int4 condition = ~enableStack[enableIndex] & enableStack[enableIndex - 1];
+			Bool notAllFalse = SignMask(condition) != 0;
+
+			branch(notAllFalse, falseBlock, endBlock);
+
+			enableStack[enableIndex] = ~enableStack[enableIndex] & enableStack[enableIndex - 1];
+		}
+		else
+		{
+			Nucleus::createBr(endBlock);
+			Nucleus::setInsertBlock(falseBlock);
+		}
+
+		// ENDIF() will branch to and continue in this block.
+		ifFalseBlock[ifDepth] = endBlock;
+
+		ifDepth++;
+	}
+
+	// Closes the innermost open if: branches to the block recorded in ifFalseBlock and
+	// continues emitting there. For ifs opened through IF(Int4&) the enable mask pushed at
+	// that point is popped.
+	void VertexProgram::ENDIF()
+	{
+		ifDepth--;
+
+		BasicBlock *endBlock = ifFalseBlock[ifDepth];
+
+		Nucleus::createBr(endBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		if(isConditionalIf[ifDepth])
+		{
+			enableIndex--;
+		}
+	}
+
+	// Closes a LOOP block: advances aL by the loop's increment, branches back to the test
+	// block, and continues emitting in the loop's end block.
+	void VertexProgram::ENDLOOP()
+	{
+		loopRepDepth--;
+
+		aL[loopDepth] = aL[loopDepth] + increment[loopDepth];   // FIXME: +=
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		// Emitted in the end block: leave this loop level and re-enable all lanes for break.
+		loopDepth--;
+		enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+	}
+
+	// Closes a REP block: branches back to the test block and continues emitting in the
+	// loop's end block. Unlike ENDLOOP(), aL is not advanced.
+	void VertexProgram::ENDREP()
+	{
+		loopRepDepth--;
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		// Emitted in the end block: leave this loop level and re-enable all lanes for break.
+		loopDepth--;
+		enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+	}
+
+	// Closes a WHILE block: branches back to the test block, continues emitting in the end
+	// block, pops the enable mask pushed by WHILE() and clears the whileTest flag.
+	void VertexProgram::ENDWHILE()
+	{
+		loopRepDepth--;
+
+		BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		enableIndex--;
+		whileTest = false;
+	}
+
+	// Closes a SWITCH block: branches to the end block recorded by SWITCH() and continues
+	// emitting there.
+	void VertexProgram::ENDSWITCH()
+	{
+		loopRepDepth--;
+
+		BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(endBlock);
+		Nucleus::setInsertBlock(endBlock);
+	}
+
+	// Opens an if block on operand 'src'. Boolean constants and predicate registers have
+	// dedicated handlers; any other operand is fetched and the bits of its x component are
+	// used as the per-lane condition mask.
+	void VertexProgram::IF(const Src &src)
+	{
+		switch(src.type)
+		{
+		case Shader::PARAMETER_CONSTBOOL:
+			IFb(src);
+			break;
+		case Shader::PARAMETER_PREDICATE:
+			IFp(src);
+			break;
+		default:
+			{
+				Int4 laneMask = As<Int4>(fetchRegister(src).x);
+				IF(laneMask);
+			}
+			break;
+		}
+	}
+
+	// Opens an if block on a boolean constant: the condition is whether byte
+	// vs.b[boolRegister.index] in DrawData is non-zero (inverted for MODIFIER_NOT). The if
+	// is recorded as non-conditional, so no per-lane enable mask is pushed.
+	void VertexProgram::IFb(const Src &boolRegister)
+	{
+		// NOTE(review): the bound here is 24 + 4 while ifFalseBlock/isConditionalIf are
+		// declared with 24 + 24 entries in VertexProgram.hpp - confirm the intended limit.
+		ASSERT(ifDepth < 24 + 4);
+
+		Bool condition = (*Pointer<Byte>(data + OFFSET(DrawData,vs.b[boolRegister.index])) != Byte(0));   // FIXME
+
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			condition = !condition;
+		}
+
+		BasicBlock *trueBlock = Nucleus::createBasicBlock();
+		BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+		branch(condition, trueBlock, falseBlock);
+
+		// Remembered for the matching ELSE()/ENDIF().
+		isConditionalIf[ifDepth] = false;
+		ifFalseBlock[ifDepth] = falseBlock;
+
+		ifDepth++;
+	}
+
+	// Opens an if block on the predicate register: takes the p0 component selected by the
+	// low two swizzle bits, complements it when the operand carries MODIFIER_NOT, and
+	// passes it to IF(Int4&).
+	void VertexProgram::IFp(const Src &predicateRegister)
+	{
+		Int4 laneMask = As<Int4>(p0[predicateRegister.swizzle & 0x3]);
+
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
+		{
+			laneMask = ~laneMask;
+		}
+
+		IF(laneMask);
+	}
+
+	// Opens an if block on a comparison: compares the x components of src0 and src1 with
+	// the relation selected by 'control' and passes the resulting lane mask to IF(Int4&).
+	void VertexProgram::IFC(Vector4f &src0, Vector4f &src1, Control control)
+	{
+		Int4 laneMask;
+
+		switch(control)
+		{
+		case Shader::CONTROL_EQ: laneMask = CmpEQ(src0.x, src1.x);  break;
+		case Shader::CONTROL_NE: laneMask = CmpNEQ(src0.x, src1.x); break;
+		case Shader::CONTROL_LT: laneMask = CmpLT(src0.x, src1.x);  break;
+		case Shader::CONTROL_LE: laneMask = CmpLE(src0.x, src1.x);  break;
+		case Shader::CONTROL_GT: laneMask = CmpNLE(src0.x, src1.x); break;
+		case Shader::CONTROL_GE: laneMask = CmpNLT(src0.x, src1.x); break;
+		default:
+			ASSERT(false);
+		}
+
+		IF(laneMask);
+	}
+
+	// Opens an if block on a per-lane mask. 'condition' is restricted to the currently
+	// enabled lanes (modifying the caller's variable) and pushed as the new enable mask.
+	// The true block is entered when SignMask of the mask is non-zero.
+	void VertexProgram::IF(Int4 &condition)
+	{
+		condition &= enableStack[enableIndex];
+
+		enableIndex++;
+		enableStack[enableIndex] = condition;
+
+		BasicBlock *trueBlock = Nucleus::createBasicBlock();
+		BasicBlock *falseBlock = Nucleus::createBasicBlock();
+
+		Bool notAllFalse = SignMask(condition) != 0;
+
+		branch(notAllFalse, trueBlock, falseBlock);
+
+		// Remembered for the matching ELSE()/ENDIF(), which pop the enable mask again.
+		isConditionalIf[ifDepth] = true;
+		ifFalseBlock[ifDepth] = falseBlock;
+
+		ifDepth++;
+	}
+
+	// Begins the function body for 'labelIndex': its basic block is created if no earlier
+	// call has done so, becomes the insertion point, and the label is recorded as current.
+	void VertexProgram::LABEL(int labelIndex)
+	{
+		BasicBlock *&entryBlock = labelBlock[labelIndex];
+
+		if(entryBlock == nullptr)
+		{
+			entryBlock = Nucleus::createBasicBlock();
+		}
+
+		Nucleus::setInsertBlock(entryBlock);
+		currentLabel = labelIndex;
+	}
+
+	// Opens a LOOP block driven by integer constant vs.i[integerRegister.index] in
+	// DrawData: element 0 is loaded as the iteration count, element 1 as the initial aL
+	// value and element 2 as the per-iteration increment (applied by ENDLOOP()).
+	void VertexProgram::LOOP(const Src &integerRegister)
+	{
+		loopDepth++;
+
+		iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][0]));
+		aL[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][1]));
+		increment[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][2]));
+
+		// A zero increment is replaced by 1.
+		// FIXME: Compiles to two instructions?
+		If(increment[loopDepth] == 0)
+		{
+			increment[loopDepth] = 1;
+		}
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		// Remembered for the matching ENDLOOP().
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+
+		// Test block: run the body while iterations remain.
+		branch(iteration[loopDepth] > 0, loopBlock, endBlock);
+		Nucleus::setInsertBlock(loopBlock);
+
+		iteration[loopDepth] = iteration[loopDepth] - 1;   // FIXME: --
+
+		loopRepDepth++;
+	}
+
+	// Opens a REP block: the iteration count is loaded from element 0 of integer constant
+	// vs.i[integerRegister.index] in DrawData, and aL is inherited from the enclosing loop level.
+	void VertexProgram::REP(const Src &integerRegister)
+	{
+		loopDepth++;
+
+		iteration[loopDepth] = *Pointer<Int>(data + OFFSET(DrawData,vs.i[integerRegister.index][0]));
+		aL[loopDepth] = aL[loopDepth - 1];
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		// Remembered for the matching ENDREP().
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+
+		// Test block: run the body while iterations remain.
+		branch(iteration[loopDepth] > 0, loopBlock, endBlock);
+		Nucleus::setInsertBlock(loopBlock);
+
+		iteration[loopDepth] = iteration[loopDepth] - 1;   // FIXME: --
+
+		loopRepDepth++;
+	}
+
+	// Opens a WHILE block. A new enable-stack entry is reserved; in the test block it is
+	// filled with the bits of temporaryRegister.x restricted to the enclosing enable mask
+	// and, when the shader contains them, the leave and break masks. The body is entered
+	// while SignMask of that mask is non-zero.
+	void VertexProgram::WHILE(const Src &temporaryRegister)
+	{
+		enableIndex++;
+
+		BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		BasicBlock *testBlock = Nucleus::createBasicBlock();
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		// Remembered for the matching ENDWHILE().
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		// Snapshots taken before entering the test block.
+		Int4 restoreBreak = enableBreak;
+		Int4 restoreContinue = enableContinue;
+
+		// TODO: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+		enableContinue = restoreContinue;
+
+		const Vector4f &src = fetchRegister(temporaryRegister);
+		Int4 condition = As<Int4>(src.x);
+		condition &= enableStack[enableIndex - 1];
+		if(shader->containsLeaveInstruction()) condition &= enableLeave;
+		if(shader->containsBreakInstruction()) condition &= enableBreak;
+		enableStack[enableIndex] = condition;
+
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, loopBlock, endBlock);
+
+		// Emitted in the end block: enableBreak is reset to its value from before the loop.
+		Nucleus::setInsertBlock(endBlock);
+		enableBreak = restoreBreak;
+
+		// Subsequent instructions form the loop body.
+		Nucleus::setInsertBlock(loopBlock);
+
+		loopRepDepth++;
+	}
+
+	// Opens a SWITCH block. It shares the loop/rep block stack: there is no test block, and
+	// the end block is pre-populated with code that resets enableBreak to its value at
+	// switch entry.
+	void VertexProgram::SWITCH()
+	{
+		BasicBlock *endBlock = Nucleus::createBasicBlock();
+
+		loopRepTestBlock[loopRepDepth] = nullptr;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		Int4 restoreBreak = enableBreak;
+
+		BasicBlock *currentBlock = Nucleus::getInsertBlock();
+
+		// Temporarily emit into the end block, then resume where we were.
+		Nucleus::setInsertBlock(endBlock);
+		enableBreak = restoreBreak;
+
+		Nucleus::setInsertBlock(currentBlock);
+
+		loopRepDepth++;
+	}
+
+	// Emits a return. Outside of any label (currentLabel == -1) this branches to a freshly
+	// created return block. Inside a function it transfers control back to the call site:
+	// through a switch on the popped call-stack entry when there are several return
+	// destinations, or a direct branch when there is exactly one. Code following the
+	// return is emitted into a block terminated as unreachable.
+	void VertexProgram::RET()
+	{
+		if(currentLabel == -1)
+		{
+			returnBlock = Nucleus::createBasicBlock();
+			Nucleus::createBr(returnBlock);
+
+			return;
+		}
+
+		BasicBlock *deadBlock = Nucleus::createBasicBlock();
+		std::vector<BasicBlock*> &returnSites = callRetBlock[currentLabel];
+		const size_t siteCount = returnSites.size();
+
+		if(siteCount > 1)   // Pop the return destination from the call stack
+		{
+			// FIXME: Encapsulate
+			UInt callSite = callStack[--stackIndex];
+
+			Value *selector = callSite.loadValue();
+			SwitchCases *cases = Nucleus::createSwitch(selector, deadBlock, (int)siteCount);
+
+			for(unsigned int site = 0; site < siteCount; site++)
+			{
+				Nucleus::addSwitchCase(cases, site, returnSites[site]);
+			}
+		}
+		else if(siteCount == 1)   // Jump directly to the unique return destination
+		{
+			Nucleus::createBr(returnSites[0]);
+		}
+		else   // Function isn't called
+		{
+			Nucleus::createBr(deadBlock);
+		}
+
+		Nucleus::setInsertBlock(deadBlock);
+		Nucleus::createUnreachable();
+	}
+
+	// Clears from enableLeave every lane that is set in the current enableStack entry.
+	// No control flow is emitted here; the mask is applied through enableMask() and WHILE().
+	void VertexProgram::LEAVE()
+	{
+		enableLeave = enableLeave & ~enableStack[enableIndex];
+
+		// FIXME: Return from function if all instances left
+		// FIXME: Use enableLeave in other control-flow constructs
+	}
+
+	// Samples the texture selected by src1 at coordinates src0 with the Base sampler
+	// function. src0.x and src0 also fill sampleTexture's lod, dsx, dsy and offset
+	// parameters.
+	void VertexProgram::TEX(Vector4f &dst, Vector4f &src0, const Src &src1)
+	{
+		SamplerFunction samplerFunction = Base;
+
+		dst = sampleTexture(src1, src0, src0.x, src0, src0, src0, samplerFunction);
+	}
+
+	// Samples the texture selected by src1 at coordinates src0 with the {Base, Offset}
+	// sampler function and the given texel offset. src0.x and src0 also fill
+	// sampleTexture's lod, dsx and dsy parameters.
+	void VertexProgram::TEXOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset)
+	{
+		SamplerFunction samplerFunction = {Base, Offset};
+
+		dst = sampleTexture(src1, src0, src0.x, src0, src0, offset, samplerFunction);
+	}
+
+	// Samples the texture selected by src1 at coordinates src0 with the Lod sampler
+	// function and the given level of detail. src0 also fills sampleTexture's dsx, dsy
+	// and offset parameters.
+	void VertexProgram::TEXLOD(Vector4f &dst, Vector4f &src0, const Src& src1, Float4 &lod)
+	{
+		SamplerFunction samplerFunction = Lod;
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, src0, samplerFunction);
+	}
+
+	// Samples the texture selected by src1 at coordinates src0 with the {Lod, Offset}
+	// sampler function, the given level of detail and the given texel offset. src0 also
+	// fills sampleTexture's dsx and dsy parameters.
+	void VertexProgram::TEXLODOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset, Float4 &lod)
+	{
+		SamplerFunction samplerFunction = {Lod, Offset};
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, offset, samplerFunction);
+	}
+
+	// Reads from the texture selected by src1 at coordinates src0 with the Fetch sampler
+	// function and the given level of detail. src0 also fills sampleTexture's dsx, dsy
+	// and offset parameters.
+	void VertexProgram::TEXELFETCH(Vector4f &dst, Vector4f &src0, const Src& src1, Float4 &lod)
+	{
+		SamplerFunction samplerFunction = Fetch;
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, src0, samplerFunction);
+	}
+
+	// Reads from the texture selected by src1 at coordinates src0 with the {Fetch, Offset}
+	// sampler function, the given level of detail and the given texel offset. src0 also
+	// fills sampleTexture's dsx and dsy parameters.
+	void VertexProgram::TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &offset, Float4 &lod)
+	{
+		SamplerFunction samplerFunction = {Fetch, Offset};
+
+		dst = sampleTexture(src1, src0, lod, src0, src0, offset, samplerFunction);
+	}
+
+	// Samples the texture selected by src1 at coordinates src0 with the Grad sampler
+	// function and the given derivatives dsx/dsy. src0.x and src0 also fill
+	// sampleTexture's lod and offset parameters.
+	void VertexProgram::TEXGRAD(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy)
+	{
+		SamplerFunction samplerFunction = Grad;
+
+		dst = sampleTexture(src1, src0, src0.x, dsx, dsy, src0, samplerFunction);
+	}
+
+	// Samples the texture selected by src1 at coordinates src0 with the {Grad, Offset}
+	// sampler function, the given derivatives dsx/dsy and the given texel offset. src0.x
+	// also fills sampleTexture's lod parameter.
+	void VertexProgram::TEXGRADOFFSET(Vector4f &dst, Vector4f &src0, const Src& src1, Vector4f &dsx, Vector4f &dsy, Vector4f &offset)
+	{
+		SamplerFunction samplerFunction = {Grad, Offset};
+
+		dst = sampleTexture(src1, src0, src0.x, dsx, dsy, offset, samplerFunction);
+	}
+
+	// Writes to dst the result of SamplerCore::textureSize at level 'lod' for the Texture
+	// located src1.index entries past DrawData::mipmap[TEXTURE_IMAGE_UNITS].
+	void VertexProgram::TEXSIZE(Vector4f &dst, Float4 &lod, const Src &src1)
+	{
+		Pointer<Byte> texture = data + OFFSET(DrawData, mipmap[TEXTURE_IMAGE_UNITS]) + src1.index * sizeof(Texture);
+		dst = SamplerCore::textureSize(texture, lod);
+	}
+
+	// Samples through sampler operand 's' and applies s.swizzle to the result. A sampler
+	// operand without relative addressing is sampled directly by index; otherwise the
+	// index is read at run time and compared against every sampler the shader uses.
+	Vector4f VertexProgram::sampleTexture(const Src &s, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+	{
+		Vector4f tmp;
+
+		if(s.type == Shader::PARAMETER_SAMPLER && s.rel.type == Shader::PARAMETER_VOID)
+		{
+			tmp = sampleTexture(s.index, uvwq, lod, dsx, dsy, offset, function);
+		}
+		else
+		{
+			// The run-time sampler index is the integer bit pattern of the first x value.
+			Int index = As<Int>(Float(fetchRegister(s).x.x));
+
+			// Sampling code is emitted once per used sampler, each guarded by an index test.
+			// NOTE(review): tmp is only assigned inside these If() blocks, so it appears to
+			// stay unassigned when no index matches - confirm this cannot occur.
+			for(int i = 0; i < VERTEX_TEXTURE_IMAGE_UNITS; i++)
+			{
+				if(shader->usesSampler(i))
+				{
+					If(index == i)
+					{
+						tmp = sampleTexture(i, uvwq, lod, dsx, dsy, offset, function);
+						// FIXME: When the sampler states are the same, we could use one sampler and just index the texture
+					}
+				}
+			}
+		}
+
+		// Each 2-bit field of the swizzle picks the sampled component for x, y, z and w.
+		Vector4f c;
+		c.x = tmp[(s.swizzle >> 0) & 0x3];
+		c.y = tmp[(s.swizzle >> 2) & 0x3];
+		c.z = tmp[(s.swizzle >> 4) & 0x3];
+		c.w = tmp[(s.swizzle >> 6) & 0x3];
+
+		return c;
+	}
+
+	// Samples with a compile-time sampler index: builds a SamplerCore from
+	// state.sampler[sampler] and samples the Texture located 'sampler' entries past
+	// DrawData::mipmap[TEXTURE_IMAGE_UNITS], passing the four coordinate components separately.
+	Vector4f VertexProgram::sampleTexture(int sampler, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
+	{
+		Pointer<Byte> texture = data + OFFSET(DrawData, mipmap[TEXTURE_IMAGE_UNITS]) + sampler * sizeof(Texture);
+		return SamplerCore(constants, state.sampler[sampler]).sampleTexture(texture, uvwq.x, uvwq.y, uvwq.z, uvwq.w, lod, dsx, dsy, offset, function);
+	}
+}
diff --git a/src/Pipeline/VertexProgram.hpp b/src/Pipeline/VertexProgram.hpp
new file mode 100644
index 0000000..3c4199c
--- /dev/null
+++ b/src/Pipeline/VertexProgram.hpp
@@ -0,0 +1,139 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexProgram_hpp
+#define sw_VertexProgram_hpp
+
+#include "VertexRoutine.hpp"
+#include "ShaderCore.hpp"
+
+#include "SamplerCore.hpp"
+#include "Renderer/Stream.hpp"
+#include "Common/Types.hpp"
+
+namespace sw
+{
+	struct Stream;
+	class VertexShader;
+
+	// Vertex routine generator that emits code for a vertex shader's instructions. It
+	// builds on VertexRoutine and ShaderCore. Members of Reactor types (Int, Int4, Array,
+	// Vector4f, ...) are values of the generated routine; plain C++ members (int, bool,
+	// BasicBlock pointers) track state while the routine is being generated.
+	class VertexProgram : public VertexRoutine, public ShaderCore
+	{
+	public:
+		VertexProgram(const VertexProcessor::State &state, const VertexShader *vertexShader);
+
+		virtual ~VertexProgram();
+
+	private:
+		// Shader being translated. passThrough() tests it for null.
+		const VertexShader *const shader;
+
+		RegisterArray<NUM_TEMPORARY_REGISTERS> r;   // Temporary registers
+		Vector4f a0;          // Register accessed for PARAMETER_ADDR operands
+		Array<Int, 4> aL;     // Per loop depth; initialized by LOOP/REP, advanced by ENDLOOP
+		Vector4f p0;          // Register accessed for PARAMETER_PREDICATE operands
+
+		// Per loop depth: aL step and remaining iteration count.
+		Array<Int, 4> increment;
+		Array<Int, 4> iteration;
+
+		Int loopDepth;
+		Int stackIndex;   // FIXME: Inc/decrement callStack
+		Array<UInt, 16> callStack;   // Call site indices pushed by CALL*/popped by RET
+
+		// Lane masks controlling which lanes are active.
+		Int enableIndex;                   // Index of the current entry in enableStack
+		Array<Int4, 1 + 24> enableStack;
+		Int4 enableBreak;
+		Int4 enableContinue;
+		Int4 enableLeave;
+
+		Int instanceID;
+		Int4 vertexID;
+
+		typedef Shader::DestinationParameter Dst;
+		typedef Shader::SourceParameter Src;
+		typedef Shader::Control Control;
+		typedef Shader::Usage Usage;
+
+		void pipeline(UInt &index) override;
+		void program(UInt &index);
+		void passThrough();
+
+		// Operand access helpers.
+		Vector4f fetchRegister(const Src &src, unsigned int offset = 0);
+		Vector4f readConstant(const Src &src, unsigned int offset = 0);
+		RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index);
+		RValue<Pointer<Byte>> uniformAddress(int bufferIndex, unsigned int index, Int &offset);
+		Int relativeAddress(const Shader::Relative &rel, int bufferIndex = -1);
+		Int4 dynamicAddress(const Shader::Relative &rel);
+		Int4 enableMask(const Shader::Instruction *instruction);
+
+		// Per-instruction code generators.
+		void M3X2(Vector4f &dst, Vector4f &src0, Src &src1);
+		void M3X3(Vector4f &dst, Vector4f &src0, Src &src1);
+		void M3X4(Vector4f &dst, Vector4f &src0, Src &src1);
+		void M4X3(Vector4f &dst, Vector4f &src0, Src &src1);
+		void M4X4(Vector4f &dst, Vector4f &src0, Src &src1);
+		void BREAK();
+		void BREAKC(Vector4f &src0, Vector4f &src1, Control);
+		void BREAKP(const Src &predicateRegister);
+		void BREAK(Int4 &condition);
+		void CONTINUE();
+		void TEST();
+		void CALL(int labelIndex, int callSiteIndex);
+		void CALLNZ(int labelIndex, int callSiteIndex, const Src &src);
+		void CALLNZb(int labelIndex, int callSiteIndex, const Src &boolRegister);
+		void CALLNZp(int labelIndex, int callSiteIndex, const Src &predicateRegister);
+		void ELSE();
+		void ENDIF();
+		void ENDLOOP();
+		void ENDREP();
+		void ENDWHILE();
+		void ENDSWITCH();
+		void IF(const Src &src);
+		void IFb(const Src &boolRegister);
+		void IFp(const Src &predicateRegister);
+		void IFC(Vector4f &src0, Vector4f &src1, Control);
+		void IF(Int4 &condition);
+		void LABEL(int labelIndex);
+		void LOOP(const Src &integerRegister);
+		void REP(const Src &integerRegister);
+		void WHILE(const Src &temporaryRegister);
+		void SWITCH();
+		void RET();
+		void LEAVE();
+		void TEX(Vector4f &dst, Vector4f &src, const Src&);
+		void TEXOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &offset);
+		void TEXLOD(Vector4f &dst, Vector4f &src, const Src&, Float4 &lod);
+		void TEXLODOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &offset, Float4 &lod);
+		void TEXELFETCH(Vector4f &dst, Vector4f &src, const Src&, Float4 &lod);
+		void TEXELFETCHOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &offset, Float4 &lod);
+		void TEXGRAD(Vector4f &dst, Vector4f &src, const Src&, Vector4f &dsx, Vector4f &dsy);
+		void TEXGRADOFFSET(Vector4f &dst, Vector4f &src, const Src&, Vector4f &dsx, Vector4f &dsy, Vector4f &offset);
+		void TEXSIZE(Vector4f &dst, Float4 &lod, const Src&);
+
+		Vector4f sampleTexture(const Src &s, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
+		Vector4f sampleTexture(int sampler, Vector4f &uvwq, Float4 &lod, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function);
+
+		// Code-generation-time bookkeeping for structured control flow.
+		int ifDepth;         // Number of currently open if blocks
+		int loopRepDepth;    // Number of currently open loop/rep/while/switch blocks
+		int currentLabel;    // Label being emitted, or -1 outside of any label
+		bool whileTest;      // Set by TEST(), cleared by ENDWHILE()
+
+		BasicBlock *ifFalseBlock[24 + 24];      // Per open if: block the matching ELSE/ENDIF continues in
+		BasicBlock *loopRepTestBlock[4];        // Per open loop: condition test block (nullptr for SWITCH)
+		BasicBlock *loopRepEndBlock[4];         // Per open loop: block following the loop
+		BasicBlock *labelBlock[2048];           // Per label: function entry block, created on first use
+		std::vector<BasicBlock*> callRetBlock[2048];   // Per label: return block of each call site
+		BasicBlock *returnBlock;                // Target of RET outside of any label
+		bool isConditionalIf[24 + 24];          // Per open if: true when it pushed an enable mask
+	};
+}
+
+#endif   // sw_VertexProgram_hpp
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
new file mode 100644
index 0000000..9b8d336
--- /dev/null
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -0,0 +1,788 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexRoutine.hpp"
+
+#include "VertexShader.hpp"
+#include "Constants.hpp"
+#include "Renderer/Vertex.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Common/Half.hpp"
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
+	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
+
+	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader)
+		: v(shader && shader->indirectAddressableInput),   // Flag is false when no shader is supplied
+		  o(shader && shader->indirectAddressableOutput),   // Same null-shader handling for the output registers
+		  state(state)   // Bound to the 'state' member; it is declared as a reference in VertexRoutine.hpp, so no copy is made
+	{
+	}
+
+	VertexRoutine::~VertexRoutine()
+	{   // Intentionally empty: this destructor performs no explicit cleanup
+	}
+
+	void VertexRoutine::generate()
+	{   // Emits the routine: for each index in the batch, shade on a cache miss, then copy the cached vertex to the output
+		const bool textureSampling = state.textureSampling;
+
+		Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
+		Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);   // Storage for shaded vertices
+		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);   // Index last shaded into each group of four cache slots
+
+		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
+		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
+		UInt indexInPrimitive = 0;
+
+		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+
+		Do
+		{
+			UInt index = *Pointer<UInt>(batch);
+			UInt tagIndex = index & 0x0000003C;   // Bits 2..5 of the index; also used directly as the byte offset of the tag
+			UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.
+
+			If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)   // Cache miss: run the pipeline and refill this group of slots
+			{
+				*Pointer<UInt>(tagCache + tagIndex) = indexQ;
+
+				readInput(indexQ);
+				pipeline(indexQ);   // Pure virtual; supplied by the derived routine
+				postTransform();
+				computeClipFlags();
+
+				Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
+				writeCache(cacheLine0);   // Writes four consecutive Vertex slots starting at cacheLine0
+			}
+
+			UInt cacheIndex = index & 0x0000003F;   // Low six bits select the individual cached vertex
+			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
+			writeVertex(vertex, cacheLine);
+
+			if(state.transformFeedbackEnabled != 0)   // Plain C++ test: decided when the routine is generated
+			{
+				transformFeedback(vertex, primitiveNumber, indexInPrimitive);
+
+				indexInPrimitive++;
+				If(indexInPrimitive == 3)   // Counter wraps after three indices, advancing the primitive number
+				{
+					primitiveNumber++;
+					indexInPrimitive = 0;
+				}
+			}
+
+			vertex += sizeof(Vertex);   // Next output vertex
+			batch += sizeof(unsigned int);   // Next index in the batch
+			vertexCount--;
+		}
+		Until(vertexCount == 0)
+
+		Return();
+	}
+
+	void VertexRoutine::readInput(UInt &index)
+	{   // Fills every input register v[n] from its stream, using the base pointer and stride stored in DrawData
+		for(int attribute = 0; attribute < MAX_VERTEX_INPUTS; ++attribute)
+		{
+			Pointer<Byte> streamBase = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,input) + sizeof(void*) * attribute);
+			UInt streamStride = *Pointer<UInt>(data + OFFSET(DrawData,stride) + sizeof(unsigned int) * attribute);
+
+			v[attribute] = readStream(streamBase, streamStride, state.input[attribute], index);
+		}
+	}
+
+	void VertexRoutine::computeClipFlags()
+	{   // Builds clipFlags for the four vertices in flight by testing the position register against +/-w and a finite range
+		int pos = state.positionRegister;
+
+		Int4 maxX = CmpLT(o[pos].w, o[pos].x);   // Lane set where x > w
+		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
+		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
+		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);   // Lane set where NOT (-w <= x)
+		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
+		Int4 minZ = symmetricNormalizedDepth ? CmpNLE(-o[pos].w, o[pos].z) : CmpNLE(Float4(0.0f), o[pos].z);   // Near bound is -w or 0
+
+		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
+		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);   // SignMask(...) * 4 is a byte offset into a table of Int
+		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
+		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
+		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
+		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
+
+		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));   // |x| <= maxPos
+		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+
+		Int4 finiteXYZ = finiteX & finiteY & finiteZ;   // Lane set only when all three components are in range
+		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
+
+		if(state.preTransformed)
+		{
+			clipFlags &= 0xFBFBFBFB;   // Don't clip against far clip plane
+		}
+	}
+
+	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
+	{   // Loads one attribute for four vertices and returns it component-major: v.x holds the x of all four vertices, etc.
+		const bool textureSampling = state.textureSampling;
+
+		Vector4f v;
+
+		Pointer<Byte> source0 = buffer + index * stride;
+		Pointer<Byte> source1 = source0 + (!textureSampling ? stride : 0);   // With textureSampling all four lanes read the same vertex
+		Pointer<Byte> source2 = source1 + (!textureSampling ? stride : 0);
+		Pointer<Byte> source3 = source2 + (!textureSampling ? stride : 0);
+
+		bool isNativeFloatAttrib = (stream.attribType == VertexShader::ATTRIBTYPE_FLOAT) || stream.normalized;
+
+		switch(stream.type)
+		{
+		case STREAMTYPE_FLOAT:
+			{
+				if(stream.count == 0)
+				{
+					// Null stream, all default components
+				}
+				else
+				{
+					if(stream.count == 1)
+					{
+						v.x.x = *Pointer<Float>(source0);
+						v.x.y = *Pointer<Float>(source1);
+						v.x.z = *Pointer<Float>(source2);
+						v.x.w = *Pointer<Float>(source3);
+					}
+					else
+					{
+						v.x = *Pointer<Float4>(source0);
+						v.y = *Pointer<Float4>(source1);
+						v.z = *Pointer<Float4>(source2);
+						v.w = *Pointer<Float4>(source3);
+
+						transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+					}
+
+					switch(stream.attribType)
+					{
+					case VertexShader::ATTRIBTYPE_INT:   // Float data feeding an integer attribute: convert each component in place
+						if(stream.count >= 1) v.x = As<Float4>(Int4(v.x));
+						if(stream.count >= 2) v.y = As<Float4>(Int4(v.y));
+						if(stream.count >= 3) v.z = As<Float4>(Int4(v.z));
+						if(stream.count >= 4) v.w = As<Float4>(Int4(v.w));
+						break;
+					case VertexShader::ATTRIBTYPE_UINT:   // Same as above, converting to unsigned integers
+						if(stream.count >= 1) v.x = As<Float4>(UInt4(v.x));
+						if(stream.count >= 2) v.y = As<Float4>(UInt4(v.y));
+						if(stream.count >= 3) v.z = As<Float4>(UInt4(v.z));
+						if(stream.count >= 4) v.w = As<Float4>(UInt4(v.w));
+						break;
+					default:
+						break;
+					}
+				}
+			}
+			break;
+		case STREAMTYPE_BYTE:
+			if(isNativeFloatAttrib) // Stream: UByte, Shader attrib: Float
+			{
+				v.x = Float4(*Pointer<Byte4>(source0));
+				v.y = Float4(*Pointer<Byte4>(source1));
+				v.z = Float4(*Pointer<Byte4>(source2));
+				v.w = Float4(*Pointer<Byte4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+				}
+			}
+			else // Stream: UByte, Shader attrib: Int / UInt
+			{
+				v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
+				v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
+				v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
+				v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_SBYTE:
+			if(isNativeFloatAttrib) // Stream: SByte, Shader attrib: Float
+			{
+				v.x = Float4(*Pointer<SByte4>(source0));
+				v.y = Float4(*Pointer<SByte4>(source1));
+				v.z = Float4(*Pointer<SByte4>(source2));
+				v.w = Float4(*Pointer<SByte4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleSByte));
+				}
+			}
+			else // Stream: SByte, Shader attrib: Int / UInt
+			{
+				v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
+				v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
+				v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
+				v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_COLOR:
+			{
+				v.x = Float4(*Pointer<Byte4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+				v.y = Float4(*Pointer<Byte4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+				v.z = Float4(*Pointer<Byte4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+				v.w = Float4(*Pointer<Byte4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleByte));
+
+				transpose4x4(v.x, v.y, v.z, v.w);
+
+				// Swap red and blue
+				Float4 t = v.x;
+				v.x = v.z;
+				v.z = t;
+			}
+			break;
+		case STREAMTYPE_SHORT:
+			if(isNativeFloatAttrib) // Stream: Short, Shader attrib: Float
+			{
+				v.x = Float4(*Pointer<Short4>(source0));
+				v.y = Float4(*Pointer<Short4>(source1));
+				v.z = Float4(*Pointer<Short4>(source2));
+				v.w = Float4(*Pointer<Short4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleShort));
+				}
+			}
+			else // Stream: Short, Shader attrib: Int/UInt, no type conversion
+			{
+				v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
+				v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
+				v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
+				v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_USHORT:
+			if(isNativeFloatAttrib) // Stream: UShort, Shader attrib: Float
+			{
+				v.x = Float4(*Pointer<UShort4>(source0));
+				v.y = Float4(*Pointer<UShort4>(source1));
+				v.z = Float4(*Pointer<UShort4>(source2));
+				v.w = Float4(*Pointer<UShort4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants,unscaleUShort));
+				}
+			}
+			else // Stream: UShort, Shader attrib: Int/UInt, no type conversion
+			{
+				v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
+				v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
+				v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
+				v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_INT:
+			if(isNativeFloatAttrib) // Stream: Int, Shader attrib: Float
+			{
+				v.x = Float4(*Pointer<Int4>(source0));
+				v.y = Float4(*Pointer<Int4>(source1));
+				v.z = Float4(*Pointer<Int4>(source2));
+				v.w = Float4(*Pointer<Int4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleInt));
+				}
+			}
+			else // Stream: Int, Shader attrib: Int/UInt, no type conversion
+			{
+				v.x = *Pointer<Float4>(source0);
+				v.y = *Pointer<Float4>(source1);
+				v.z = *Pointer<Float4>(source2);
+				v.w = *Pointer<Float4>(source3);
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_UINT:
+			if(isNativeFloatAttrib) // Stream: UInt, Shader attrib: Float
+			{
+				v.x = Float4(*Pointer<UInt4>(source0));
+				v.y = Float4(*Pointer<UInt4>(source1));
+				v.z = Float4(*Pointer<UInt4>(source2));
+				v.w = Float4(*Pointer<UInt4>(source3));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+
+				if(stream.normalized)
+				{
+					if(stream.count >= 1) v.x *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+					if(stream.count >= 2) v.y *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+					if(stream.count >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+					if(stream.count >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUInt));
+				}
+			}
+			else // Stream: UInt, Shader attrib: Int/UInt, no type conversion
+			{
+				v.x = *Pointer<Float4>(source0);
+				v.y = *Pointer<Float4>(source1);
+				v.z = *Pointer<Float4>(source2);
+				v.w = *Pointer<Float4>(source3);
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_UDEC3:
+			{
+				// FIXME: Vectorize
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source0);
+
+					v.x.x = Float(x & 0x000003FF);
+					v.x.y = Float(y & 0x000FFC00);
+					v.x.z = Float(z & 0x3FF00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source1);
+
+					v.y.x = Float(x & 0x000003FF);
+					v.y.y = Float(y & 0x000FFC00);
+					v.y.z = Float(z & 0x3FF00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source2);
+
+					v.z.x = Float(x & 0x000003FF);
+					v.z.y = Float(y & 0x000FFC00);
+					v.z.z = Float(z & 0x3FF00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source3);
+
+					v.w.x = Float(x & 0x000003FF);
+					v.w.y = Float(y & 0x000FFC00);
+					v.w.z = Float(z & 0x3FF00000);
+				}
+
+				transpose4x3(v.x, v.y, v.z, v.w);
+
+				v.y *= Float4(1.0f / 0x00000400);   // Undo the 10-bit position of the masked (unshifted) y field
+				v.z *= Float4(1.0f / 0x00100000);   // Undo the 20-bit position of the masked (unshifted) z field
+			}
+			break;
+		case STREAMTYPE_DEC3N:
+			{
+				// FIXME: Vectorize
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source0);
+
+					v.x.x = Float((x << 22) & 0xFFC00000);
+					v.x.y = Float((y << 12) & 0xFFC00000);
+					v.x.z = Float((z << 2)  & 0xFFC00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source1);
+
+					v.y.x = Float((x << 22) & 0xFFC00000);
+					v.y.y = Float((y << 12) & 0xFFC00000);
+					v.y.z = Float((z << 2)  & 0xFFC00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source2);
+
+					v.z.x = Float((x << 22) & 0xFFC00000);
+					v.z.y = Float((y << 12) & 0xFFC00000);
+					v.z.z = Float((z << 2)  & 0xFFC00000);
+				}
+
+				{
+					Int x, y, z;
+
+					x = y = z = *Pointer<Int>(source3);
+
+					v.w.x = Float((x << 22) & 0xFFC00000);
+					v.w.y = Float((y << 12) & 0xFFC00000);
+					v.w.z = Float((z << 2)  & 0xFFC00000);
+				}
+
+				transpose4x3(v.x, v.y, v.z, v.w);
+
+				v.x *= Float4(1.0f / 0x00400000 / 511.0f);   // Fields were moved to the top 10 bits; scale back and normalize by 511
+				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
+				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
+			}
+			break;
+		case STREAMTYPE_FIXED:
+			{
+				v.x = Float4(*Pointer<Int4>(source0)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+				v.y = Float4(*Pointer<Int4>(source1)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+				v.z = Float4(*Pointer<Int4>(source2)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+				v.w = Float4(*Pointer<Int4>(source3)) * *Pointer<Float4>(constants + OFFSET(Constants,unscaleFixed));
+
+				transpose4xN(v.x, v.y, v.z, v.w, stream.count);
+			}
+			break;
+		case STREAMTYPE_HALF:
+			{
+				if(stream.count >= 1)
+				{
+					UShort x0 = *Pointer<UShort>(source0 + 0);
+					UShort x1 = *Pointer<UShort>(source1 + 0);
+					UShort x2 = *Pointer<UShort>(source2 + 0);
+					UShort x3 = *Pointer<UShort>(source3 + 0);
+
+					v.x.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x0) * 4);   // Table lookup indexed by the 16-bit pattern
+					v.x.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x1) * 4);
+					v.x.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x2) * 4);
+					v.x.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(x3) * 4);
+				}
+
+				if(stream.count >= 2)
+				{
+					UShort y0 = *Pointer<UShort>(source0 + 2);
+					UShort y1 = *Pointer<UShort>(source1 + 2);
+					UShort y2 = *Pointer<UShort>(source2 + 2);
+					UShort y3 = *Pointer<UShort>(source3 + 2);
+
+					v.y.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y0) * 4);
+					v.y.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y1) * 4);
+					v.y.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y2) * 4);
+					v.y.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(y3) * 4);
+				}
+
+				if(stream.count >= 3)
+				{
+					UShort z0 = *Pointer<UShort>(source0 + 4);
+					UShort z1 = *Pointer<UShort>(source1 + 4);
+					UShort z2 = *Pointer<UShort>(source2 + 4);
+					UShort z3 = *Pointer<UShort>(source3 + 4);
+
+					v.z.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z0) * 4);
+					v.z.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z1) * 4);
+					v.z.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z2) * 4);
+					v.z.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(z3) * 4);
+				}
+
+				if(stream.count >= 4)
+				{
+					UShort w0 = *Pointer<UShort>(source0 + 6);
+					UShort w1 = *Pointer<UShort>(source1 + 6);
+					UShort w2 = *Pointer<UShort>(source2 + 6);
+					UShort w3 = *Pointer<UShort>(source3 + 6);
+
+					v.w.x = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w0) * 4);
+					v.w.y = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w1) * 4);
+					v.w.z = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w2) * 4);
+					v.w.w = *Pointer<Float>(constants + OFFSET(Constants,half2float) + Int(w3) * 4);
+				}
+			}
+			break;
+		case STREAMTYPE_INDICES:
+			{
+				v.x.x = *Pointer<Float>(source0);
+				v.x.y = *Pointer<Float>(source1);
+				v.x.z = *Pointer<Float>(source2);
+				v.x.w = *Pointer<Float>(source3);
+			}
+			break;
+		case STREAMTYPE_2_10_10_10_INT:
+			{
+				Int4 src;
+				src = Insert(src, *Pointer<Int>(source0), 0);
+				src = Insert(src, *Pointer<Int>(source1), 1);
+				src = Insert(src, *Pointer<Int>(source2), 2);
+				src = Insert(src, *Pointer<Int>(source3), 3);
+
+				v.x = Float4((src << 22) >> 22);   // Shift up then back down to isolate each signed 10-bit field
+				v.y = Float4((src << 12) >> 22);
+				v.z = Float4((src << 02) >> 22);
+				v.w = Float4(src >> 30);
+
+				if(stream.normalized)
+				{
+					v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
+					v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
+					v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
+					v.w = Max(v.w, Float4(-1.0f));
+				}
+			}
+			break;
+		case STREAMTYPE_2_10_10_10_UINT:
+			{
+				Int4 src;
+				src = Insert(src, *Pointer<Int>(source0), 0);
+				src = Insert(src, *Pointer<Int>(source1), 1);
+				src = Insert(src, *Pointer<Int>(source2), 2);
+				src = Insert(src, *Pointer<Int>(source3), 3);
+
+				v.x = Float4(src & Int4(0x3FF));
+				v.y = Float4((src >> 10) & Int4(0x3FF));
+				v.z = Float4((src >> 20) & Int4(0x3FF));
+				v.w = Float4((src >> 30) & Int4(0x3));
+
+				if(stream.normalized)
+				{
+					v.x *= Float4(1.0f / 0x3FF);
+					v.y *= Float4(1.0f / 0x3FF);
+					v.z *= Float4(1.0f / 0x3FF);
+					v.w *= Float4(1.0f / 0x3);
+				}
+			}
+			break;
+		default:
+			ASSERT(false);
+		}
+
+		if(stream.count < 1) v.x = Float4(0.0f);   // Components the stream does not supply get defaults: 0 for x, y, z
+		if(stream.count < 2) v.y = Float4(0.0f);
+		if(stream.count < 3) v.z = Float4(0.0f);
+		if(stream.count < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(0));   // w: 1.0f for float attributes, integer 0 otherwise
+
+		return v;
+	}
+
+	void VertexRoutine::postTransform()
+	{   // Adjusts the position register after the pipeline has run; each step is selected when the routine is generated
+		int pos = state.positionRegister;
+
+		// Backtransform
+		if(state.preTransformed)
+		{
+			Float4 rhw = Float4(1.0f) / o[pos].w;
+
+			Float4 W = *Pointer<Float4>(data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);   // DrawData holds these values times 16
+			Float4 H = *Pointer<Float4>(data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
+			Float4 L = *Pointer<Float4>(data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
+			Float4 T = *Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
+
+			o[pos].x = (o[pos].x - L) / W * rhw;   // Inverse of the X0 + x * rhw * W mapping applied in writeCache()
+			o[pos].y = (o[pos].y - T) / H * rhw;
+			o[pos].z = o[pos].z * rhw;
+			o[pos].w = rhw;
+		}
+
+		if(!halfIntegerCoordinates && !state.preTransformed)
+		{
+			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelX)) * o[pos].w;   // Offset is scaled by w
+			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,halfPixelY)) * o[pos].w;
+		}
+
+		if(state.superSampling)
+		{
+			o[pos].x = o[pos].x + *Pointer<Float4>(data + OFFSET(DrawData,XXXX)) * o[pos].w;   // Additional w-scaled offset from DrawData
+			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
+		}
+	}
+
+	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
+	{   // Stores the four vertices in flight into four consecutive Vertex slots starting at cacheLine
+		Vector4f v;
+
+		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+		{
+			if(state.output[i].write)   // Only outputs with a non-zero write mask are stored
+			{
+				v.x = o[i].x;
+				v.y = o[i].y;
+				v.z = o[i].z;
+				v.w = o[i].w;
+
+				if(state.output[i].xClamp)   // Optional per-component clamp to [0, 1]
+				{
+					v.x = Max(v.x, Float4(0.0f));
+					v.x = Min(v.x, Float4(1.0f));
+				}
+
+				if(state.output[i].yClamp)
+				{
+					v.y = Max(v.y, Float4(0.0f));
+					v.y = Min(v.y, Float4(1.0f));
+				}
+
+				if(state.output[i].zClamp)
+				{
+					v.z = Max(v.z, Float4(0.0f));
+					v.z = Min(v.z, Float4(1.0f));
+				}
+
+				if(state.output[i].wClamp)
+				{
+					v.w = Max(v.w, Float4(0.0f));
+					v.w = Min(v.w, Float4(1.0f));
+				}
+
+				if(state.output[i].write == 0x01)   // Only x is written: store one scalar per vertex, no transpose needed
+				{
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0) = v.x.x;
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1) = v.x.y;
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2) = v.x.z;
+					*Pointer<Float>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3) = v.x.w;
+				}
+				else
+				{
+					if(state.output[i].write == 0x03)   // Only x and y are written
+					{
+						transpose2x4(v.x, v.y, v.z, v.w);
+					}
+					else
+					{
+						transpose4x4(v.x, v.y, v.z, v.w);
+					}
+
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;   // After the transpose, v.x is vertex 0, v.y vertex 1, ...
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
+					*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
+				}
+			}
+		}
+
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;   // One byte of clipFlags per vertex
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
+
+		// Viewport transform
+		int pos = state.positionRegister;
+
+		v.x = o[pos].x;
+		v.y = o[pos].y;
+		v.z = o[pos].z;
+		v.w = o[pos].w;
+
+		if(symmetricNormalizedDepth)
+		{
+			v.z = (v.z + v.w) * Float4(0.5f);   // [-1, 1] -> [0, 1]
+		}
+
+		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));   // Lanes with w == 0 get the bits of 1.0f ORed in, avoiding a divide by zero
+		Float4 rhw = Float4(1.0f) / w;
+
+		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));   // Rounded integer stored as raw bits
+		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
+		v.z = v.z * rhw;
+		v.w = rhw;
+
+		transpose4x4(v.x, v.y, v.z, v.w);
+
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;   // 16 bytes written per vertex starting at Vertex::X
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
+	}
+
+	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
+	{   // Copies one cached vertex (cache) to the output vertex (vertex)
+		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+		{
+			if(state.output[i].write)   // Same write-mask test as writeCache(), so only stored outputs are copied
+			{
+				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);   // Whole 16-byte register
+			}
+		}
+
+		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));   // 16 bytes starting at Vertex::X
+		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
+	}
+
+	void VertexRoutine::transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive)
+	{   // Copies the enabled feedback varyings of one output vertex into their destination buffers
+		If(indexInPrimitive < state.verticesPerPrimitive)
+		{
+			UInt vertexSlot = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;   // Running vertex number in the feedback output
+
+			for(int varying = 0; varying < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; ++varying)
+			{
+				if(state.transformFeedbackEnabled & (1ULL << varying))   // One enable bit per varying
+				{
+					UInt regOffset = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[varying]));
+					UInt rowCount = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[varying]));
+					UInt colCount = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[varying]));
+					UInt strideInFloats = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[varying]));
+
+					Pointer<Byte> destination = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[varying])) + (vertexSlot * strideInFloats * sizeof(float));
+					Pointer<Byte> source = vertex + OFFSET(Vertex, v) + regOffset * sizeof(float);
+
+					For(UInt rowIndex = 0, rowIndex < rowCount, rowIndex++)
+					{
+						UInt packedRowOffset = rowIndex * colCount * sizeof(float);   // Rows are tightly packed in the destination
+						UInt registerRowOffset = rowIndex * sizeof(float4);   // Each source row occupies a full float4
+
+						For(UInt colIndex = 0, colIndex < colCount, colIndex++)
+						{
+							UInt columnOffset = colIndex * sizeof(float);
+							*Pointer<Float>(destination + packedRowOffset + columnOffset) = *Pointer<Float>(source + registerRowOffset + columnOffset);
+						}
+					}
+				}
+			}
+		}
+	}
+}
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
new file mode 100644
index 0000000..905118b
--- /dev/null
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -0,0 +1,71 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexRoutine_hpp
+#define sw_VertexRoutine_hpp
+
+#include "Renderer/Color.hpp"
+#include "Renderer/VertexProcessor.hpp"
+#include "ShaderCore.hpp"
+#include "VertexShader.hpp"
+
+namespace sw
+{
+	class VertexRoutinePrototype : public Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)>
+	{   // Gives names to the four pointer arguments of the generated vertex routine
+	public:
+		VertexRoutinePrototype() : vertex(Arg<0>()), batch(Arg<1>()), task(Arg<2>()), data(Arg<3>()) {}
+		virtual ~VertexRoutinePrototype() {};
+
+	protected:
+		Pointer<Byte> vertex;   // Argument 0: output cursor; VertexRoutine::generate() advances it by sizeof(Vertex) per index
+		Pointer<Byte> batch;   // Argument 1: index cursor; advanced by sizeof(unsigned int) per index
+		Pointer<Byte> task;   // Argument 2: addressed with OFFSET(VertexTask, ...)
+		Pointer<Byte> data;   // Argument 3: addressed with OFFSET(DrawData, ...)
+	};
+
+	class VertexRoutine : public VertexRoutinePrototype
+	{   // Shared vertex routine generator; derived classes supply the shading step through pipeline()
+	public:
+		VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader);
+		virtual ~VertexRoutine();
+
+		void generate();   // Emits the whole routine body, ending with Return()
+
+	protected:
+		Pointer<Byte> constants;   // Loaded from DrawData::constants at the start of generate()
+
+		Int clipFlags;   // Set by computeClipFlags(); one byte per vertex, unpacked in writeCache()
+
+		RegisterArray<MAX_VERTEX_INPUTS> v;    // Input registers
+		RegisterArray<MAX_VERTEX_OUTPUTS> o;   // Output registers
+
+		const VertexProcessor::State &state;   // Reference, not a copy: the referenced state must outlive this object
+
+	private:
+		virtual void pipeline(UInt &index) = 0;   // Called by generate() between readInput() and postTransform()
+
+		typedef VertexProcessor::State::Input Stream;
+
+		Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index);   // Loads one attribute for four vertices
+		void readInput(UInt &index);   // Calls readStream() for every input register
+		void computeClipFlags();
+		void postTransform();
+		void writeCache(Pointer<Byte> &cacheLine);   // Stores four shaded vertices into the vertex cache
+		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheLine);   // Copies one cached vertex to the output
+		void transformFeedback(const Pointer<Byte> &vertex, const UInt &primitiveNumber, const UInt &indexInPrimitive);
+	};
+}
+
+#endif   // sw_VertexRoutine_hpp
diff --git a/src/Pipeline/VertexShader.cpp b/src/Pipeline/VertexShader.cpp
new file mode 100644
index 0000000..8f1c4f8
--- /dev/null
+++ b/src/Pipeline/VertexShader.cpp
@@ -0,0 +1,330 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "VertexShader.hpp"
+
+#include "Renderer/Vertex.hpp"
+#include "Common/Debug.hpp"
+
+#include <string.h>
+
+namespace sw
+{
+	VertexShader::VertexShader(const VertexShader *vs) : Shader()   // Builds an empty shader, or a copy of 'vs' when it is non-null
+	{
+		shaderModel = 0x0300;   // Not below 0x0300, so analyzeOutput() takes its declaration-based branch
+		positionRegister = Pos;
+		pointSizeRegister = Unused;
+		instanceIdDeclared = false;
+		vertexIdDeclared = false;
+		textureSampling = false;
+
+		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)   // Reset every input to a default semantic and float type
+		{
+			input[i] = Semantic();
+			attribType[i] = ATTRIBTYPE_FLOAT;
+		}
+
+		if(vs)   // Make a copy
+		{
+			for(size_t i = 0; i < vs->getLength(); i++)
+			{
+				append(new sw::Shader::Instruction(*vs->getInstruction(i)));   // Each instruction is cloned, not shared
+			}
+
+			memcpy(output, vs->output, sizeof(output));
+			memcpy(input, vs->input, sizeof(input));
+			memcpy(attribType, vs->attribType, sizeof(attribType));
+			positionRegister = vs->positionRegister;
+			pointSizeRegister = vs->pointSizeRegister;
+			instanceIdDeclared = vs->instanceIdDeclared;
+			vertexIdDeclared = vs->vertexIdDeclared;
+			usedSamplers = vs->usedSamplers;
+
+			optimize();
+			analyze();   // Recomputes derived state such as textureSampling, which is not copied above
+		}
+	}
+
+	VertexShader::VertexShader(const unsigned long *token) : Shader()   // Builds a shader from a token stream
+	{
+		parse(token);   // NOTE(review): shaderModel is not assigned in this constructor; presumably parse() sets it - confirm
+
+		positionRegister = Pos;
+		pointSizeRegister = Unused;
+		instanceIdDeclared = false;
+		vertexIdDeclared = false;
+		textureSampling = false;
+
+		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)   // Reset every input to a default semantic and float type
+		{
+			input[i] = Semantic();
+			attribType[i] = ATTRIBTYPE_FLOAT;
+		}
+
+		optimize();
+		analyze();   // Fills in inputs, outputs and the other derived state from the parsed instructions
+	}
+
+	VertexShader::~VertexShader()   // Empty body: no cleanup is performed at this level
+	{
+	}
+
+	int VertexShader::validate(const unsigned long *const token)   // Returns 0 when the token stream is rejected, otherwise a positive count
+	{
+		if(!token)
+		{
+			return 0;
+		}
+
+		unsigned short version = (unsigned short)(token[0] & 0x0000FFFF);   // Low 16 bits of the first token
+		unsigned char majorVersion = (unsigned char)((token[0] & 0x0000FF00) >> 8);   // Bits 8-15 of the first token
+		ShaderType shaderType = (ShaderType)((token[0] & 0xFFFF0000) >> 16);   // High 16 bits of the first token
+
+		if(shaderType != SHADER_VERTEX || majorVersion > 3)
+		{
+			return 0;
+		}
+
+		int instructionCount = 1;   // NOTE(review): starts at 1 and the loop also visits token[0] - confirm the intended count
+
+		for(int i = 0; token[i] != 0x0000FFFF; i++)   // A token equal to 0x0000FFFF ends the scan
+		{
+			if((token[i] & 0x0000FFFF) == 0x0000FFFE)   // Comment token
+			{
+				int length = (token[i] & 0x7FFF0000) >> 16;   // Comment length is held in bits 16-30
+
+				i += length;   // Skip the payload; the loop's i++ steps over the comment token itself
+			}
+			else
+			{
+				Shader::Opcode opcode = (Shader::Opcode)(token[i] & 0x0000FFFF);
+
+				switch(opcode)
+				{
+				case Shader::OPCODE_TEXCOORD:
+				case Shader::OPCODE_TEXKILL:
+				case Shader::OPCODE_TEX:
+				case Shader::OPCODE_TEXBEM:
+				case Shader::OPCODE_TEXBEML:
+				case Shader::OPCODE_TEXREG2AR:
+				case Shader::OPCODE_TEXREG2GB:
+				case Shader::OPCODE_TEXM3X2PAD:
+				case Shader::OPCODE_TEXM3X2TEX:
+				case Shader::OPCODE_TEXM3X3PAD:
+				case Shader::OPCODE_TEXM3X3TEX:
+				case Shader::OPCODE_RESERVED0:
+				case Shader::OPCODE_TEXM3X3SPEC:
+				case Shader::OPCODE_TEXM3X3VSPEC:
+				case Shader::OPCODE_TEXREG2RGB:
+				case Shader::OPCODE_TEXDP3TEX:
+				case Shader::OPCODE_TEXM3X2DEPTH:
+				case Shader::OPCODE_TEXDP3:
+				case Shader::OPCODE_TEXM3X3:
+				case Shader::OPCODE_TEXDEPTH:
+				case Shader::OPCODE_CMP0:
+				case Shader::OPCODE_BEM:
+				case Shader::OPCODE_DP2ADD:
+				case Shader::OPCODE_DFDX:
+				case Shader::OPCODE_DFDY:
+				case Shader::OPCODE_TEXLDD:
+					return 0;   // Unsupported operation
+				default:
+					instructionCount++;
+					break;
+				}
+
+				i += size(token[i], version);   // Advance by size(); the loop's i++ then adds one more
+			}
+		}
+
+		return instructionCount;
+	}
+
+	bool VertexShader::containsTextureSampling() const   // Reports the flag computed by analyzeTextureSampling()
+	{
+		return textureSampling;
+	}
+
+	void VertexShader::setInput(int inputIdx, const sw::Shader::Semantic& semantic, AttribType aType)   // Sets semantic and attribute type of one input
+	{
+		input[inputIdx] = semantic;   // inputIdx is not range-checked here; the arrays hold MAX_VERTEX_INPUTS entries
+		attribType[inputIdx] = aType;
+	}
+
+	void VertexShader::setOutput(int outputIdx, int nbComponents, const sw::Shader::Semantic& semantic)   // Tags the first nbComponents components of one output
+	{
+		for(int i = 0; i < nbComponents; ++i)   // Neither index is range-checked; each output row has 4 components
+		{
+			output[outputIdx][i] = semantic;
+		}
+	}
+
+	void VertexShader::setPositionRegister(int posReg)   // Marks output posReg as position and remembers its index
+	{
+		setOutput(posReg, 4, sw::Shader::Semantic(sw::Shader::USAGE_POSITION, 0));   // All four components
+		positionRegister = posReg;
+	}
+
+	void VertexShader::setPointSizeRegister(int ptSizeReg)   // Marks output ptSizeReg as point size and remembers its index
+	{
+		setOutput(ptSizeReg, 4, sw::Shader::Semantic(sw::Shader::USAGE_PSIZE, 0));   // All four components
+		pointSizeRegister = ptSizeReg;
+	}
+
+	const sw::Shader::Semantic& VertexShader::getInput(int inputIdx) const   // Reference into the member array; inputIdx is not range-checked
+	{
+		return input[inputIdx];
+	}
+
+	VertexShader::AttribType VertexShader::getAttribType(int inputIdx) const   // Attribute type of one input; inputIdx is not range-checked
+	{
+		return attribType[inputIdx];
+	}
+
+	const sw::Shader::Semantic& VertexShader::getOutput(int outputIdx, int component) const   // Reference into the member array; indices are not range-checked
+	{
+		return output[outputIdx][component];
+	}
+
+	void VertexShader::analyze()   // Runs every analysis pass; called at the end of both constructors' setup
+	{
+		analyzeInput();             // Defined in this file
+		analyzeOutput();            // Defined in this file
+		analyzeDirtyConstants();
+		analyzeTextureSampling();   // Defined in this file
+		analyzeDynamicBranching();
+		analyzeSamplers();
+		analyzeCallSites();
+		analyzeIndirectAddressing();
+	}
+
+	void VertexShader::analyzeInput()   // Records the semantic of every declared input register
+	{
+		for(const auto &inst : instruction)
+		{
+			if(inst->opcode == Shader::OPCODE_DCL &&
+			   inst->dst.type == Shader::PARAMETER_INPUT)
+			{
+				int reg = inst->dst.index;   // Register named by the declaration
+
+				input[reg] = Semantic(inst->usage, inst->usageIndex);
+			}
+		}
+	}
+
+	void VertexShader::analyzeOutput()   // Fills output[][] and the position/point-size register indices
+	{
+		if(shaderModel < 0x0300)   // Outputs are inferred from the destination registers that get written
+		{
+			output[Pos][0] = Semantic(Shader::USAGE_POSITION, 0);
+			output[Pos][1] = Semantic(Shader::USAGE_POSITION, 0);
+			output[Pos][2] = Semantic(Shader::USAGE_POSITION, 0);
+			output[Pos][3] = Semantic(Shader::USAGE_POSITION, 0);
+
+			for(const auto &inst : instruction)
+			{
+				const DestinationParameter &dst = inst->dst;
+
+				switch(dst.type)
+				{
+				case Shader::PARAMETER_RASTOUT:
+					switch(dst.index)
+					{
+					case 0:
+						// Position already assumed written
+						break;
+					case 1:
+						output[Fog][0] = Semantic(Shader::USAGE_FOG, 0);
+						break;
+					case 2:
+						output[Pts][1] = Semantic(Shader::USAGE_PSIZE, 0);   // NOTE(review): component 1, not 0 - confirm this is intended
+						pointSizeRegister = Pts;
+						break;
+					default: ASSERT(false);
+					}
+					break;
+				case Shader::PARAMETER_ATTROUT:
+					if(dst.index == 0)
+					{
+						if(dst.x) output[C0][0] = Semantic(Shader::USAGE_COLOR, 0);
+						if(dst.y) output[C0][1] = Semantic(Shader::USAGE_COLOR, 0);
+						if(dst.z) output[C0][2] = Semantic(Shader::USAGE_COLOR, 0);
+						if(dst.w) output[C0][3] = Semantic(Shader::USAGE_COLOR, 0);
+					}
+					else if(dst.index == 1)
+					{
+						if(dst.x) output[C1][0] = Semantic(Shader::USAGE_COLOR, 1);
+						if(dst.y) output[C1][1] = Semantic(Shader::USAGE_COLOR, 1);
+						if(dst.z) output[C1][2] = Semantic(Shader::USAGE_COLOR, 1);
+						if(dst.w) output[C1][3] = Semantic(Shader::USAGE_COLOR, 1);
+					}
+					else ASSERT(false);
+					break;
+				case Shader::PARAMETER_TEXCRDOUT:
+					if(dst.x) output[T0 + dst.index][0] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+					if(dst.y) output[T0 + dst.index][1] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+					if(dst.z) output[T0 + dst.index][2] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+					if(dst.w) output[T0 + dst.index][3] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+					break;
+				default:
+					break;
+				}
+			}
+		}
+		else   // Shader Model 3.0 output declaration
+		{
+			for(const auto &inst : instruction)
+			{
+				if(inst->opcode == Shader::OPCODE_DCL &&
+				   inst->dst.type == Shader::PARAMETER_OUTPUT)
+				{
+					unsigned char usage = inst->usage;
+					unsigned char usageIndex = inst->usageIndex;
+
+					const DestinationParameter &dst = inst->dst;
+
+					if(dst.x) output[dst.index][0] = Semantic(usage, usageIndex);   // Only components named in the write mask are tagged
+					if(dst.y) output[dst.index][1] = Semantic(usage, usageIndex);
+					if(dst.z) output[dst.index][2] = Semantic(usage, usageIndex);
+					if(dst.w) output[dst.index][3] = Semantic(usage, usageIndex);
+
+					if(usage == Shader::USAGE_POSITION && usageIndex == 0)
+					{
+						positionRegister = dst.index;
+					}
+
+					if(usage == Shader::USAGE_PSIZE && usageIndex == 0)
+					{
+						pointSizeRegister = dst.index;
+					}
+				}
+			}
+		}
+	}
+
+	void VertexShader::analyzeTextureSampling()   // Sets textureSampling when any instruction has a sampler operand
+	{
+		textureSampling = false;
+
+		for(const auto &inst : instruction)
+		{
+			if(inst->src[1].type == PARAMETER_SAMPLER)   // Only the second source operand is examined
+			{
+				textureSampling = true;
+				break;   // One hit is enough
+			}
+		}
+	}
+}
diff --git a/src/Pipeline/VertexShader.hpp b/src/Pipeline/VertexShader.hpp
new file mode 100644
index 0000000..9a9a0a6
--- /dev/null
+++ b/src/Pipeline/VertexShader.hpp
@@ -0,0 +1,78 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_VertexShader_hpp
+#define sw_VertexShader_hpp
+
+#include "Shader.hpp"
+#include "Main/Config.hpp"
+
+namespace sw
+{
+	class VertexShader : public Shader   // Shader specialization that tracks vertex inputs, outputs and special registers
+	{
+	public:
+		enum AttribType : unsigned char   // Data type of a vertex input
+		{
+			ATTRIBTYPE_FLOAT,
+			ATTRIBTYPE_INT,
+			ATTRIBTYPE_UINT,
+
+			ATTRIBTYPE_LAST = ATTRIBTYPE_UINT
+		};
+
+		explicit VertexShader(const VertexShader *vs = 0);   // Empty shader, or a copy of 'vs' when non-null
+		explicit VertexShader(const unsigned long *token);   // Built from a token stream
+
+		virtual ~VertexShader();
+
+		static int validate(const unsigned long *const token);   // Returns number of instructions if valid
+		bool containsTextureSampling() const;
+
+		void setInput(int inputIdx, const Semantic& semantic, AttribType attribType = ATTRIBTYPE_FLOAT);
+		void setOutput(int outputIdx, int nbComponents, const Semantic& semantic);
+		void setPositionRegister(int posReg);
+		void setPointSizeRegister(int ptSizeReg);
+		void declareInstanceId() { instanceIdDeclared = true; }
+		void declareVertexId() { vertexIdDeclared = true; }
+
+		const Semantic& getInput(int inputIdx) const;
+		const Semantic& getOutput(int outputIdx, int component) const;
+		AttribType getAttribType(int inputIndex) const;
+		int getPositionRegister() const { return positionRegister; }
+		int getPointSizeRegister() const { return pointSizeRegister; }
+		bool isInstanceIdDeclared() const { return instanceIdDeclared; }
+		bool isVertexIdDeclared() const { return vertexIdDeclared; }
+
+	private:
+		void analyze();
+		void analyzeInput();
+		void analyzeOutput();
+		void analyzeTextureSampling();
+
+		Semantic input[MAX_VERTEX_INPUTS];        // One semantic per input register
+		Semantic output[MAX_VERTEX_OUTPUTS][4];   // One semantic per component of each output register
+
+		AttribType attribType[MAX_VERTEX_INPUTS];   // Parallel to input[]
+
+		int positionRegister;    // Index of the output holding position
+		int pointSizeRegister;   // Index of the output holding point size
+
+		bool instanceIdDeclared;
+		bool vertexIdDeclared;
+		bool textureSampling;   // Computed by analyzeTextureSampling()
+	};
+}
+
+#endif   // sw_VertexShader_hpp
diff --git a/src/System/CPUID.cpp b/src/System/CPUID.cpp
new file mode 100644
index 0000000..c080034
--- /dev/null
+++ b/src/System/CPUID.cpp
@@ -0,0 +1,301 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "CPUID.hpp"
+
+#if defined(_WIN32)
+	#ifndef WIN32_LEAN_AND_MEAN
+		#define WIN32_LEAN_AND_MEAN
+	#endif
+	#include <windows.h>
+	#include <intrin.h>
+	#include <float.h>
+#else
+	#include <unistd.h>
+	#include <sched.h>
+	#include <sys/types.h>
+#endif
+
+namespace sw
+{
+	bool CPUID::MMX = detectMMX();   // Detected capabilities, each filled in by its detect function during static initialization
+	bool CPUID::CMOV = detectCMOV();
+	bool CPUID::SSE = detectSSE();
+	bool CPUID::SSE2 = detectSSE2();
+	bool CPUID::SSE3 = detectSSE3();
+	bool CPUID::SSSE3 = detectSSSE3();
+	bool CPUID::SSE4_1 = detectSSE4_1();
+	int CPUID::cores = detectCoreCount();
+	int CPUID::affinity = detectAffinity();
+
+	bool CPUID::enableMMX = true;   // Software switches, all on by default; changed through the setEnable* functions
+	bool CPUID::enableCMOV = true;
+	bool CPUID::enableSSE = true;
+	bool CPUID::enableSSE2 = true;
+	bool CPUID::enableSSE3 = true;
+	bool CPUID::enableSSSE3 = true;
+	bool CPUID::enableSSE4_1 = true;
+
+	void CPUID::setEnableMMX(bool enable)   // Sets the MMX switch; disabling it also disables every SSE level
+	{
+		enableMMX = enable;
+
+		if(!enableMMX)   // Enabling MMX leaves the SSE switches as they were
+		{
+			enableSSE = false;
+			enableSSE2 = false;
+			enableSSE3 = false;
+			enableSSSE3 = false;
+			enableSSE4_1 = false;
+		}
+	}
+
+	void CPUID::setEnableCMOV(bool enable)   // Sets the CMOV switch; disabling it also disables every SSE level
+	{
+		enableCMOV = enable;
+
+		if(!enableCMOV)   // Test the switch just set (as setEnableMMX does), not the detected hardware flag CMOV
+		{
+			enableSSE = false;
+			enableSSE2 = false;
+			enableSSE3 = false;
+			enableSSSE3 = false;
+			enableSSE4_1 = false;
+		}
+	}
+
+	void CPUID::setEnableSSE(bool enable)   // Sets the SSE switch and keeps the other switches consistent with it
+	{
+		enableSSE = enable;
+
+		if(!enable)   // Off: every later SSE level goes off too
+		{
+			enableSSE2 = false;
+			enableSSE3 = false;
+			enableSSSE3 = false;
+			enableSSE4_1 = false;
+		}
+		else   // On: the earlier features go on too
+		{
+			enableMMX = true;
+			enableCMOV = true;
+		}
+	}
+
+	void CPUID::setEnableSSE2(bool enable)   // Sets the SSE2 switch and keeps the other switches consistent with it
+	{
+		enableSSE2 = enable;
+
+		if(!enable)   // Off: every later SSE level goes off too
+		{
+			enableSSE3 = false;
+			enableSSSE3 = false;
+			enableSSE4_1 = false;
+		}
+		else   // On: the earlier features go on too
+		{
+			enableMMX = true;
+			enableCMOV = true;
+			enableSSE = true;
+		}
+	}
+
+	void CPUID::setEnableSSE3(bool enable)   // Sets the SSE3 switch and keeps the other switches consistent with it
+	{
+		enableSSE3 = enable;
+
+		if(!enable)   // Off: every later SSE level goes off too
+		{
+			enableSSSE3 = false;
+			enableSSE4_1 = false;
+		}
+		else   // On: the earlier features go on too
+		{
+			enableMMX = true;
+			enableCMOV = true;
+			enableSSE = true;
+			enableSSE2 = true;
+		}
+	}
+
+	void CPUID::setEnableSSSE3(bool enable)   // Sets the SSSE3 switch and keeps the other switches consistent with it
+	{
+		enableSSSE3 = enable;
+
+		if(!enable)   // Off: the later SSE4.1 level goes off too
+		{
+			enableSSE4_1 = false;
+		}
+		else   // On: the earlier features go on too
+		{
+			enableMMX = true;
+			enableCMOV = true;
+			enableSSE = true;
+			enableSSE2 = true;
+			enableSSE3 = true;
+		}
+	}
+
+	void CPUID::setEnableSSE4_1(bool enable)   // Sets the SSE4.1 switch; enabling it enables all earlier features
+	{
+		enableSSE4_1 = enable;
+
+		if(enable)   // Disabling SSE4.1 changes no other switch
+		{
+			enableSSSE3 = true;
+			enableSSE3 = true;
+			enableSSE2 = true;
+			enableSSE = true;
+			enableCMOV = true;
+			enableMMX = true;
+		}
+	}
+
+	static void cpuid(int registers[4], int info)   // Executes CPUID for leaf 'info'; results go to registers[0..3]
+	{
+		#if defined(__i386__) || defined(__x86_64__)
+			#if defined(_WIN32)
+				__cpuid(registers, info);
+			#else
+				__asm volatile("cpuid": "=a" (registers[0]), "=b" (registers[1]), "=c" (registers[2]), "=d" (registers[3]): "a" (info));   // EAX, EBX, ECX, EDX in that order
+			#endif
+		#else
+			registers[0] = 0;   // Other architectures: all zero, so every feature bit reads as absent
+			registers[1] = 0;
+			registers[2] = 0;
+			registers[3] = 0;
+		#endif
+	}
+
+	bool CPUID::detectMMX()   // Queries leaf 1, stores the result in MMX and returns it
+	{
+		int registers[4];
+		cpuid(registers, 1);
+		return MMX = (registers[3] & 0x00800000) != 0;   // EDX bit 23
+	}
+
+	bool CPUID::detectCMOV()   // Queries leaf 1, stores the result in CMOV and returns it
+	{
+		int registers[4];
+		cpuid(registers, 1);
+		return CMOV = (registers[3] & 0x00008000) != 0;   // EDX bit 15
+	}
+
+	bool CPUID::detectSSE()   // Queries leaf 1, stores the result in SSE and returns it
+	{
+		int registers[4];
+		cpuid(registers, 1);
+		return SSE = (registers[3] & 0x02000000) != 0;   // EDX bit 25
+	}
+
+	bool CPUID::detectSSE2()   // Queries leaf 1, stores the result in SSE2 and returns it
+	{
+		int registers[4];
+		cpuid(registers, 1);
+		return SSE2 = (registers[3] & 0x04000000) != 0;   // EDX bit 26
+	}
+
+	bool CPUID::detectSSE3()   // Queries leaf 1, stores the result in SSE3 and returns it
+	{
+		int registers[4];
+		cpuid(registers, 1);
+		return SSE3 = (registers[2] & 0x00000001) != 0;   // ECX bit 0
+	}
+
+	bool CPUID::detectSSSE3()   // Queries leaf 1, stores the result in SSSE3 and returns it
+	{
+		int registers[4];
+		cpuid(registers, 1);
+		return SSSE3 = (registers[2] & 0x00000200) != 0;   // ECX bit 9
+	}
+
+	bool CPUID::detectSSE4_1()   // Queries leaf 1, stores the result in SSE4_1 and returns it
+	{
+		int registers[4];
+		cpuid(registers, 1);
+		return SSE4_1 = (registers[2] & 0x00080000) != 0;   // ECX bit 19
+	}
+
+	int CPUID::detectCoreCount()   // Returns a processor count clamped to the range 1..16
+	{
+		int cores = 0;
+
+		#if defined(_WIN32)
+			DWORD_PTR processAffinityMask = 1;   // Both masks default to one bit; the call's return value is not checked
+			DWORD_PTR systemAffinityMask = 1;
+
+			GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
+
+			while(systemAffinityMask)   // Count the set bits of the system mask
+			{
+				if(systemAffinityMask & 1)
+				{
+					cores++;
+				}
+
+				systemAffinityMask >>= 1;
+			}
+		#else
+			cores = sysconf(_SC_NPROCESSORS_ONLN);   // A failure value of -1 is raised to 1 by the clamp below
+		#endif
+
+		if(cores < 1)  cores = 1;
+		if(cores > 16) cores = 16;
+
+		return cores;   // FIXME: Number of physical cores
+	}
+
+	int CPUID::detectAffinity()   // Returns how many processors this process may run on, clamped to 1..16
+	{
+		int cores = 0;   // Only used on the _WIN32 path
+
+		#if defined(_WIN32)
+			DWORD_PTR processAffinityMask = 1;   // Both masks default to one bit; the call's return value is not checked
+			DWORD_PTR systemAffinityMask = 1;
+
+			GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask);
+
+			while(processAffinityMask)   // Count the set bits of the process mask
+			{
+				if(processAffinityMask & 1)
+				{
+					cores++;
+				}
+
+				processAffinityMask >>= 1;
+			}
+		#else
+			return detectCoreCount();   // FIXME: Assumes no affinity limitation
+		#endif
+
+		if(cores < 1)  cores = 1;   // Not reached on the non-Windows path, which returns above
+		if(cores > 16) cores = 16;
+
+		return cores;
+	}
+
+	void CPUID::setFlushToZero(bool enable)   // Only has an effect when built with _MSC_VER defined
+	{
+		#if defined(_MSC_VER)
+			_controlfp(enable ? _DN_FLUSH : _DN_SAVE, _MCW_DN);   // Selects denormal handling in the floating-point control word
+		#else
+			// Unimplemented
+		#endif
+	}
+
+	void CPUID::setDenormalsAreZero(bool enable)   // Currently a no-op: 'enable' is ignored
+	{
+		// Unimplemented
+	}
+}
diff --git a/src/System/CPUID.hpp b/src/System/CPUID.hpp
new file mode 100644
index 0000000..3c21cd7
--- /dev/null
+++ b/src/System/CPUID.hpp
@@ -0,0 +1,137 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_CPUID_hpp
+#define sw_CPUID_hpp
+
+namespace sw
+{
+	#if !defined(__i386__) && defined(_M_IX86)   // Map the _M_IX86 macro onto __i386__
+		#define __i386__ 1
+	#endif
+
+	#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))   // Map _M_AMD64/_M_X64 onto __x86_64__
+		#define __x86_64__ 1
+	#endif
+
+	class CPUID   // Static-only holder of detected CPU features and per-feature enable switches
+	{
+	public:
+		static bool supportsMMX();   // Each supports*() is true only if the feature was detected AND its switch is on
+		static bool supportsCMOV();
+		static bool supportsMMX2();   // MMX instructions added by SSE: pshufw, pmulhuw, pmovmskb, pavgw/b, pextrw, pinsrw, pmaxsw/ub, etc.
+		static bool supportsSSE();
+		static bool supportsSSE2();
+		static bool supportsSSE3();
+		static bool supportsSSSE3();
+		static bool supportsSSE4_1();
+		static int coreCount();
+		static int processAffinity();
+
+		static void setEnableMMX(bool enable);   // The setters also adjust dependent switches; see CPUID.cpp
+		static void setEnableCMOV(bool enable);
+		static void setEnableSSE(bool enable);
+		static void setEnableSSE2(bool enable);
+		static void setEnableSSE3(bool enable);
+		static void setEnableSSSE3(bool enable);
+		static void setEnableSSE4_1(bool enable);
+
+		static void setFlushToZero(bool enable);        // Denormal results are written as zero
+		static void setDenormalsAreZero(bool enable);   // Denormal inputs are read as zero
+
+	private:
+		static bool MMX;   // Detected hardware capabilities
+		static bool CMOV;
+		static bool SSE;
+		static bool SSE2;
+		static bool SSE3;
+		static bool SSSE3;
+		static bool SSE4_1;
+		static int cores;
+		static int affinity;
+
+		static bool enableMMX;   // Software switches
+		static bool enableCMOV;
+		static bool enableSSE;
+		static bool enableSSE2;
+		static bool enableSSE3;
+		static bool enableSSSE3;
+		static bool enableSSE4_1;
+
+		static bool detectMMX();   // Detection helpers used to initialize the capability members
+		static bool detectCMOV();
+		static bool detectSSE();
+		static bool detectSSE2();
+		static bool detectSSE3();
+		static bool detectSSSE3();
+		static bool detectSSE4_1();
+		static int detectCoreCount();
+		static int detectAffinity();
+	};
+}
+
+namespace sw
+{
+	inline bool CPUID::supportsMMX()   // Detected and enabled
+	{
+		return MMX && enableMMX;
+	}
+
+	inline bool CPUID::supportsCMOV()   // Detected and enabled
+	{
+		return CMOV && enableCMOV;
+	}
+
+	inline bool CPUID::supportsMMX2()
+	{
+		return supportsSSE();   // Coincides with 64-bit integer vector instructions supported by SSE
+	}
+
+	inline bool CPUID::supportsSSE()   // Detected and enabled
+	{
+		return SSE && enableSSE;
+	}
+
+	inline bool CPUID::supportsSSE2()   // Detected and enabled
+	{
+		return SSE2 && enableSSE2;
+	}
+
+	inline bool CPUID::supportsSSE3()   // Detected and enabled
+	{
+		return SSE3 && enableSSE3;
+	}
+
+	inline bool CPUID::supportsSSSE3()   // Detected and enabled
+	{
+		return SSSE3 && enableSSSE3;
+	}
+
+	inline bool CPUID::supportsSSE4_1()   // Detected and enabled
+	{
+		return SSE4_1 && enableSSE4_1;
+	}
+
+	inline int CPUID::coreCount()   // Value computed once by detectCoreCount()
+	{
+		return cores;
+	}
+
+	inline int CPUID::processAffinity()   // Value computed once by detectAffinity()
+	{
+		return affinity;
+	}
+}
+
+#endif   // sw_CPUID_hpp
diff --git a/src/System/Configurator.cpp b/src/System/Configurator.cpp
new file mode 100644
index 0000000..ead1d28
--- /dev/null
+++ b/src/System/Configurator.cpp
@@ -0,0 +1,255 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Configurator.hpp"
+
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <ctype.h>
+
+#if defined(__unix__)
+#include <unistd.h>
+#endif
+
+namespace sw
+{
+	Configurator::Configurator(string iniPath)   // Remembers the path and loads the file immediately
+	{
+		path = iniPath;
+
+		readFile();   // The result is ignored; a failed read simply leaves fewer (or no) sections
+	}
+
+	Configurator::~Configurator()   // Empty body: nothing is written back on destruction
+	{
+	}
+
+	bool Configurator::readFile()   // Parses the INI file at 'path'; returns true if at least one section name is known afterwards
+	{
+		#if defined(__unix__)
+			if(access(path.c_str(), R_OK) != 0)
+			{
+				return false;
+			}
+		#endif
+
+		fstream file(path.c_str(), ios::in);
+		if(file.fail()) return false;
+
+		string line;
+		string keyName;   // Section that subsequent name=value lines belong to
+
+		while(getline(file, line))
+		{
+			if(line.length() && line[line.length() - 1] == '\r')
+			{
+				line = line.substr(0, line.length() - 1);   // Strip the CR of a CRLF line ending
+			}
+
+			if(line.length())   // Tested after stripping, so a bare "\r" line is treated as blank instead of aborting the parse
+			{
+				if(!isprint(static_cast<unsigned char>(line[0])))   // Cast: isprint() is undefined for negative char values
+				{
+				//	printf("Failing on char %d\n", line[0]);
+					file.close();
+					return false;
+				}
+
+				string::size_type pLeft = line.find_first_of(";#[=");   // First special character decides the line type
+
+				if(pLeft != string::npos)
+				{
+					switch(line[pLeft])
+					{
+					case '[':
+						{
+							string::size_type pRight = line.find_last_of("]");
+
+							if(pRight != string::npos && pRight > pLeft)
+							{
+								keyName = line.substr(pLeft + 1, pRight - pLeft - 1);
+								addKeyName(keyName);
+							}
+						}
+						break;
+					case '=':
+						{
+							string valueName = line.substr(0, pLeft);
+							string value = line.substr(pLeft + 1);
+							addValue(keyName, valueName, value);
+						}
+						break;
+					case ';':
+					case '#':
+						// Ignore comments
+						break;
+					}
+				}
+			}
+		}
+
+		file.close();
+
+		if(names.size())
+		{
+			return true;
+		}
+
+		return false;
+	}
+
+	void Configurator::writeFile(std::string title)   // Rewrites the file at 'path' from the in-memory sections
+	{
+		#if defined(__unix__)
+			if(access(path.c_str(), W_OK) != 0)   // NOTE(review): this also fails for a file that does not exist yet, so no new file is created here - confirm intent
+			{
+				return;
+			}
+		#endif
+
+		fstream file(path.c_str(), ios::out);
+		if(file.fail()) return;
+
+		file << "; " << title << endl << endl;   // Title is written as a comment line
+
+		for(unsigned int keyID = 0; keyID < sections.size(); keyID++)
+		{
+			file << "[" << names[keyID] << "]" << endl;   // names[] and sections[] share indices
+
+			for(unsigned int valueID = 0; valueID < sections[keyID].names.size(); valueID++)
+			{
+				file << sections[keyID].names[valueID] << "=" << sections[keyID].values[valueID] << endl;
+			}
+
+			file << endl;
+		}
+
+		file.close();
+	}
+
+	int Configurator::findKey(string keyName) const   // Index of the first section called keyName, or -1
+	{
+		for(unsigned int i = 0; i < names.size(); ++i)
+		{
+			if(keyName == names[i])
+			{
+				return i;
+			}
+		}
+
+		return -1;   // No section with that name
+	}
+
+	int Configurator::findValue(unsigned int keyID, string valueName) const   // Index of valueName inside section keyID, or -1
+	{
+		if(keyID >= sections.size())   // Also true whenever there are no sections at all
+		{
+			return -1;
+		}
+		const Section &section = sections[keyID];
+		for(unsigned int i = 0; i < section.names.size(); ++i)
+		{
+			if(valueName == section.names[i])
+			{
+				return i;
+			}
+		}
+
+		return -1;   // Section exists but holds no such entry
+	}
+
+	unsigned int Configurator::addKeyName(string keyName)   // Appends a section (no duplicate check) and returns its index
+	{
+		names.push_back(keyName);
+		sections.push_back(Section());   // Keeps sections[] the same length as names[]
+		return (unsigned int)names.size() - 1;
+	}
+
+	void Configurator::addValue(string const keyName, string const valueName, string const value)   // Inserts or overwrites one entry, creating the section if needed
+	{
+		int section = findKey(keyName);
+
+		if(section < 0)   // findKey() reports a miss as -1
+		{
+			section = addKeyName(keyName);
+		}
+
+		const int slot = findValue(section, valueName);
+
+		if(slot >= 0)   // Existing entry: overwrite in place
+		{
+			sections[section].values[slot] = value;
+		}
+		else   // New entry: names and values grow together
+		{
+			sections[section].names.push_back(valueName);
+			sections[section].values.push_back(value);
+		}
+	}
+
+	string Configurator::getValue(string keyName, string valueName, string defaultValue) const   // Stored string, or defaultValue if the section or entry is missing
+	{
+		const int keyID = findKey(keyName);
+		const int valueID = (keyID == -1) ? -1 : findValue((unsigned int)keyID, valueName);   // Skip the entry lookup when the section is missing
+
+		if(valueID == -1) return defaultValue;
+
+		return sections[keyID].values[valueID];
+	}
+
+	int Configurator::getInteger(string keyName, string valueName, int defaultValue) const   // Stored value parsed with atoi(), or defaultValue if missing
+	{
+		char svalue[256];
+
+		sprintf(svalue, "%d", defaultValue);   // The default is passed to getValue() as text
+
+		return atoi(getValue(keyName, valueName, svalue).c_str());   // A stored value that is not a number therefore yields 0, not defaultValue
+	}
+
+	bool Configurator::getBoolean(string keyName, string valueName, bool defaultValue) const   // True when the value read by getInteger() is non-zero
+	{
+		return getInteger(keyName, valueName, (int)defaultValue) != 0;
+	}
+
+	double Configurator::getFloat(string keyName, string valueName, double defaultValue) const   // Stored value parsed with atof(), or defaultValue if missing
+	{
+		int keyID = findKey(keyName);
+		int valueID = (keyID == -1) ? -1 : findValue((unsigned int)keyID, valueName);
+
+		// Return the default directly: printing it with "%f" into a fixed buffer could overflow for large magnitudes and truncated it to six decimals.
+		return (valueID == -1) ? defaultValue : atof(sections[keyID].values[valueID].c_str());
+	}
+
+	unsigned int Configurator::getFormatted(string keyName, string valueName, char *format,   // Scans the stored string with 'format'
+											void *v1, void *v2, void *v3, void *v4,
+											void *v5, void *v6, void *v7, void *v8,
+											void *v9, void *v10, void *v11, void *v12,
+											void *v13, void *v14, void *v15, void *v16)
+	{
+		string value = getValue(keyName, valueName);
+
+		if(!value.length()) return 0;   // Missing or empty entry: nothing converted
+
+		int nVals = sscanf(value.c_str(), format,
+						   v1, v2, v3, v4, v5, v6, v7, v8,
+						   v9, v10, v11, v12, v13, v14, v15, v16);
+
+		return (nVals > 0) ? (unsigned int)nVals : 0;   // sscanf() returns EOF (negative) on input failure; report that as 0 conversions
+	}
+}
diff --git a/src/System/Configurator.hpp b/src/System/Configurator.hpp
new file mode 100644
index 0000000..6fd930c
--- /dev/null
+++ b/src/System/Configurator.hpp
@@ -0,0 +1,66 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Configurator_hpp
+#define sw_Configurator_hpp
+
+#include <string>
+#include <vector>
+
+#include <stdlib.h>
+
+namespace sw
+{
+	class Configurator   // In-memory view of an INI-style file: named sections holding name=value entries
+	{
+	public:
+		Configurator(std::string iniPath = "");   // Reads the file at iniPath immediately
+
+		~Configurator();
+
+		std::string getValue(std::string sectionName, std::string valueName, std::string defaultValue = "") const;   // The getters return the default when the entry is missing
+		int getInteger(std::string sectionName, std::string valueName, int defaultValue = 0) const;
+		bool getBoolean(std::string sectionName, std::string valueName, bool defaultValue = false) const;
+		double getFloat(std::string sectionName, std::string valueName, double defaultValue = 0.0) const;
+		unsigned int getFormatted(std::string sectionName, std::string valueName, char *format,   // sscanf-style scan into up to 16 caller-provided targets
+		                          void *v1 = 0, void *v2 = 0, void *v3 = 0, void *v4 = 0,
+		                          void *v5 = 0, void *v6 = 0, void *v7 = 0, void *v8 = 0,
+		                          void *v9 = 0, void *v10 = 0, void *v11 = 0, void *v12 = 0,
+		                          void *v13 = 0, void *v14 = 0, void *v15 = 0, void *v16 = 0);
+
+		void addValue(std::string sectionName, std::string valueName, std::string value);   // Inserts or overwrites; creates the section if needed
+
+		void writeFile(std::string title = "Configuration File");   // Writes all sections back to 'path'
+
+	private:
+		bool readFile();
+
+		unsigned int addKeyName(std::string sectionName);
+		int findKey(std::string sectionName) const;   // -1 when not found
+		int findValue(unsigned int sectionID, std::string valueName) const;   // -1 when not found
+
+		std::string path;
+
+		struct Section
+		{
+			std::vector<std::string> names;    // Entry names
+			std::vector<std::string> values;   // Entry values, parallel to names
+		};
+
+		std::vector<Section> sections;    // Parallel to 'names' below
+		std::vector<std::string> names;   // Section names
+	};
+}
+
+#endif   // sw_Configurator_hpp
diff --git a/src/System/Debug.cpp b/src/System/Debug.cpp
new file mode 100644
index 0000000..acf469e
--- /dev/null
+++ b/src/System/Debug.cpp
@@ -0,0 +1,39 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Debug.hpp"
+
+#include <stdio.h>
+#include <stdarg.h>
+
+namespace sw
+{
+void trace(const char *format, ...)   // printf-style append to "debug.txt"; currently switched off
+{
+	if(false)   // Change to true to turn file logging on
+	{
+		FILE *file = fopen("debug.txt", "a");   // Opened and closed on every call
+
+		if(file)
+		{
+			va_list vararg;
+			va_start(vararg, format);
+			vfprintf(file, format, vararg);
+			va_end(vararg);
+
+			fclose(file);
+		}
+	}
+}
+}
diff --git a/src/System/Debug.hpp b/src/System/Debug.hpp
new file mode 100644
index 0000000..9758c3b
--- /dev/null
+++ b/src/System/Debug.hpp
@@ -0,0 +1,58 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Debug_hpp
+#define Debug_hpp
+
+#if defined(__ANDROID__) && !defined(ANDROID_HOST_BUILD)
+#include "DebugAndroid.hpp"
+#else
+
+#include <assert.h>
+#include <stdio.h>
+
+#undef min
+#undef max
+
+namespace sw
+{
+void trace(const char *format, ...);   // Defined in Debug.cpp
+inline void trace() {}   // Zero-argument overload, so sw::trace(__VA_ARGS__) compiles with an empty argument list
+}
+
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)   // Debug builds, or any build with DCHECK_ALWAYS_ON
+	#define TRACE(format, ...) sw::trace("[0x%0.8X]%s(" format ")\n", this, __FUNCTION__, ##__VA_ARGS__)   // NOTE(review): uses 'this', so member functions only; %X is given a pointer - confirm on 64-bit
+#else
+	#define TRACE(...) ((void)0)
+#endif
+
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)   // Debug builds, or any build with DCHECK_ALWAYS_ON
+	#define UNIMPLEMENTED(...) do { \
+		sw::trace("\t! Unimplemented: %s(%d): ", __FUNCTION__, __LINE__); \
+		sw::trace(__VA_ARGS__); /* optional caller message */ \
+		sw::trace("\n"); \
+		ASSERT(false); \
+	} while(0)
+#else
+	#define UNIMPLEMENTED(...) ((void)0)
+#endif
+
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
+	#define ASSERT(expression) {if(!(expression)) sw::trace("\t! Assert failed in %s(%d): " #expression "\n", __FUNCTION__, __LINE__); assert(expression);}
+#else
+	#define ASSERT assert
+#endif
+
+#endif   // !__ANDROID__
+#endif   // Debug_hpp
diff --git a/src/System/DebugAndroid.cpp b/src/System/DebugAndroid.cpp
new file mode 100644
index 0000000..c511fc3
--- /dev/null
+++ b/src/System/DebugAndroid.cpp
@@ -0,0 +1,53 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "DebugAndroid.hpp"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <cutils/properties.h>
+
+void AndroidEnterDebugger()
+{
+	ALOGE("%s", __FUNCTION__);   // Log the entry; the function name is passed as data, never as the format string.
+#ifndef NDEBUG
+	// Debug builds only: when the "debug.db.uid" property holds a uid greater than our effective uid, wait here until 'waiting' is cleared externally (e.g. from an attached debugger).
+	char value[PROPERTY_VALUE_MAX];
+	property_get("debug.db.uid", value, "-1");
+	int debug_uid = atoi(value);
+	if((debug_uid >= 0) && (geteuid() < static_cast<uid_t>(debug_uid)))
+	{
+		ALOGE("Waiting for debugger: gdbserver :${PORT} --attach %u. Look for thread %u", getpid(), gettid());
+		volatile int waiting = 1;
+		while (waiting) {
+			sleep(1);
+		}
+	}
+	else
+	{
+		ALOGE("No debugger");
+	}
+#endif
+}
+
+void trace(const char *format, ...)   // printf-style message sent to the Android log at verbose priority; compiled out when NDEBUG is defined.
+{
+#ifndef NDEBUG
+	va_list args;
+	va_start(args, format);
+	android_vprintLog(ANDROID_LOG_VERBOSE, NULL, LOG_TAG, format, args);
+	va_end(args);
+#endif
+}
diff --git a/src/System/DebugAndroid.hpp b/src/System/DebugAndroid.hpp
new file mode 100644
index 0000000..eced194
--- /dev/null
+++ b/src/System/DebugAndroid.hpp
@@ -0,0 +1,99 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef DebugAndroid_hpp
+#define DebugAndroid_hpp
+
+#if ANDROID_PLATFORM_SDK_VERSION < 27
+#include <cutils/log.h>
+#elif ANDROID_PLATFORM_SDK_VERSION >= 27
+#include <log/log.h>
+#else
+#error "ANDROID_PLATFORM_SDK_VERSION is not defined"
+#endif
+
+#include <cassert>
+
+// On Android Virtual Devices we heavily depend on logging, even in
+// production builds. We do this because AVDs are components of larger
+// systems, and may be configured in ways that are difficult to
+// reproduce locally. For example, some systems run tests against
+// third-party code that we cannot access.  Aborting (cf. assert) on
+// unimplemented functionality creates two problems. First, it produces
+// a service failure where none is needed. Second, it puts the
+// customer on the critical path for notifying us of a problem.
+// The alternative, skipping unimplemented functionality silently, is
+// arguably worse: neither the service provider nor the customer will
+// learn that unimplemented functionality may have compromised the test
+// results.
+// Logging invocations of unimplemented functionality is useful to both
+// service provider and the customer. The service provider can learn
+// that the functionality is needed. The customer learns that the test
+// results may be compromised.
+
+/**
+ * Logs the call. In debug builds, if the "debug.db.uid" property holds a uid greater than this
+ * process's effective uid, waits in a sleep loop for a debugger to attach. Otherwise returns.
+ */
+void AndroidEnterDebugger();
+
+#define ASSERT(E) do { /* On failure: log the expression and location, then call AndroidEnterDebugger(). No abort is issued here. */ \
+		if (!(E)) { \
+			ALOGE("badness: assertion_failed %s in %s at %s:%d", #E,	\
+				  __FUNCTION__, __FILE__, __LINE__);					\
+			AndroidEnterDebugger();										\
+		}																\
+	} while(0)
+
+#undef assert   // Replace the standard assert() so it goes through the logging ASSERT above.
+#define assert(E) ASSERT(E)
+
+#define ERR(format, ...)	/* Log a printf-style error with its call site, then call AndroidEnterDebugger(). */	\
+	do {																\
+		ALOGE("badness: err %s %s:%d (" format ")", __FUNCTION__, __FILE__, \
+			  __LINE__, ##__VA_ARGS__);									\
+		AndroidEnterDebugger();											\
+	} while(0)
+
+#define FIXME(format, ...)	/* Same shape as ERR, but tagged "fixme" in the log line. */	\
+	do {																\
+		ALOGE("badness: fixme %s %s:%d (" format ")", __FUNCTION__, __FILE__, \
+			  __LINE__, ##__VA_ARGS__);									\
+		AndroidEnterDebugger();											\
+	} while(0)
+
+// Logs only the call site; any arguments given to the macro are currently ignored. TODO: Handle __VA_ARGS__ (can be empty)
+#define UNIMPLEMENTED(...) do {						\
+		ALOGE("badness: unimplemented: %s %s:%d",	\
+			  __FUNCTION__, __FILE__, __LINE__);	\
+		AndroidEnterDebugger();						\
+	} while(0)
+
+#define UNREACHABLE(value) do { /* Logs the expression text and its value; 'value' is formatted with %d, so it is expected to be int-compatible. */ \
+		ALOGE("badness: unreachable case reached: %s %s:%d. %s: %d", \
+			  __FUNCTION__, __FILE__, __LINE__, #value, value);			\
+		AndroidEnterDebugger();                                         \
+	} while(0)
+
+#ifndef NDEBUG   // Verbose call-site logging exists only in debug builds.
+	#define TRACE(format, ...)								   \
+		ALOGV("%s %s:%d (" format ")", __FUNCTION__, __FILE__, \
+			  __LINE__, ##__VA_ARGS__)
+#else
+	#define TRACE(...) ((void)0)   // No-op; the arguments are not evaluated.
+#endif
+
+void trace(const char *format, ...);   // printf-style logging function, declared at global scope in this header.
+
+#endif   // DebugAndroid_hpp
diff --git a/src/System/GrallocAndroid.cpp b/src/System/GrallocAndroid.cpp
new file mode 100644
index 0000000..c877e9933
--- /dev/null
+++ b/src/System/GrallocAndroid.cpp
@@ -0,0 +1,106 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "GrallocAndroid.hpp"
+#include "Debug.hpp"
+
+#ifdef HAVE_GRALLOC1
+#include <sync/sync.h>
+#endif
+
+GrallocModule *GrallocModule::getInstance()
+{
+	static GrallocModule instance;   // Single shared instance, constructed on the first call.
+	return &instance;
+}
+
+GrallocModule::GrallocModule() : m_major_version(0xff), m_module(nullptr)   // 0xff matches no case in lock()/unlock(), so they return -1 until a module has been loaded.
+{
+	const hw_module_t *module = nullptr;
+	if(hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &module) != 0 || !module) { TRACE("failed to load the gralloc module"); return; }   // Never dereference a module that was not found.
+
+	m_major_version = (module->module_api_version >> 8) & 0xff;   // The major version is the second-lowest byte of module_api_version.
+	switch(m_major_version)
+	{
+	case 0:
+		m_module = reinterpret_cast<const gralloc_module_t*>(module);
+		break;
+	case 1:   // Without HAVE_GRALLOC1 this label is empty and falls through to 'default'.
+#ifdef HAVE_GRALLOC1
+		if(gralloc1_open(module, &m_gralloc1_device) != 0 || !m_gralloc1_device) { m_major_version = 0xff; break; }   // Device failed to open: behave as if no module is available.
+		m_gralloc1_lock = (GRALLOC1_PFN_LOCK) m_gralloc1_device->getFunction(m_gralloc1_device, GRALLOC1_FUNCTION_LOCK);
+		m_gralloc1_unlock = (GRALLOC1_PFN_UNLOCK)m_gralloc1_device->getFunction(m_gralloc1_device, GRALLOC1_FUNCTION_UNLOCK);
+		break;
+#endif
+	default:
+		TRACE("unknown gralloc major version (%d)", m_major_version);
+		break;
+	}
+}
+
+int GrallocModule::lock(buffer_handle_t handle, int usage, int left, int top, int width, int height, void **vaddr)   // Forwards to the lock entry point for the gralloc version chosen by the constructor; returns its result, or -1 if none applies.
+{
+	switch(m_major_version)
+	{
+	case 0:
+		{
+			return m_module->lock(m_module, handle, usage, left, top, width, height, vaddr);
+		}
+	case 1:   // Without HAVE_GRALLOC1 this label is empty and falls through to 'default'.
+#ifdef HAVE_GRALLOC1
+		{
+			gralloc1_rect_t outRect{};
+			outRect.left = left;
+			outRect.top = top;
+			outRect.width = width;
+			outRect.height = height;
+			return m_gralloc1_lock(m_gralloc1_device, handle, usage, usage, &outRect, vaddr, -1);   // 'usage' is passed for both usage arguments; the trailing -1 is presumably "no acquire fence" (confirm against gralloc1.h).
+		}
+#endif
+	default:
+		{
+			TRACE("no gralloc module to lock");
+			return -1;
+		}
+	}
+}
+
+int GrallocModule::unlock(buffer_handle_t handle)   // Forwards to the unlock entry point for the active gralloc version; returns its result, or -1 if none applies.
+{
+	switch(m_major_version)
+	{
+	case 0:
+		{
+			return m_module->unlock(m_module, handle);
+		}
+	case 1:   // Without HAVE_GRALLOC1 this label is empty and falls through to 'default'.
+#ifdef HAVE_GRALLOC1
+		{
+			int32_t fenceFd = -1;
+			int error = m_gralloc1_unlock(m_gralloc1_device, handle, &fenceFd);
+			if (!error && fenceFd >= 0)   // Only wait on and close a fence that was actually returned; -1 is not a valid descriptor.
+			{
+				sync_wait(fenceFd, -1);
+				close(fenceFd);
+			}
+			return error;
+		}
+#endif
+	default:
+		{
+			TRACE("no gralloc module to unlock");
+			return -1;
+		}
+	}
+}
diff --git a/src/System/GrallocAndroid.hpp b/src/System/GrallocAndroid.hpp
new file mode 100644
index 0000000..fe0b15a
--- /dev/null
+++ b/src/System/GrallocAndroid.hpp
@@ -0,0 +1,44 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GRALLOC_ANDROID
+#define GRALLOC_ANDROID
+
+#include <hardware/gralloc.h>
+
+#ifdef HAVE_GRALLOC1
+#include <hardware/gralloc1.h>
+#endif
+
+#include <unistd.h> // for close()
+
+class GrallocModule   // Process-wide wrapper around the Android gralloc HAL; obtained through getInstance().
+{
+public:
+	static GrallocModule *getInstance();
+	int lock(buffer_handle_t handle, int usage, int left, int top, int width, int height, void **vaddr);
+	int unlock(buffer_handle_t handle);
+
+private:
+	GrallocModule();   // Private: instances are only created by getInstance().
+	uint8_t m_major_version = 0xff;   // Major gralloc API version; 0xff matches no case handled by lock()/unlock().
+	const gralloc_module_t *m_module = nullptr;   // Used for version 0 only; initialized like the gralloc1 members below.
+#ifdef HAVE_GRALLOC1
+	gralloc1_device_t *m_gralloc1_device = nullptr;
+	GRALLOC1_PFN_LOCK m_gralloc1_lock = nullptr;
+	GRALLOC1_PFN_UNLOCK m_gralloc1_unlock = nullptr;
+#endif
+};
+
+#endif  // GRALLOC_ANDROID
diff --git a/src/System/Half.cpp b/src/System/Half.cpp
new file mode 100644
index 0000000..cde8190
--- /dev/null
+++ b/src/System/Half.cpp
@@ -0,0 +1,102 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Half.hpp"
+
+namespace sw
+{
+	half::half(float fp32)   // Converts a single-precision value to the 16-bit encoding, rounding to nearest even.
+	{
+		unsigned int fp32i = *(unsigned int*)&fp32;
+		unsigned int sign = (fp32i & 0x80000000) >> 16;
+		unsigned int abs = fp32i & 0x7FFFFFFF;
+
+		if(abs > 0x47FFEFFF)   // Rounds above the largest finite half (65504), or is infinity or NaN
+		{
+			fp16i = sign | ((abs > 0x7F800000) ? 0x7FFF : 0x7C00);   // NaN stays NaN; everything else becomes infinity (0x7C00) instead of the NaN pattern 0x7FFF.
+		}
+		else if(abs < 0x38800000)   // Denormal
+		{
+			unsigned int mantissa = (abs & 0x007FFFFF) | 0x00800000;   // Restore the implicit leading one.
+			int e = 113 - (abs >> 23);   // Right shift needed to express the value with the smallest half exponent.
+
+			if(e < 24)
+			{
+				abs = mantissa >> e;
+			}
+			else
+			{
+				abs = 0;   // Too small even for a half denormal.
+			}
+
+			fp16i = sign | (abs + 0x00000FFF + ((abs >> 13) & 1)) >> 13;
+		}
+		else
+		{
+			fp16i = sign | (abs + 0xC8000000 + 0x00000FFF + ((abs >> 13) & 1)) >> 13;   // Adding 0xC8000000 wraps around to subtract the exponent bias difference (112 << 23).
+		}
+	}
+
+	half::operator float() const   // Expands the stored 16-bit encoding to a single-precision value.
+	{
+		unsigned int fp32i;
+
+		int s = (fp16i >> 15) & 0x00000001;   // Sign bit
+		int e = (fp16i >> 10) & 0x0000001F;   // 5-bit biased exponent
+		int m =  fp16i        & 0x000003FF;   // 10-bit mantissa
+
+		if(e == 0)
+		{
+			if(m == 0)   // Signed zero
+			{
+				fp32i = s << 31;
+
+				return (float&)fp32i;
+			}
+			else   // Denormal: shift the mantissa up until the implicit leading bit (0x400) appears, lowering the exponent to match
+			{
+				while(!(m & 0x00000400))
+				{
+					m <<= 1;
+					e -=  1;
+				}
+
+				e += 1;
+				m &= ~0x00000400;   // Drop the now-implicit leading bit.
+			}
+		}
+
+		e = e + (127 - 15);   // Rebias from the half bias (15) to the single-precision bias (127).
+		m = m << 13;   // Widen the mantissa from 10 to 23 bits.
+
+		fp32i = (s << 31) | (e << 23) | m;   // NOTE(review): e == 31 (half infinity/NaN) is not special-cased and is converted like any other exponent.
+
+		return (float&)fp32i;
+	}
+
+	half &half::operator=(half h)   // Copies the raw 16-bit encoding.
+	{
+		fp16i = h.fp16i;
+
+		return *this;
+	}
+
+
+	half &half::operator=(float f)   // Converts f with the half(float) constructor, then stores it.
+	{
+		*this = half(f);
+
+		return *this;
+	}
+}
diff --git a/src/System/Half.hpp b/src/System/Half.hpp
new file mode 100644
index 0000000..f2d378e
--- /dev/null
+++ b/src/System/Half.hpp
@@ -0,0 +1,93 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Half_hpp
+#define sw_Half_hpp
+
+namespace sw
+{
+	class half   // 16-bit floating-point value stored as its raw bit pattern.
+	{
+	public:
+		half() = default;   // Performs no initialization of its own.
+		explicit half(float f);   // Conversion from single precision.
+
+		operator float() const;   // Implicit conversion back to single precision.
+
+		half &operator=(half h);
+		half &operator=(float f);
+
+	private:
+		unsigned short fp16i;   // Raw 16-bit encoding.
+	};
+
+	inline half shortAsHalf(short s)   // Reinterprets the 16 bits of s as a half, without any numeric conversion.
+	{
+		union   // The bits are written through one member and read back through the other.
+		{
+			half h;
+			short s;
+		} hs;
+
+		hs.s = s;
+
+		return hs.h;
+	}
+
+	class RGB9E5   // Packed colour: three 9-bit mantissas sharing a single 5-bit exponent.
+	{
+		unsigned int R : 9;
+		unsigned int G : 9;
+		unsigned int B : 9;
+		unsigned int E : 5;
+
+	public:
+		void toRGB16F(half rgb[3]) const
+		{
+			// Each component decodes to mantissa * 2^(E - 24), where 24 = exponent bias (15) + mantissa bits per component (9).
+			constexpr int shift = 24;
+			const float scale = (1u << E) * (1.0f / (1 << shift));
+
+			const unsigned int mantissa[3] = { R, G, B };
+			for(int i = 0; i < 3; i++) rgb[i] = half(mantissa[i] * scale);
+		}
+	};
+
+	class R11G11B10F   // Packed colour: two 11-bit and one 10-bit unsigned float components.
+	{
+		unsigned int R : 11;
+		unsigned int G : 11;
+		unsigned int B : 10;
+
+		static inline half float11ToFloat16(unsigned short fp11)   // Shifting left by 4 moves the 11-bit value into bits 4..14 of the half, below the sign bit.
+		{
+			return shortAsHalf(fp11 << 4);   // Sign bit 0
+		}
+
+		static inline half float10ToFloat16(unsigned short fp10)   // Shifting left by 5 moves the 10-bit value into bits 5..14 of the half, below the sign bit.
+		{
+			return shortAsHalf(fp10 << 5);   // Sign bit 0
+		}
+
+	public:
+		void toRGB16F(half rgb[3]) const   // Writes the three components to rgb[0..2] as halves.
+		{
+			rgb[0] = float11ToFloat16(R);
+			rgb[1] = float11ToFloat16(G);
+			rgb[2] = float10ToFloat16(B);
+		}
+	};
+}
+
+#endif   // sw_Half_hpp
diff --git a/src/System/Math.cpp b/src/System/Math.cpp
new file mode 100644
index 0000000..290d4ab
--- /dev/null
+++ b/src/System/Math.cpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Math.hpp"
+
+namespace sw
+{
+	inline uint64_t FNV_1a(uint64_t hash, unsigned char data)   // One FNV-1a step: XOR the byte in, then multiply.
+	{
+		return (hash ^ data) * 1099511628211;   // 64-bit FNV prime (0x100000001B3)
+	}
+
+	uint64_t FNV_1a(const unsigned char *data, int size)   // Hashes 'size' bytes starting at 'data'; returns the offset basis when size <= 0.
+	{
+		uint64_t hash = 0xCBF29CE484222325;   // 64-bit FNV offset basis. Kept unsigned: the value does not fit in a signed 64-bit integer.
+
+		for(int i = 0; i < size; i++)
+		{
+			hash = FNV_1a(hash, data[i]);
+		}
+
+		return hash;
+	}
+
+	unsigned char sRGB8toLinear8(unsigned char value)   // Converts an 8-bit sRGB-encoded value to 8-bit linear using a 256-entry table.
+	{
+		struct Table   // Filled completely by its constructor, so no caller can observe a half-built table.
+		{
+			unsigned char linear[256];
+			Table()
+			{
+				for(int i = 0; i < 256; i++) linear[i] = static_cast<unsigned char>(sw::sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
+			}
+		};
+		static const Table table;   // Built on the first call; replaces the old "entry 0 == 255 means unbuilt" sentinel check.
+		return table.linear[value];
+	}
+}
diff --git a/src/System/Math.hpp b/src/System/Math.hpp
new file mode 100644
index 0000000..a35d2e0
--- /dev/null
+++ b/src/System/Math.hpp
@@ -0,0 +1,385 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Math_hpp
+#define sw_Math_hpp
+
+#include "Types.hpp"
+#include "Half.hpp"
+
+#include <cmath>
+#if defined(_MSC_VER)
+	#include <intrin.h>
+#endif
+
+namespace sw
+{
+	using std::abs;
+
+	#undef min
+	#undef max
+
+	template<class T>
+	inline T max(T a, T b)   // Returns a only when a > b; otherwise b (including when they compare equal).
+	{
+		return a > b ? a : b;
+	}
+
+	template<class T>
+	inline T min(T a, T b)   // Returns a only when a < b; otherwise b (including when they compare equal).
+	{
+		return a < b ? a : b;
+	}
+
+	template<class T>
+	inline T max(T a, T b, T c)   // Three-argument form, built from the two-argument max.
+	{
+		return max(max(a, b), c);
+	}
+
+	template<class T>
+	inline T min(T a, T b, T c)   // Three-argument form, built from the two-argument min.
+	{
+		return min(min(a, b), c);
+	}
+
+	template<class T>
+	inline T max(T a, T b, T c, T d)   // Four-argument form: the larger of the two pairwise maxima.
+	{
+		return max(max(a, b), max(c, d));
+	}
+
+	template<class T>
+	inline T min(T a, T b, T c, T d)   // Four-argument form: the smaller of the two pairwise minima.
+	{
+		return min(min(a, b), min(c, d));
+	}
+
+	template<class T>
+	inline void swap(T &a, T &b)   // Exchanges a and b through a temporary copy.
+	{
+		T t = a;
+		a = b;
+		b = t;
+	}
+
+	template <typename destType, typename sourceType>
+	destType bitCast(const sourceType &source)   // Reinterprets the bytes of 'source' as destType by writing one union member and reading the other.
+	{
+		union   // NOTE(review): the two types are presumably expected to have the same size; nothing here checks it.
+		{
+			sourceType s;
+			destType d;
+		} sd;
+		sd.s = source;
+		return sd.d;
+	}
+
+	inline int iround(float x)   // Rounds to the nearest integer by flooring x + 0.5, so exact halves round toward +infinity.
+	{
+		return (int)floor(x + 0.5f);
+	//	return _mm_cvtss_si32(_mm_load_ss(&x));   // FIXME: Demands SSE support
+	}
+
+	inline int ifloor(float x)   // Largest integer not greater than x, as an int.
+	{
+		return (int)floor(x);
+	}
+
+	inline int ceilFix4(int x)   // Rounds x up to the next multiple of 16 (adds 15, then clears the low four bits).
+	{
+		return (x + 0xF) & 0xFFFFFFF0;
+	}
+
+	inline int ceilInt4(int x)   // Adds 15 and shifts right by four bits: x / 16 rounded up for non-negative x.
+	{
+		return (x + 0xF) >> 4;
+	}
+
+	#define BITS(x)    ( /* Counts the bits needed to hold the 32-bit value x: index of its highest set bit plus one, or 0 when x is 0. x is expanded 32 times. */ \
+	!!((x) & 0x80000000) + \
+	!!((x) & 0xC0000000) + \
+	!!((x) & 0xE0000000) + \
+	!!((x) & 0xF0000000) + \
+	!!((x) & 0xF8000000) + \
+	!!((x) & 0xFC000000) + \
+	!!((x) & 0xFE000000) + \
+	!!((x) & 0xFF000000) + \
+	!!((x) & 0xFF800000) + \
+	!!((x) & 0xFFC00000) + \
+	!!((x) & 0xFFE00000) + \
+	!!((x) & 0xFFF00000) + \
+	!!((x) & 0xFFF80000) + \
+	!!((x) & 0xFFFC0000) + \
+	!!((x) & 0xFFFE0000) + \
+	!!((x) & 0xFFFF0000) + \
+	!!((x) & 0xFFFF8000) + \
+	!!((x) & 0xFFFFC000) + \
+	!!((x) & 0xFFFFE000) + \
+	!!((x) & 0xFFFFF000) + \
+	!!((x) & 0xFFFFF800) + \
+	!!((x) & 0xFFFFFC00) + \
+	!!((x) & 0xFFFFFE00) + \
+	!!((x) & 0xFFFFFF00) + \
+	!!((x) & 0xFFFFFF80) + \
+	!!((x) & 0xFFFFFFC0) + \
+	!!((x) & 0xFFFFFFE0) + \
+	!!((x) & 0xFFFFFFF0) + \
+	!!((x) & 0xFFFFFFF8) + \
+	!!((x) & 0xFFFFFFFC) + \
+	!!((x) & 0xFFFFFFFE) + \
+	!!((x) & 0xFFFFFFFF))
+
+	#define MAX(x, y) ((x) > (y) ? (x) : (y))   // Macro form: the selected argument is evaluated twice.
+	#define MIN(x, y) ((x) < (y) ? (x) : (y))   // Macro form: the selected argument is evaluated twice.
+
+	inline float exp2(float x)   // 2 raised to x, via exp2f.
+	{
+		return exp2f(x);
+	}
+
+	inline int exp2(int x)   // 2 raised to x as an integer shift; x must be a valid shift count for int.
+	{
+		return 1 << x;
+	}
+
+	inline unsigned long log2(int x)   // Index of the highest set bit of x, i.e. floor(log2(x)) for positive x.
+	{
+		#if defined(_MSC_VER)
+			unsigned long y;
+			_BitScanReverse(&y, x);   // NOTE(review): the return value is not checked, so y is not meaningful when x == 0.
+			return y;
+		#else
+			return 31 - __builtin_clz(x);   // NOTE(review): __builtin_clz is undefined for 0; callers presumably guarantee x != 0.
+		#endif
+	}
+
+	inline int ilog2(float x)   // Unbiased binary exponent taken from the bits of x: floor(log2(|x|)) for normalized values.
+	{
+		unsigned int y = bitCast<unsigned int>(x);   // Uses this header's bitCast helper instead of dereferencing a type-punned pointer.
+
+		return ((y & 0x7F800000) >> 23) - 127;   // Extract the 8 exponent bits and remove the bias.
+	}
+
+	inline float log2(float x)   // Base-2 logarithm, computed from the natural logarithm.
+	{
+		return logf(x) * 1.44269504f;   // 1.0 / log[e](2)
+	}
+
+	inline bool isPow2(int x)   // True when x equals its own lowest set bit, i.e. at most one bit is set. Note that 0 also yields true.
+	{
+		return (x & -x) == x;
+	}
+
+	template<class T>
+	inline T clamp(T x, T a, T b)   // Limits x to [a, b]: the lower bound is applied first, then the upper bound.
+	{
+		if(x < a) x = a;
+		if(x > b) x = b;
+
+		return x;
+	}
+
+	inline float clamp01(float x)   // Limits x to the range [0, 1].
+	{
+		return clamp(x, 0.0f, 1.0f);
+	}
+
+	inline int ceilPow2(int x)   // Smallest power of two that is >= x; returns 1 for x <= 1.
+	{
+		int i = 1;
+
+		while(i < x)   // NOTE(review): for x above 2^30 the shift below overflows int before the loop can end.
+		{
+			i <<= 1;
+		}
+
+		return i;
+	}
+
+	inline int floorDiv(int a, int b)   // a / b rounded toward -infinity. NOTE(review): the sign-bit correction appears to assume b > 0 and a 32-bit int.
+	{
+		return a / b + ((a % b) >> 31);   // Subtracts one when the remainder is negative.
+	}
+
+	inline int floorMod(int a, int b)   // Remainder matching floorDiv: non-negative for b > 0.
+	{
+		int r = a % b;
+		return r + ((r >> 31) & b);   // Adds b when the remainder is negative.
+	}
+
+	inline int ceilDiv(int a, int b)   // a / b rounded toward +infinity. NOTE(review): same b > 0 assumption as floorDiv.
+	{
+		return a / b - (-(a % b) >> 31);   // Adds one when the remainder is positive.
+	}
+
+	inline int ceilMod(int a, int b)   // Remainder matching ceilDiv: non-positive for b > 0.
+	{
+		int r = a % b;
+		return r - ((-r >> 31) & b);   // Subtracts b when the remainder is positive.
+	}
+
+	template<const int n>
+	inline unsigned int unorm(float x)   // Maps x in [0, 1] to an n-bit unsigned integer, clamping out-of-range input and rounding to nearest.
+	{
+		static const unsigned int max = 0xFFFFFFFF >> (32 - n);   // 2^n - 1
+		static const float maxf = static_cast<float>(max);
+
+		if(x >= 1.0f)
+		{
+			return max;
+		}
+		else if(x <= 0.0f)
+		{
+			return 0;
+		}
+		else
+		{
+			return static_cast<unsigned int>(maxf * x + 0.5f);
+		}
+	}
+
+	template<const int n>
+	inline int snorm(float x)   // Maps x in [-1, 1] to an n-bit two's-complement pattern held in the low n bits of the result.
+	{
+		static const unsigned int min = 0x80000000 >> (32 - n);   // n-bit pattern with only the sign bit set.
+		static const unsigned int max = 0xFFFFFFFF >> (32 - n + 1);   // Largest positive n-bit value, 2^(n-1) - 1.
+		static const float maxf = static_cast<float>(max);
+		static const unsigned int range = 0xFFFFFFFF >> (32 - n);   // Mask selecting the low n bits.
+
+		if(x >= 0.0f)
+		{
+			if(x >= 1.0f)
+			{
+				return max;
+			}
+			else
+			{
+				return static_cast<int>(maxf * x + 0.5f);
+			}
+		}
+		else
+		{
+			if(x <= -1.0f)
+			{
+				return min;
+			}
+			else
+			{
+				return static_cast<int>(maxf * x - 0.5f) & range;   // Round away from zero, then truncate the negative result to n bits.
+			}
+		}
+	}
+
+	template<const int n>
+	inline unsigned int ucast(float x)   // Rounds x to the nearest integer and clamps it to the n-bit unsigned range [0, 2^n - 1].
+	{
+		static const unsigned int max = 0xFFFFFFFF >> (32 - n);   // 2^n - 1
+		static const float maxf = static_cast<float>(max);
+
+		if(x >= maxf)
+		{
+			return max;
+		}
+		else if(x <= 0.0f)
+		{
+			return 0;
+		}
+		else
+		{
+			return static_cast<unsigned int>(x + 0.5f);
+		}
+	}
+
+	template<const int n>
+	inline int scast(float x)   // Rounds x to the nearest integer, clamps it to the n-bit signed range, and returns the n-bit two's-complement pattern.
+	{
+		static const unsigned int min = 0x80000000 >> (32 - n);   // n-bit pattern with only the sign bit set.
+		static const unsigned int max = 0xFFFFFFFF >> (32 - n + 1);   // Largest positive n-bit value, 2^(n-1) - 1.
+		static const float maxf = static_cast<float>(max);
+		static const float minf = static_cast<float>(min);
+		static const unsigned int range = 0xFFFFFFFF >> (32 - n);   // Mask selecting the low n bits.
+
+		if(x > 0.0f)
+		{
+			if(x >= maxf)
+			{
+				return max;
+			}
+			else
+			{
+				return static_cast<int>(x + 0.5f);
+			}
+		}
+		else
+		{
+			if(x <= -minf)
+			{
+				return min;
+			}
+			else
+			{
+				return static_cast<int>(x - 0.5f) & range;   // Round away from zero, then truncate the negative result to n bits.
+			}
+		}
+	}
+
+	inline float sRGBtoLinear(float c)   // Decodes an sRGB-encoded component: linear segment up to 0.04045, power curve above it.
+	{
+		if(c <= 0.04045f)
+		{
+			return c * 0.07739938f;   // 1.0f / 12.92f;
+		}
+		else
+		{
+			return powf((c + 0.055f) * 0.9478673f, 2.4f);   // 1.0f / 1.055f
+		}
+	}
+
+	inline float linearToSRGB(float c)   // Encodes a linear component as sRGB: linear segment up to 0.0031308, power curve above it.
+	{
+		if(c <= 0.0031308f)
+		{
+			return c * 12.92f;
+		}
+		else
+		{
+			return 1.055f * powf(c, 0.4166667f) - 0.055f;   // 1.0f / 2.4f
+		}
+	}
+
+	unsigned char sRGB8toLinear8(unsigned char value);   // 8-bit sRGB to 8-bit linear conversion; only declared here.
+
+	uint64_t FNV_1a(const unsigned char *data, int size);   // Fowler-Noll-Vo hash function
+
+	// Round up to the next multiple of alignment (values that are already multiples are returned unchanged)
+	template<typename T>
+	inline T align(T value, unsigned int alignment)
+	{
+		return ((value + alignment - 1) / alignment) * alignment;
+	}
+
+	template<unsigned int alignment, typename T>
+	inline T align(T value)   // Same computation with the alignment supplied as a template argument.
+	{
+		return ((value + alignment - 1) / alignment) * alignment;
+	}
+
+	inline int clampToSignedInt(unsigned int x)   // Converts to int, saturating at 0x7FFFFFFF instead of wrapping to a negative value.
+	{
+		return static_cast<int>(min(x, 0x7FFFFFFFu));
+	}
+}
+
+#endif   // sw_Math_hpp
diff --git a/src/System/Memory.cpp b/src/System/Memory.cpp
new file mode 100644
index 0000000..45fef40
--- /dev/null
+++ b/src/System/Memory.cpp
@@ -0,0 +1,262 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Memory.hpp"
+
+#include "Types.hpp"
+#include "Debug.hpp"
+
+#if defined(_WIN32)
+	#ifndef WIN32_LEAN_AND_MEAN
+		#define WIN32_LEAN_AND_MEAN
+	#endif
+	#include <windows.h>
+	#include <intrin.h>
+#else
+	#include <errno.h>
+	#include <sys/mman.h>
+	#include <stdlib.h>
+	#include <unistd.h>
+#endif
+
+#include <memory.h>
+
+#undef allocate
+#undef deallocate
+
+#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined (_M_X64)) && !defined(__x86__)
+#define __x86__
+#endif
+
+namespace sw
+{
+namespace
+{
+struct Allocation   // Bookkeeping header stored immediately before each aligned pointer handed out by allocateRaw().
+{
+//	size_t bytes;
+	unsigned char *block;   // Start of the underlying new[] block, needed to delete[] it later.
+};
+
+void *allocateRaw(size_t bytes, size_t alignment)   // Returns 'bytes' of uninitialized storage aligned to 'alignment', or nullptr when no aligned pointer could be produced.
+{
+	ASSERT((alignment & (alignment - 1)) == 0);   // Power of 2 alignment.
+
+	#if defined(LINUX_ENABLE_NAMED_MMAP)
+		void *allocation;
+		int result = posix_memalign(&allocation, alignment, bytes);
+		if(result != 0)
+		{
+			errno = result;   // posix_memalign reports failure through its return value, so copy it into errno.
+			allocation = nullptr;
+		}
+		return allocation;
+	#else
+		unsigned char *block = new unsigned char[bytes + sizeof(Allocation) + alignment];   // Over-allocate: room for the header plus alignment slack.
+		unsigned char *aligned = nullptr;
+
+		if(block)
+		{
+			aligned = (unsigned char*)((uintptr_t)(block + sizeof(Allocation) + alignment - 1) & -(intptr_t)alignment);   // First aligned address that leaves room for the header before it.
+			Allocation *allocation = (Allocation*)(aligned - sizeof(Allocation));
+
+		//	allocation->bytes = bytes;
+			allocation->block = block;   // Remember the real block start for deallocate().
+		}
+
+		return aligned;
+	#endif
+}
+
+#if defined(LINUX_ENABLE_NAMED_MMAP)
+// Create a file descriptor for anonymous memory with the given
+// name. Returns -1 on failure.
+// TODO: remove once libc wrapper exists.
+int memfd_create(const char* name, unsigned int flags)   // Invokes the memfd_create system call by number; returns the new descriptor or -1.
+{
+	#if __aarch64__
+	#define __NR_memfd_create 279
+	#elif __arm__
+	#define __NR_memfd_create 385   // 32-bit ARM uses its own syscall table; 279 is only the 64-bit ARM number.
+	#elif __powerpc64__
+	#define __NR_memfd_create 360
+	#elif __i386__
+	#define __NR_memfd_create 356
+	#elif __x86_64__
+	#define __NR_memfd_create 319
+	#endif /* __NR_memfd_create__ */
+	#ifdef __NR_memfd_create
+		// In the event of no system call this returns -1 with errno set
+		// as ENOSYS.
+		return syscall(__NR_memfd_create, name, flags);
+	#else
+		return -1;   // Architecture without a known syscall number.
+	#endif
+}
+
+// Returns a file descriptor for use with an anonymous mmap, if
+// memfd_create fails, -1 is returned. Note, the mappings should be
+// MAP_PRIVATE so that underlying pages aren't shared.
+int anonymousFd()
+{
+	static int fd = memfd_create("SwiftShader JIT", 0);   // Created once on the first call; every later call returns the same descriptor (or the same -1).
+	return fd;
+}
+
+// Ensure there is enough space in the "anonymous" fd for length.
+void ensureAnonFileSize(int anonFd, size_t length)
+{
+	static size_t fileSize = 0;   // Largest size successfully set so far; the file is only ever grown.
+	if(length > fileSize)
+	{
+		// Record the new size only if the resize worked, so a failed call is retried instead of being remembered as success.
+		if(ftruncate(anonFd, length) == 0) fileSize = length;
+	}
+}
+#endif  // defined(LINUX_ENABLE_NAMED_MMAP)
+
+}  // anonymous namespace
+
+size_t memoryPageSize()   // Returns the system page size, queried on first use and cached afterwards.
+{
+	static int pageSize = 0;   // 0 means "not queried yet".
+
+	if(pageSize == 0)
+	{
+		#if defined(_WIN32)
+			SYSTEM_INFO systemInfo;
+			GetSystemInfo(&systemInfo);
+			pageSize = systemInfo.dwPageSize;
+		#else
+			pageSize = sysconf(_SC_PAGESIZE);
+		#endif
+	}
+
+	return pageSize;
+}
+
+void *allocate(size_t bytes, size_t alignment)   // Aligned allocation whose contents are zero-filled; yields nullptr if the raw allocation fails.
+{
+	void *const block = allocateRaw(bytes, alignment);
+
+	if(block != nullptr)
+	{
+		memset(block, 0, bytes);   // Zero only the bytes requested by the caller.
+	}
+
+	return block;
+}
+
+void deallocate(void *memory)   // Releases memory obtained from allocate(); a null pointer is accepted.
+{
+	#if defined(LINUX_ENABLE_NAMED_MMAP)
+		free(memory);   // Matches the posix_memalign allocation path.
+	#else
+		if(memory)
+		{
+			unsigned char *aligned = (unsigned char*)memory;
+			Allocation *allocation = (Allocation*)(aligned - sizeof(Allocation));   // The header sits directly before the aligned pointer.
+
+			delete[] allocation->block;
+		}
+	#endif
+}
+
+void *allocateExecutable(size_t bytes)   // Returns page-granular read/write memory that can later be passed to markExecutable(), or nullptr on failure.
+{
+	size_t pageSize = memoryPageSize();
+	size_t length = (bytes + pageSize - 1) & ~(pageSize - 1);   // Round the request up to whole pages.
+	void *mapping;
+
+	#if defined(LINUX_ENABLE_NAMED_MMAP)
+		// Try to name the memory region for the executable code,
+		// to aid profilers.
+		int anonFd = anonymousFd();
+		if(anonFd == -1)   // No named descriptor available: fall back to a plain anonymous mapping.
+		{
+			mapping = mmap(nullptr, length, PROT_READ | PROT_WRITE,
+			               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		}
+		else
+		{
+			ensureAnonFileSize(anonFd, length);
+			mapping = mmap(nullptr, length, PROT_READ | PROT_WRITE,
+			               MAP_PRIVATE, anonFd, 0);
+		}
+
+		if(mapping == MAP_FAILED)
+		{
+			mapping = nullptr;   // Report failure as nullptr, like the non-mmap path.
+		}
+	#else
+		mapping = allocate(length, pageSize);   // Page-aligned, zero-filled heap allocation.
+	#endif
+
+	return mapping;
+}
+
+void markExecutable(void *memory, size_t bytes)   // Switches the range to read + execute (no write). The result of the protection call is not checked.
+{
+	#if defined(_WIN32)
+		unsigned long oldProtection;
+		VirtualProtect(memory, bytes, PAGE_EXECUTE_READ, &oldProtection);
+	#else
+		mprotect(memory, bytes, PROT_READ | PROT_EXEC);
+	#endif
+}
+
+void deallocateExecutable(void *memory, size_t bytes)   // Releases memory from allocateExecutable(); 'bytes' should be the size that was requested there.
+{
+	#if defined(_WIN32)
+		unsigned long oldProtection;
+		VirtualProtect(memory, bytes, PAGE_READWRITE, &oldProtection);   // Make the block writable again before returning it to the heap.
+		deallocate(memory);
+	#elif defined(LINUX_ENABLE_NAMED_MMAP)
+		size_t pageSize = memoryPageSize();
+		size_t length = (bytes + pageSize - 1) & ~(pageSize - 1);   // Same page rounding as allocateExecutable().
+		munmap(memory, length);
+	#else
+		mprotect(memory, bytes, PROT_READ | PROT_WRITE);   // Make the block writable again before returning it to the heap.
+		deallocate(memory);
+	#endif
+}
+
+void clear(uint16_t *memory, uint16_t element, size_t count)   // Fills 'count' 16-bit elements starting at 'memory' with 'element'.
+{
+	#if defined(_MSC_VER) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+		__stosw(memory, element, count);
+	#elif defined(__GNUC__) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+		__asm__ __volatile__("rep stosw" : "+D"(memory), "+c"(count) : "a"(element) : "memory");   // The instruction advances the destination and count registers and writes memory, so they are declared as read-write operands with a memory clobber.
+	#else
+		for(size_t i = 0; i < count; i++)
+		{
+			memory[i] = element;
+		}
+	#endif
+}
+
+void clear(uint32_t *memory, uint32_t element, size_t count)   // Fills 'count' 32-bit elements starting at 'memory' with 'element'.
+{
+	#if defined(_MSC_VER) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+		__stosd((unsigned long*)memory, element, count);
+	#elif defined(__GNUC__) && defined(__x86__) && !defined(MEMORY_SANITIZER)
+		__asm__ __volatile__("rep stosl" : "+D"(memory), "+c"(count) : "a"(element) : "memory");   // The instruction advances the destination and count registers and writes memory, so they are declared as read-write operands with a memory clobber.
+	#else
+		for(size_t i = 0; i < count; i++)
+		{
+			memory[i] = element;
+		}
+	#endif
+}
+}
diff --git a/src/System/Memory.hpp b/src/System/Memory.hpp
new file mode 100644
index 0000000..8d3a159
--- /dev/null
+++ b/src/System/Memory.hpp
@@ -0,0 +1,36 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef Memory_hpp
+#define Memory_hpp
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace sw
+{
+size_t memoryPageSize();   // System page size in bytes.
+
+void *allocate(size_t bytes, size_t alignment = 16);   // Aligned allocation; release with deallocate().
+void deallocate(void *memory);
+
+void *allocateExecutable(size_t bytes);   // Allocates memory that can be made executable using markExecutable()
+void markExecutable(void *memory, size_t bytes);
+void deallocateExecutable(void *memory, size_t bytes);   // Counterpart of allocateExecutable(); takes the size again.
+
+void clear(uint16_t *memory, uint16_t element, size_t count);   // 'count' is a number of elements, not bytes.
+void clear(uint32_t *memory, uint32_t element, size_t count);   // 'count' is a number of elements, not bytes.
+}
+
+#endif   // Memory_hpp
diff --git a/src/System/MutexLock.hpp b/src/System/MutexLock.hpp
new file mode 100644
index 0000000..65e9fa4
--- /dev/null
+++ b/src/System/MutexLock.hpp
@@ -0,0 +1,199 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_MutexLock_hpp
+#define sw_MutexLock_hpp
+
+#include "Thread.hpp"
+
+#if defined(__linux__)
+// Use a pthread mutex on Linux. Since many processes may use SwiftShader
+// at the same time it's best to just have the scheduler overhead.
+#include <pthread.h>
+
+namespace sw
+{
+	// Mutual-exclusion lock backed by a pthread mutex with default attributes.
+	class MutexLock
+	{
+	public:
+		MutexLock()
+		{
+			pthread_mutex_init(&mutex, NULL);
+		}
+
+		~MutexLock()
+		{
+			pthread_mutex_destroy(&mutex);
+		}
+
+		// A pthread_mutex_t may only be used at the address it was initialized at,
+		// and a copy would additionally destroy the same mutex twice. The non-Linux
+		// implementation is already non-copyable because it holds a std::atomic.
+		MutexLock(const MutexLock&) = delete;
+		MutexLock &operator=(const MutexLock&) = delete;
+
+		// Tries to take the lock without blocking. Returns true when it was acquired.
+		bool attemptLock()
+		{
+			return pthread_mutex_trylock(&mutex) == 0;
+		}
+
+		// Blocks until the lock has been acquired.
+		void lock()
+		{
+			pthread_mutex_lock(&mutex);
+		}
+
+		// Releases the lock.
+		void unlock()
+		{
+			pthread_mutex_unlock(&mutex);
+		}
+
+	private:
+		pthread_mutex_t mutex;
+	};
+}
+
+#else   // !__linux__
+
+#include <atomic>
+
+namespace sw
+{
+	// Spin lock built on an atomic flag. Waiters spin with exponentially growing
+	// delays and periodically yield the thread instead of blocking in the kernel.
+	class BackoffLock
+	{
+	public:
+		BackoffLock()
+		{
+			mutex = 0;
+		}
+
+		// Tries to take the lock without waiting. Returns true when it was acquired.
+		bool attemptLock()
+		{
+			// Only attempt the atomic exchange when a plain load sees the lock as free.
+			if(!isLocked())
+			{
+				if(mutex.exchange(true) == false)
+				{
+					return true;
+				}
+			}
+
+			return false;
+		}
+
+		// Spins until the lock has been acquired.
+		void lock()
+		{
+			int backoff = 1;
+
+			while(!attemptLock())
+			{
+				if(backoff <= 64)
+				{
+					// Busy-wait for 'backoff' rounds of 35 nop() calls each.
+					for(int i = 0; i < backoff; i++)
+					{
+						nop();
+						nop();
+						nop();
+						nop();
+						nop();
+
+						nop();
+						nop();
+						nop();
+						nop();
+						nop();
+
+						nop();
+						nop();
+						nop();
+						nop();
+						nop();
+
+						nop();
+						nop();
+						nop();
+						nop();
+						nop();
+
+						nop();
+						nop();
+						nop();
+						nop();
+						nop();
+
+						nop();
+						nop();
+						nop();
+						nop();
+						nop();
+
+						nop();
+						nop();
+						nop();
+						nop();
+						nop();
+					}
+
+					// Double the delay after every failed attempt.
+					backoff *= 2;
+				}
+				else
+				{
+					// The delay exceeded 64 rounds: give up the time slice and start over.
+					Thread::yield();
+
+					backoff = 1;
+				}
+			};
+		}
+
+		// Releases the lock.
+		void unlock()
+		{
+			mutex.store(false, std::memory_order_release);
+		}
+
+		// Returns whether the lock is currently held (by any thread).
+		bool isLocked()
+		{
+			return mutex.load(std::memory_order_acquire);
+		}
+
+	private:
+		struct
+		{
+			// Ensure that the mutex variable is on its own 64-byte cache line to avoid false sharing
+			// Padding must be public to avoid compiler warnings
+			volatile int padding1[16];
+			std::atomic<bool> mutex;
+			volatile int padding2[15];
+		};
+	};
+
+	// On platforms other than Linux, MutexLock is the spin lock above.
+	using MutexLock = BackoffLock;
+}
+
+#endif   // !__linux__
+
+// Scoped lock: acquires the given mutex on construction and releases it on
+// destruction. The pointer overload accepts null, in which case the guard does nothing.
+class LockGuard
+{
+public:
+	explicit LockGuard(sw::MutexLock &mutex) : mutex(&mutex)
+	{
+		mutex.lock();
+	}
+
+	explicit LockGuard(sw::MutexLock *mutex) : mutex(mutex)
+	{
+		if (mutex) mutex->lock();
+	}
+
+	// A copied guard would unlock, from two destructors, a mutex that was locked
+	// only once, so copying is disallowed.
+	LockGuard(const LockGuard&) = delete;
+	LockGuard &operator=(const LockGuard&) = delete;
+
+	~LockGuard()
+	{
+		if (mutex) mutex->unlock();
+	}
+
+protected:
+	sw::MutexLock *mutex;   // Guarded mutex, or null when the guard is inactive
+};
+
+#endif   // sw_MutexLock_hpp
diff --git a/src/System/Resource.cpp b/src/System/Resource.cpp
new file mode 100644
index 0000000..3a63810
--- /dev/null
+++ b/src/System/Resource.cpp
@@ -0,0 +1,184 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Resource.hpp"
+
+#include "Memory.hpp"
+#include "Debug.hpp"
+
+namespace sw
+{
+	// Creates a resource backed by a newly allocated buffer of 'bytes' bytes.
+	// It starts out unlocked, with no waiters, assigned to PUBLIC and not orphaned.
+	Resource::Resource(size_t bytes)
+		: size(bytes), blocked(0), accessor(PUBLIC), count(0), orphaned(false), buffer(allocate(bytes))
+	{
+	}
+
+	// Releases the buffer obtained in the constructor.
+	Resource::~Resource()
+	{
+		deallocate(buffer);
+	}
+
+	// Locks the resource on behalf of 'claimer' and returns its buffer.
+	// Waits while the resource is held (count > 0) by a different accessor. Locks
+	// taken by the accessor that already holds it are counted rather than blocked.
+	void *Resource::lock(Accessor claimer)
+	{
+		criticalSection.lock();
+
+		while(count > 0 && accessor != claimer)
+		{
+			// Register as a waiter and leave the critical section while waiting, so
+			// that the current holder can enter it to unlock and signal.
+			blocked++;
+			criticalSection.unlock();
+
+			unblock.wait();
+
+			criticalSection.lock();
+			blocked--;
+		}
+
+		accessor = claimer;
+		count++;
+
+		criticalSection.unlock();
+
+		return buffer;
+	}
+
+	// Drops every lock currently held by 'relinquisher' and then locks the resource
+	// for 'claimer', all while holding the critical section for the release part.
+	// Returns the buffer, or 0 when releasing caused an orphaned resource to delete
+	// itself (the object must not be touched afterwards in that case).
+	void *Resource::lock(Accessor relinquisher, Accessor claimer)
+	{
+		criticalSection.lock();
+
+		// Release
+		while(count > 0 && accessor == relinquisher)
+		{
+			count--;
+
+			if(count == 0)
+			{
+				if(blocked)
+				{
+					// Someone is waiting in lock(); wake a waiter.
+					unblock.signal();
+				}
+				else if(orphaned)
+				{
+					// destruct() was requested while locked and nobody is waiting:
+					// finish the deferred destruction now.
+					criticalSection.unlock();
+
+					delete this;
+
+					return 0;
+				}
+			}
+		}
+
+		// Acquire
+		while(count > 0 && accessor != claimer)
+		{
+			blocked++;
+			criticalSection.unlock();
+
+			unblock.wait();
+
+			criticalSection.lock();
+			blocked--;
+		}
+
+		accessor = claimer;
+		count++;
+
+		criticalSection.unlock();
+
+		return buffer;
+	}
+
+	// Releases one lock. When the last lock is released, a waiter is woken if there
+	// is one; otherwise an orphaned resource deletes itself.
+	void Resource::unlock()
+	{
+		criticalSection.lock();
+		ASSERT(count > 0);
+
+		count--;
+
+		if(count == 0)
+		{
+			if(blocked)
+			{
+				unblock.signal();
+			}
+			else if(orphaned)
+			{
+				// Deferred destruction requested through destruct().
+				criticalSection.unlock();
+
+				delete this;
+
+				return;
+			}
+		}
+
+		criticalSection.unlock();
+	}
+
+	// Releases every lock held by 'relinquisher'. Has no effect on the lock count
+	// when the resource is currently assigned to a different accessor.
+	// When the count reaches zero, a waiter is woken if there is one; otherwise an
+	// orphaned resource deletes itself.
+	void Resource::unlock(Accessor relinquisher)
+	{
+		criticalSection.lock();
+		ASSERT(count > 0);
+
+		while(count > 0 && accessor == relinquisher)
+		{
+			count--;
+
+			if(count == 0)
+			{
+				if(blocked)
+				{
+					unblock.signal();
+				}
+				else if(orphaned)
+				{
+					// Deferred destruction requested through destruct().
+					criticalSection.unlock();
+
+					delete this;
+
+					return;
+				}
+			}
+		}
+
+		criticalSection.unlock();
+	}
+
+	// Asynchronous destructor. Deletes the resource right away when it is neither
+	// locked nor waited for; otherwise marks it orphaned so that the unlock path
+	// deletes it once the last lock is released.
+	void Resource::destruct()
+	{
+		criticalSection.lock();
+
+		const bool inUse = (count != 0) || blocked;
+
+		if(inUse)
+		{
+			orphaned = true;
+		}
+
+		criticalSection.unlock();
+
+		if(!inUse)
+		{
+			delete this;
+		}
+	}
+
+	// Returns the buffer for reading without taking a lock.
+	const void *Resource::data() const
+	{
+		return buffer;
+	}
+}
diff --git a/src/System/Resource.hpp b/src/System/Resource.hpp
new file mode 100644
index 0000000..0acfa48
--- /dev/null
+++ b/src/System/Resource.hpp
@@ -0,0 +1,60 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Resource_hpp
+#define sw_Resource_hpp
+
+#include "MutexLock.hpp"
+
+namespace sw
+{
+	// Identifies on whose behalf a Resource is locked. Locks requested for the
+	// accessor that already holds the resource are counted; requests for a
+	// different accessor wait until it is released.
+	enum Accessor
+	{
+		PUBLIC,    // Application/API access
+		PRIVATE,   // Renderer access, shared by multiple threads if read-only
+		MANAGED,   // Renderer access, shared read/write access if partitioned
+		EXCLUSIVE
+	};
+
+	// A memory buffer with a counted, per-accessor lock and deferred destruction.
+	// Instances cannot be deleted directly (the destructor is private); call
+	// destruct() instead.
+	class Resource
+	{
+	public:
+		Resource(size_t bytes);
+
+		void destruct();   // Asynchronous destructor
+
+		// Lock for 'claimer'; returns the buffer.
+		void *lock(Accessor claimer);
+		// Release all locks held by 'relinquisher', then lock for 'claimer'.
+		void *lock(Accessor relinquisher, Accessor claimer);
+		// Release one lock.
+		void unlock();
+		// Release all locks held by 'relinquisher'.
+		void unlock(Accessor relinquisher);
+
+		const void *data() const;
+		const size_t size;   // Size passed to the constructor
+
+	private:
+		~Resource();   // Always call destruct() instead
+
+		MutexLock criticalSection;   // Protects the members below
+		Event unblock;               // Signaled when the lock count drops to zero with waiters present
+		volatile int blocked;        // Number of threads waiting in lock()
+
+		volatile Accessor accessor;  // Accessor that most recently locked the resource
+		volatile int count;          // Number of outstanding locks
+		bool orphaned;               // destruct() was called while still in use
+
+		void *buffer;
+	};
+}
+
+#endif   // sw_Resource_hpp
diff --git a/src/System/SharedLibrary.hpp b/src/System/SharedLibrary.hpp
new file mode 100644
index 0000000..8a8c3a1
--- /dev/null
+++ b/src/System/SharedLibrary.hpp
@@ -0,0 +1,171 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SharedLibrary_hpp
+#define SharedLibrary_hpp
+
+#if defined(_WIN32)
+	#include <Windows.h>
+#else
+	#include <dlfcn.h>
+#endif
+
+#include <string>
+
+// Platform-specific helpers, defined inline further down in this header.
+void *getLibraryHandle(const char *path);   // Handle to an already loaded library (see per-platform notes)
+void *loadLibrary(const char *path);        // Loads the library at 'path'
+void freeLibrary(void *library);            // Releases a handle returned by the functions above
+void *getProcAddress(void *library, const char *name);   // Looks up the symbol 'name' in 'library'
+
+// Searches 'names' (each appended to 'libraryDirectory') for a usable library and
+// returns its handle, or nullptr when none qualifies. All names are first tried
+// with getLibraryHandle(), and only then with loadLibrary(). When
+// 'mustContainSymbol' is given, libraries that do not export it are released and skipped.
+template<int n>
+void *loadLibrary(const std::string &libraryDirectory, const char *(&names)[n], const char *mustContainSymbol = nullptr)
+{
+	// Openers in order of preference; every name is tried with one before moving to the next.
+	void *(*const openers[2])(const char*) = {getLibraryHandle, loadLibrary};
+
+	for(void *(*open)(const char*) : openers)
+	{
+		for(int i = 0; i < n; i++)
+		{
+			const std::string candidate = libraryDirectory + names[i];
+			void *handle = open(candidate.c_str());
+
+			if(!handle)
+			{
+				continue;
+			}
+
+			if(!mustContainSymbol || getProcAddress(handle, mustContainSymbol))
+			{
+				return handle;
+			}
+
+			freeLibrary(handle);
+		}
+	}
+
+	return nullptr;
+}
+
+#if defined(_WIN32)
+	// Loads the library at 'path' with LoadLibrary and returns its module handle as void*.
+	inline void *loadLibrary(const char *path)
+	{
+		return (void*)LoadLibrary(path);
+	}
+
+	// Queries the module handle for 'path' with GetModuleHandleEx; the result stays
+	// NULL when the call does not fill it in.
+	inline void *getLibraryHandle(const char *path)
+	{
+		HMODULE module = NULL;
+		GetModuleHandleEx(0, path, &module);
+		return (void*)module;
+	}
+
+	// Releases a module handle with FreeLibrary.
+	inline void freeLibrary(void *library)
+	{
+		FreeLibrary((HMODULE)library);
+	}
+
+	// Looks up the exported symbol 'name' in 'library' with GetProcAddress.
+	inline void *getProcAddress(void *library, const char *name)
+	{
+		return (void*)GetProcAddress((HMODULE)library, name);
+	}
+
+	// Returns the directory (including the trailing separator) of the module that
+	// contains this function, or an empty string when it cannot be determined.
+	inline std::string getModuleDirectory()
+	{
+		// The address of a static local identifies the module this code lives in.
+		static int dummy_symbol = 0;
+
+		HMODULE module = NULL;
+		GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS, (LPCTSTR)&dummy_symbol, &module);
+
+		if(!module)
+		{
+			return "";
+		}
+
+		char filename[1024];
+		if(GetModuleFileName(module, filename, sizeof(filename)) == 0)
+		{
+			return "";
+		}
+
+		// Keep everything up to and including the last path separator.
+		const std::string modulePath(filename);
+		return modulePath.substr(0, modulePath.find_last_of("\\/") + 1);
+	}
+#else
+	// Loads the library at 'path' with dlopen, using lazy binding and local symbol visibility.
+	inline void *loadLibrary(const char *path)
+	{
+		return dlopen(path, RTLD_LAZY | RTLD_LOCAL);
+	}
+
+	// Returns a handle to the library at 'path' only if it is already resident in
+	// the process, or nullptr otherwise. On Android the library is simply opened.
+	inline void *getLibraryHandle(const char *path)
+	{
+		#ifdef __ANDROID__
+			// bionic doesn't support RTLD_NOLOAD before L
+			return dlopen(path, RTLD_NOW | RTLD_LOCAL);
+		#else
+			// RTLD_NOLOAD only tests for residency; it does not load the library.
+			void *resident = dlopen(path, RTLD_LAZY | RTLD_NOLOAD | RTLD_LOCAL);
+
+			if(resident)
+			{
+				// NOTE(review): 'resident' is never passed to dlclose(). If the
+				// RTLD_NOLOAD call itself takes a reference on this platform, one
+				// reference is leaked per call -- confirm against the dlopen docs.
+				return dlopen(path, RTLD_LAZY | RTLD_LOCAL);   // Increment reference count
+			}
+
+			return nullptr;
+		#endif
+	}
+
+	// Releases a handle with dlclose. A null handle is ignored.
+	inline void freeLibrary(void *library)
+	{
+		if(library)
+		{
+			dlclose(library);
+		}
+	}
+
+	// Looks up the symbol 'name' in 'library' with dlsym. Returns the address, or
+	// null when the lookup yields nothing.
+	inline void *getProcAddress(void *library, const char *name)
+	{
+		void *address = dlsym(library, name);
+
+		if(address)
+		{
+			return address;
+		}
+
+		// Fetch and discard the pending error message so it does not linger.
+		(void)dlerror();   // Silence the error
+
+		return nullptr;
+	}
+
+	// Returns the directory (including the trailing separator) of the module that
+	// contains this function, or an empty string when dladdr() fails.
+	inline std::string getModuleDirectory()
+	{
+		// The address of a static local identifies the module this code lives in.
+		static int dummy_symbol = 0;
+
+		Dl_info dl_info;
+		if(dladdr(&dummy_symbol, &dl_info) == 0)
+		{
+			return "";
+		}
+
+		// Keep everything up to and including the last path separator.
+		const std::string modulePath(dl_info.dli_fname);
+		return modulePath.substr(0, modulePath.find_last_of("\\/") + 1);
+	}
+#endif
+
+#endif   // SharedLibrary_hpp
diff --git a/src/System/Socket.cpp b/src/System/Socket.cpp
new file mode 100644
index 0000000..b098031
--- /dev/null
+++ b/src/System/Socket.cpp
@@ -0,0 +1,110 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Socket.hpp"
+
+#if defined(_WIN32)
+	#include <ws2tcpip.h>
+#else
+	#include <unistd.h>
+	#include <netdb.h>
+	#include <netinet/in.h>
+	#include <sys/select.h>
+#endif
+
+namespace sw
+{
+	// Wraps an existing socket descriptor; the destructor will close it.
+	Socket::Socket(SOCKET socket) : socket(socket)
+	{
+	}
+
+	// Creates a TCP/IPv4 socket and binds it to 'address':'port'.
+	// When the address cannot be resolved, the socket is left invalid
+	// (INVALID_SOCKET on Windows, -1 elsewhere).
+	Socket::Socket(const char *address, const char *port)
+	{
+		#if defined(_WIN32)
+			socket = INVALID_SOCKET;
+		#else
+			socket = -1;
+		#endif
+
+		addrinfo hints = {};
+		hints.ai_family = AF_INET;
+		hints.ai_socktype = SOCK_STREAM;
+		hints.ai_protocol = IPPROTO_TCP;
+		hints.ai_flags = AI_PASSIVE;   // Address is intended for bind()
+
+		addrinfo *info = 0;
+		getaddrinfo(address, port, &hints, &info);
+
+		if(info)
+		{
+			socket = ::socket(info->ai_family, info->ai_socktype, info->ai_protocol);
+			bind(socket, info->ai_addr, (int)info->ai_addrlen);
+
+			// The result list is allocated by getaddrinfo() and has to be released by the caller.
+			freeaddrinfo(info);
+		}
+	}
+
+	// Closes the socket descriptor.
+	Socket::~Socket()
+	{
+		#if defined(_WIN32)
+			closesocket(socket);
+		#else
+			close(socket);
+		#endif
+	}
+
+	// Puts the socket in listening state with the given backlog.
+	void Socket::listen(int backlog)
+	{
+		::listen(socket, backlog);
+	}
+
+	// Waits up to 'us' microseconds for the socket to become readable.
+	// Returns true when ::select() reports at least one ready descriptor.
+	bool Socket::select(int us)
+	{
+		// Only this socket is watched, and only in the read set.
+		fd_set sockets;
+		FD_ZERO(&sockets);
+		FD_SET(socket, &sockets);
+
+		// Split the timeout into whole seconds and remaining microseconds.
+		timeval timeout = {us / 1000000, us % 1000000};
+
+		return ::select(FD_SETSIZE, &sockets, 0, 0, &timeout) >= 1;
+	}
+
+	// Accepts a connection and returns it as a heap-allocated Socket.
+	// The result of ::accept() is wrapped without being checked, so the returned
+	// object may hold an invalid descriptor.
+	Socket *Socket::accept()
+	{
+		return new Socket(::accept(socket, 0, 0));
+	}
+
+	// Receives up to 'length' bytes into 'buffer' and returns the result of recv().
+	int Socket::receive(char *buffer, int length)
+	{
+		return recv(socket, buffer, length, 0);
+	}
+
+	// Sends 'length' bytes from 'buffer'. The result of ::send() is ignored, so
+	// errors and partial sends are not reported.
+	void Socket::send(const char *buffer, int length)
+	{
+		::send(socket, buffer, length, 0);
+	}
+
+	// Initializes Winsock 2.2 on Windows; does nothing on other platforms.
+	void Socket::startup()
+	{
+		#if defined(_WIN32)
+			WSADATA winsockData;
+			WSAStartup(MAKEWORD(2, 2), &winsockData);
+		#endif
+	}
+
+	// Counterpart of startup(): calls WSACleanup() on Windows; does nothing elsewhere.
+	void Socket::cleanup()
+	{
+		#if defined(_WIN32)
+			WSACleanup();
+		#endif
+	}
+}
diff --git a/src/System/Socket.hpp b/src/System/Socket.hpp
new file mode 100644
index 0000000..b6b9abd
--- /dev/null
+++ b/src/System/Socket.hpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Socket_hpp
+#define sw_Socket_hpp
+
+#if defined(_WIN32)
+	#include <winsock2.h>
+#else
+	#include <sys/socket.h>
+	typedef int SOCKET;
+#endif
+
+namespace sw
+{
+	// Thin wrapper around a TCP socket descriptor which is closed on destruction.
+	class Socket
+	{
+	public:
+		Socket(SOCKET socket);                           // Wrap an existing descriptor
+		Socket(const char *address, const char *port);   // Create a socket bound to address:port
+		~Socket();
+
+		void listen(int backlog = 1);
+		bool select(int us);   // Wait up to 'us' microseconds for readability
+		Socket *accept();      // Returns a new heap-allocated Socket
+		
+		int receive(char *buffer, int length);
+		void send(const char *buffer, int length);
+
+		// Process-wide network stack initialization/teardown (only does work on Windows).
+		static void startup();
+		static void cleanup();
+
+	private:
+		SOCKET socket;
+	};
+}
+
+#endif   // sw_Socket_hpp
diff --git a/src/System/Thread.cpp b/src/System/Thread.cpp
new file mode 100644
index 0000000..df9a0b7
--- /dev/null
+++ b/src/System/Thread.cpp
@@ -0,0 +1,91 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Thread.hpp"
+
+namespace sw
+{
+	// Starts a new thread running threadFunction(parameters).
+	// The constructor returns only after the new thread has copied its start-up
+	// data, because that data lives on this stack frame.
+	// If the thread cannot be created, the object is marked as already joined so
+	// that neither this constructor nor join() waits for a thread that does not exist.
+	Thread::Thread(void (*threadFunction)(void *parameters), void *parameters)
+	{
+		Event init;
+		Entry entry = {threadFunction, parameters, &init};
+
+		#if defined(_WIN32)
+			handle = CreateThread(NULL, 1024 * 1024, startFunction, &entry, 0, NULL);
+			const bool started = (handle != NULL);
+		#else
+			const bool started = (pthread_create(&handle, NULL, startFunction, &entry) == 0);
+		#endif
+
+		if(!started)
+		{
+			// Nobody would ever signal 'init', and 'handle' is not joinable.
+			hasJoined = true;
+			return;
+		}
+
+		init.wait();
+	}
+
+	// Joins the thread if that has not happened yet, so destruction blocks until
+	// the thread function has returned.
+	Thread::~Thread()
+	{
+		join();   // Make threads exit before deleting them to not block here
+	}
+
+	// Waits for the thread to finish and releases its handle. Calls after the
+	// first one return immediately.
+	void Thread::join()
+	{
+		if(hasJoined)
+		{
+			return;
+		}
+
+		#if defined(_WIN32)
+			WaitForSingleObject(handle, INFINITE);
+			CloseHandle(handle);
+		#else
+			pthread_join(handle, NULL);
+		#endif
+
+		hasJoined = true;
+	}
+
+	#if defined(_WIN32)
+		// Thread entry point (Windows). 'parameters' points to the Entry on the
+		// constructor's stack, so it is copied before the constructor is released.
+		unsigned long __stdcall Thread::startFunction(void *parameters)
+		{
+			Entry entry = *(Entry*)parameters;
+			entry.init->signal();   // The constructor may return now; do not touch 'parameters' afterwards
+			entry.threadFunction(entry.threadParameters);
+			return 0;
+		}
+	#else
+		// Thread entry point (pthreads). 'parameters' points to the Entry on the
+		// constructor's stack, so it is copied before the constructor is released.
+		void *Thread::startFunction(void *parameters)
+		{
+			Entry entry = *(Entry*)parameters;
+			entry.init->signal();   // The constructor may return now; do not touch 'parameters' afterwards
+			entry.threadFunction(entry.threadParameters);
+			return nullptr;
+		}
+	#endif
+
+	// Creates an event in the non-signaled state.
+	Event::Event()
+	{
+		#if defined(_WIN32)
+			// Auto-reset (bManualReset = FALSE), initially non-signaled, unnamed.
+			handle = CreateEvent(NULL, FALSE, FALSE, NULL);
+		#else
+			pthread_cond_init(&handle, NULL);
+			pthread_mutex_init(&mutex, NULL);
+			signaled = false;
+		#endif
+	}
+
+	// Releases the underlying OS synchronization objects.
+	Event::~Event()
+	{
+		#if defined(_WIN32)
+			CloseHandle(handle);
+		#else
+			pthread_cond_destroy(&handle);
+			pthread_mutex_destroy(&mutex);
+		#endif
+	}
+}
diff --git a/src/System/Thread.hpp b/src/System/Thread.hpp
new file mode 100644
index 0000000..b8280f1
--- /dev/null
+++ b/src/System/Thread.hpp
@@ -0,0 +1,338 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Thread_hpp
+#define sw_Thread_hpp
+
+#if defined(_WIN32)
+	#ifndef WIN32_LEAN_AND_MEAN
+		#define WIN32_LEAN_AND_MEAN
+	#endif
+	#include <windows.h>
+	#include <intrin.h>
+#else
+	#include <pthread.h>
+	#include <sched.h>
+	#include <unistd.h>
+	#define TLS_OUT_OF_INDEXES (pthread_key_t)(~0)
+#endif
+
+#include <stdlib.h>
+
+#if defined(__clang__)
+#if __has_include(<atomic>) // clang has an explicit check for the availability of atomic
+#define USE_STD_ATOMIC 1
+#endif
+// atomic is available in C++11 or newer, and in Visual Studio 2012 or newer
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1700)) || (__cplusplus >= 201103L)
+#define USE_STD_ATOMIC 1
+#endif
+
+#if USE_STD_ATOMIC
+#include <atomic>
+#endif
+
+namespace sw
+{
+	class Event;
+
+	// Wrapper around a native thread (Win32 or pthreads), plus static helpers for
+	// yielding, sleeping and thread-local storage.
+	class Thread
+	{
+	public:
+		// Starts a thread that runs threadFunction(parameters).
+		Thread(void (*threadFunction)(void *parameters), void *parameters);
+
+		// Joins the thread if join() has not been called yet.
+		~Thread();
+
+		// Waits for the thread function to return.
+		void join();
+
+		static void yield();
+		static void sleep(int milliseconds);
+
+		#if defined(_WIN32)
+			typedef DWORD LocalStorageKey;
+		#else
+			typedef pthread_key_t LocalStorageKey;
+		#endif
+
+		// Thread-local storage helpers. Storage is obtained with malloc(), which is
+		// why free() is the default destructor.
+		static LocalStorageKey allocateLocalStorageKey(void (*destructor)(void *storage) = free);
+		static void freeLocalStorageKey(LocalStorageKey key);
+		static void *allocateLocalStorage(LocalStorageKey key, size_t size);
+		static void *getLocalStorage(LocalStorageKey key);
+		static void freeLocalStorage(LocalStorageKey key);
+
+	private:
+		// Start-up data handed to the new thread.
+		struct Entry
+		{
+			void (*const threadFunction)(void *parameters);
+			void *threadParameters;
+			Event *init;   // Signaled by the new thread once it has copied this Entry
+		};
+
+		#if defined(_WIN32)
+			static unsigned long __stdcall startFunction(void *parameters);
+			HANDLE handle;
+		#else
+			static void *startFunction(void *parameters);
+			pthread_t handle;
+		#endif
+
+		bool hasJoined = false;   // Set by join() so that the thread is joined only once
+	};
+
+	// Auto-resetting event: a signal releases a wait() call, which clears the
+	// signaled state again. A signal sent while nobody waits is remembered.
+	class Event
+	{
+		friend class Thread;
+
+	public:
+		Event();
+
+		~Event();
+
+		void signal();   // Set the event
+		void wait();     // Block until the event is set, then reset it
+
+	private:
+		#if defined(_WIN32)
+			HANDLE handle;
+		#else
+			pthread_cond_t handle;
+			pthread_mutex_t mutex;    // Protects 'signaled'
+			volatile bool signaled;
+		#endif
+	};
+
+	// Atomically store 'value' in '*target'; only available in PERF_PROFILE builds.
+	#if PERF_PROFILE
+	int64_t atomicExchange(int64_t volatile *target, int64_t value);
+	int atomicExchange(int volatile *target, int value);
+	#endif
+
+	// Atomic arithmetic on '*value' / '*target'; each returns the updated value.
+	int atomicIncrement(int volatile *value);
+	int atomicDecrement(int volatile *value);
+	int atomicAdd(int volatile *target, int value);
+	// Executes a single no-op instruction.
+	void nop();
+}
+
+namespace sw
+{
+	// Gives up the remainder of the calling thread's time slice.
+	inline void Thread::yield()
+	{
+		#if defined(_WIN32)
+			Sleep(0);
+		#elif defined(__APPLE__)
+			pthread_yield_np();
+		#else
+			sched_yield();
+		#endif
+	}
+
+	// Suspends the calling thread for 'milliseconds' milliseconds.
+	inline void Thread::sleep(int milliseconds)
+	{
+		#if defined(_WIN32)
+			Sleep(milliseconds);
+		#else
+			// usleep() takes microseconds.
+			// NOTE(review): some systems reject arguments of one second or more and
+			// the multiplication can overflow for very large values -- confirm callers.
+			usleep(1000 * milliseconds);
+		#endif
+	}
+
+	// Allocates a thread-local storage key. On pthreads, 'destructor' is registered
+	// as the key's destructor; the Windows path does not use it.
+	// Returns TLS_OUT_OF_INDEXES when no key could be allocated, which the other
+	// local storage helpers treat as an invalid key.
+	inline Thread::LocalStorageKey Thread::allocateLocalStorageKey(void (*destructor)(void *storage))
+	{
+		#if defined(_WIN32)
+			return TlsAlloc();
+		#else
+			LocalStorageKey key;
+
+			// On failure 'key' is left unset, so report the failure explicitly
+			// instead of returning an indeterminate value.
+			if(pthread_key_create(&key, destructor) != 0)
+			{
+				return TLS_OUT_OF_INDEXES;
+			}
+
+			return key;
+		#endif
+	}
+
+	// Releases a key obtained from allocateLocalStorageKey().
+	inline void Thread::freeLocalStorageKey(LocalStorageKey key)
+	{
+		#if defined(_WIN32)
+			TlsFree(key);
+		#else
+			pthread_key_delete(key);   // Using an invalid key is an error but not undefined behavior.
+		#endif
+	}
+
+	// Allocates 'size' bytes with malloc() and stores the pointer in the calling
+	// thread's slot for 'key', freeing whatever was stored there before.
+	// Returns the new storage, or nullptr for an invalid key. The malloc() result
+	// is not checked, so it may also be null.
+	inline void *Thread::allocateLocalStorage(LocalStorageKey key, size_t size)
+	{
+		if(key == TLS_OUT_OF_INDEXES)
+		{
+			return nullptr;
+		}
+
+		// Release the previous allocation of this thread, if any.
+		freeLocalStorage(key);
+
+		void *storage = malloc(size);
+
+		#if defined(_WIN32)
+			TlsSetValue(key, storage);
+		#else
+			pthread_setspecific(key, storage);
+		#endif
+
+		return storage;
+	}
+
+	// Returns the pointer stored in the calling thread's slot for 'key'.
+	inline void *Thread::getLocalStorage(LocalStorageKey key)
+	{
+		#if defined(_WIN32)
+			return TlsGetValue(key);
+		#else
+			if(key == TLS_OUT_OF_INDEXES)   // Avoid undefined behavior.
+			{
+				return nullptr;
+			}
+
+			return pthread_getspecific(key);
+		#endif
+	}
+
+	// Frees the calling thread's storage for 'key' and clears the slot.
+	inline void Thread::freeLocalStorage(LocalStorageKey key)
+	{
+		free(getLocalStorage(key));
+
+		// Clear the slot so the stale pointer cannot be returned or freed again.
+		#if defined(_WIN32)
+			TlsSetValue(key, nullptr);
+		#else
+			pthread_setspecific(key, nullptr);
+		#endif
+	}
+
+	// Sets the event, releasing a thread blocked in wait() (or the next one to call it).
+	inline void Event::signal()
+	{
+		#if defined(_WIN32)
+			SetEvent(handle);
+		#else
+			// The flag is changed under the mutex so a concurrent wait() cannot miss it.
+			pthread_mutex_lock(&mutex);
+			signaled = true;
+			pthread_cond_signal(&handle);
+			pthread_mutex_unlock(&mutex);
+		#endif
+	}
+
+	// Blocks until the event is set, then resets it.
+	inline void Event::wait()
+	{
+		#if defined(_WIN32)
+			WaitForSingleObject(handle, INFINITE);
+		#else
+			pthread_mutex_lock(&mutex);
+			// Re-check the flag in a loop: pthread_cond_wait() may wake up spuriously.
+			while(!signaled) pthread_cond_wait(&handle, &mutex);
+			signaled = false;
+			pthread_mutex_unlock(&mutex);
+		#endif
+	}
+
+	#if PERF_PROFILE
+	// Atomically stores 'value' in '*target' and returns the previous contents.
+	inline int64_t atomicExchange(volatile int64_t *target, int64_t value)
+	{
+		#if defined(_WIN32)
+			return InterlockedExchange64(target, value);
+		#else
+			// The compiler builtin handles the full 64-bit width on every target;
+			// the former inline assembly kept the result in a 32-bit int and used
+			// an instruction mnemonic ('xchg8') that assemblers do not accept.
+			return __atomic_exchange_n(target, value, __ATOMIC_SEQ_CST);
+		#endif
+	}
+
+	// Atomically stores 'value' in '*target' and returns the previous contents.
+	inline int atomicExchange(volatile int *target, int value)
+	{
+		#if defined(_WIN32)
+			return InterlockedExchange((volatile long*)target, (long)value);
+		#else
+			// x86-only inline assembly: 'value' enters in the register used for
+			// 'ret' (constraint "0") and is swapped with the memory at 'target'.
+			int ret;
+			__asm__ __volatile__("lock; xchgl %x0,(%x1)" : "=r" (ret) :"r" (target), "0" (value) : "memory" );
+			return ret;
+		#endif
+	}
+	#endif
+
+	// Atomically increments '*value' and returns the incremented value.
+	inline int atomicIncrement(volatile int *value)
+	{
+		#if defined(_WIN32)
+			return InterlockedIncrement((volatile long*)value);
+		#else
+			return __sync_add_and_fetch(value, 1);
+		#endif
+	}
+
+	// Atomically decrements '*value' and returns the decremented value.
+	inline int atomicDecrement(volatile int *value)
+	{
+		#if defined(_WIN32)
+			return InterlockedDecrement((volatile long*)value);
+		#else
+			return __sync_sub_and_fetch(value, 1);
+		#endif
+	}
+
+	// Atomically adds 'value' to '*target' and returns the resulting sum.
+	inline int atomicAdd(volatile int* target, int value)
+	{
+		#if defined(_WIN32)
+			// InterlockedExchangeAdd returns the previous value, so add 'value' again.
+			return InterlockedExchangeAdd((volatile long*)target, value) + value;
+		#else
+			return __sync_add_and_fetch(target, value);
+		#endif
+	}
+
+	// Executes a single no-op instruction; used for short busy-wait delays.
+	inline void nop()
+	{
+		#if defined(_WIN32)
+			__nop();
+		#else
+			__asm__ __volatile__ ("nop");
+		#endif
+	}
+
+	#if USE_STD_ATOMIC
+		// Integer with atomic access, implemented on top of std::atomic<int>.
+		// Loads use acquire ordering, stores release ordering and read-modify-write
+		// operations acquire-release ordering.
+		class AtomicInt
+		{
+		public:
+			AtomicInt() : ai() {}
+			AtomicInt(int i) : ai(i) {}
+
+			inline operator int() const { return ai.load(std::memory_order_acquire); }
+			inline void operator=(const AtomicInt& i) { ai.store(i.ai.load(std::memory_order_acquire), std::memory_order_release); }
+			inline void operator=(int i) { ai.store(i, std::memory_order_release); }
+			inline void operator--() { ai.fetch_sub(1, std::memory_order_acq_rel); }
+			inline void operator++() { ai.fetch_add(1, std::memory_order_acq_rel); }
+			// NOTE: unlike the built-in postfix operators, these return the updated
+			// value (fetch_* yields the old value, which is then adjusted by one).
+			inline int operator--(int) { return ai.fetch_sub(1, std::memory_order_acq_rel) - 1; }
+			inline int operator++(int) { return ai.fetch_add(1, std::memory_order_acq_rel) + 1; }
+			inline void operator-=(int i) { ai.fetch_sub(i, std::memory_order_acq_rel); }
+			inline void operator+=(int i) { ai.fetch_add(i, std::memory_order_acq_rel); }
+		private:
+			std::atomic<int> ai;
+		};
+	#else
+		// Fallback for toolchains without <atomic>: an integer whose modifications
+		// go through the sw::atomic* helpers.
+		// NOTE(review): the assignment operators call sw::atomicExchange(), which
+		// this header only provides when PERF_PROFILE is enabled.
+		class AtomicInt
+		{
+		public:
+			// Start at zero, matching the std::atomic-based variant, which
+			// value-initializes its counter.
+			AtomicInt() : vi(0) {}
+			AtomicInt(int i) : vi(i) {}
+
+			inline operator int() const { return vi; } // Note: this isn't a guaranteed atomic operation
+			inline void operator=(const AtomicInt& i) { sw::atomicExchange(&vi, i.vi); }
+			inline void operator=(int i) { sw::atomicExchange(&vi, i); }
+			inline void operator--() { sw::atomicDecrement(&vi); }
+			inline void operator++() { sw::atomicIncrement(&vi); }
+			// NOTE: unlike the built-in postfix operators, these return the updated value.
+			inline int operator--(int) { return sw::atomicDecrement(&vi); }
+			inline int operator++(int) { return sw::atomicIncrement(&vi); }
+			inline void operator-=(int i) { sw::atomicAdd(&vi, -i); }
+			inline void operator+=(int i) { sw::atomicAdd(&vi, i); }
+		private:
+			volatile int vi;
+		};
+	#endif
+}
+
+#endif   // sw_Thread_hpp
diff --git a/src/System/Timer.cpp b/src/System/Timer.cpp
new file mode 100644
index 0000000..8ff2cf3
--- /dev/null
+++ b/src/System/Timer.cpp
@@ -0,0 +1,95 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Timer.hpp"
+
+#if !defined(__i386__) && defined(_M_IX86)
+	#define __i386__ 1
+#endif
+
+#if !defined(__x86_64__) && (defined(_M_AMD64) || defined (_M_X64))
+	#define __x86_64__ 1
+#endif
+
+#if defined(_WIN32)
+	#ifndef WIN32_LEAN_AND_MEAN
+		#define WIN32_LEAN_AND_MEAN
+	#endif
+	#include <windows.h>
+	#include <intrin.h>
+#else
+	#include <sys/time.h>
+	#if defined(__i386__) || defined(__x86_64__)
+		#include <x86intrin.h>
+	#endif
+#endif
+
+namespace sw
+{
+	// Nothing to set up: Timer holds no state.
+	Timer::Timer()
+	{
+	}
+
+	// Nothing to release.
+	Timer::~Timer()
+	{
+	}
+
+	// Returns the current time in seconds as a double: the performance counter
+	// divided by its frequency on Windows, and gettimeofday() elsewhere.
+	double Timer::seconds()
+	{
+		#if defined(_WIN32)
+			return (double)counter() / (double)frequency();
+		#else
+			timeval t;
+			gettimeofday(&t, 0);
+			return (double)t.tv_sec + (double)t.tv_usec * 1.0e-6;
+		#endif
+	}
+
+	// Returns the processor's time-stamp counter, or 0 on architectures for which
+	// no counter is read here.
+	int64_t Timer::ticks()
+	{
+		#if defined(_WIN32) || defined(__i386__) || defined(__x86_64__)
+			// Use the compiler intrinsic (<intrin.h> / <x86intrin.h>) on all x86
+			// targets. The previous inline assembly used the "=A" constraint, which
+			// denotes the EDX:EAX pair only on 32-bit x86; on x86-64 it selects a
+			// single register and half of the counter was lost.
+			return __rdtsc();
+		#else
+			return 0;
+		#endif
+	}
+
+	// Returns a counter that advances frequency() times per second: the performance
+	// counter on Windows, and the gettimeofday() time in microseconds elsewhere.
+	int64_t Timer::counter()
+	{
+		#if defined(_WIN32)
+			int64_t counter;
+			QueryPerformanceCounter((LARGE_INTEGER*)&counter);
+			return counter;
+		#else
+			timeval t;
+			gettimeofday(&t, 0);
+			// Widen before multiplying: where tv_sec is a 32-bit type, seconds
+			// times one million does not fit and would overflow.
+			return (int64_t)t.tv_sec * 1000000 + t.tv_usec;
+		#endif
+	}
+
+	// Returns the number of counter() increments per second.
+	int64_t Timer::frequency()
+	{
+		#if defined(_WIN32)
+			int64_t frequency;
+			QueryPerformanceFrequency((LARGE_INTEGER*)&frequency);
+			return frequency;
+		#else
+			return 1000000;   // gettimeofday uses microsecond resolution
+		#endif
+	}
+}
diff --git a/src/System/Timer.hpp b/src/System/Timer.hpp
new file mode 100644
index 0000000..977c877
--- /dev/null
+++ b/src/System/Timer.hpp
@@ -0,0 +1,37 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Timer_hpp
+#define sw_Timer_hpp
+
+#include "Types.hpp"
+
+namespace sw
+{
+	// Collection of static time queries; instances carry no state.
+	class Timer
+	{
+	public:
+		Timer();
+
+		~Timer();
+
+		static double seconds();   // Current time in seconds
+		static int64_t ticks();    // Processor time-stamp counter (0 where unsupported)
+
+		static int64_t counter();     // Monotonically scaled counter, in units of 1/frequency() seconds
+		static int64_t frequency();   // Number of counter() increments per second
+	};
+}
+
+#endif   // sw_Timer_hpp
diff --git a/src/System/Types.hpp b/src/System/Types.hpp
new file mode 100644
index 0000000..cd08ed5
--- /dev/null
+++ b/src/System/Types.hpp
@@ -0,0 +1,157 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_Types_hpp
+#define sw_Types_hpp
+
+#include <limits>
+#include <type_traits>
+
+// GCC warns against bitfields not fitting the entire range of an enum with a fixed underlying type of unsigned int, which gets promoted to an error with -Werror and cannot be suppressed.
+// However, GCC already defaults to using unsigned int as the underlying type of an unscoped enum without a fixed underlying type. So we can just omit it.
+#if defined(__GNUC__) && !defined(__clang__)
+namespace {enum E {}; static_assert(!std::numeric_limits<std::underlying_type<E>::type>::is_signed, "expected unscoped enum whose underlying type is not fixed to be unsigned");}
+#define ENUM_UNDERLYING_TYPE_UNSIGNED_INT
+#else
+#define ENUM_UNDERLYING_TYPE_UNSIGNED_INT : unsigned int
+#endif
+
+#if defined(_MSC_VER)   // On MSVC the fixed-width integer names are declared here instead of including <stdint.h>.
+	typedef signed __int8 int8_t;
+	typedef signed __int16 int16_t;
+	typedef signed __int32 int32_t;
+	typedef signed __int64 int64_t;
+	typedef unsigned __int8 uint8_t;
+	typedef unsigned __int16 uint16_t;
+	typedef unsigned __int32 uint32_t;
+	typedef unsigned __int64 uint64_t;
+	#define ALIGN(bytes, type) __declspec(align(bytes)) type   // Alignment specifier goes before the type.
+#else
+	#include <stdint.h>
+	#define ALIGN(bytes, type) type __attribute__((aligned(bytes)))   // Alignment attribute goes after the type.
+#endif
+
+namespace sw
+{
+	typedef ALIGN(1, uint8_t) byte;   // Unsigned scalars: the ALIGN argument is the alignment in bytes.
+	typedef ALIGN(2, uint16_t) word;
+	typedef ALIGN(4, uint32_t) dword;
+	typedef ALIGN(8, uint64_t) qword;
+	typedef ALIGN(16, uint64_t) qword2[2];   // Array types: the name suffix is the element count.
+	typedef ALIGN(4, uint8_t) byte4[4];
+	typedef ALIGN(8, uint8_t) byte8[8];
+	typedef ALIGN(16, uint8_t) byte16[16];
+	typedef ALIGN(8, uint16_t) word4[4];
+	typedef ALIGN(8, uint32_t) dword2[2];
+	typedef ALIGN(16, uint32_t) dword4[4];
+	typedef ALIGN(16, uint64_t) xword[2];   // Same element type, count and alignment as qword2.
+
+	typedef ALIGN(1, int8_t) sbyte;   // Signed 8-bit scalar and arrays.
+	typedef ALIGN(4, int8_t) sbyte4[4];
+	typedef ALIGN(8, int8_t) sbyte8[8];
+	typedef ALIGN(16, int8_t) sbyte16[16];
+	typedef ALIGN(8, short) short4[4];
+	typedef ALIGN(8, unsigned short) ushort4[4];
+	typedef ALIGN(16, short) short8[8];
+	typedef ALIGN(16, unsigned short) ushort8[8];
+	typedef ALIGN(8, int) int2[2];
+	typedef ALIGN(8, unsigned int) uint2[2];
+	typedef ALIGN(16, unsigned int) uint4[4];
+
+	typedef ALIGN(8, float) float2[2];   // Two floats with 8-byte alignment.
+
+	ALIGN(16, struct int4   // Four ints with 16-byte alignment; operator[] maps 0..3 onto x..w.
+	{
+		int x;
+		int y;
+		int z;
+		int w;
+
+		int &operator[](int i)   // No bounds check; relies on x..w being stored contiguously.
+		{
+			return (&x)[i];
+		}
+
+		const int &operator[](int i) const
+		{
+			return (&x)[i];
+		}
+
+		bool operator!=(const int4 &rhs) const   // const-qualified so that const values can be compared as well.
+		{
+			return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
+		}
+
+		bool operator==(const int4 &rhs) const   // True only when all four components match.
+		{
+			return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
+		}
+	});
+
+	ALIGN(16, struct float4   // Four floats with 16-byte alignment; operator[] maps 0..3 onto x..w.
+	{
+		float x;
+		float y;
+		float z;
+		float w;
+
+		float &operator[](int i)   // No bounds check; relies on x..w being stored contiguously.
+		{
+			return (&x)[i];
+		}
+
+		const float &operator[](int i) const
+		{
+			return (&x)[i];
+		}
+
+		bool operator!=(const float4 &rhs) const   // const-qualified so that const values can be compared as well.
+		{
+			return x != rhs.x || y != rhs.y || z != rhs.z || w != rhs.w;
+		}
+
+		bool operator==(const float4 &rhs) const   // Exact floating-point comparison of all four components.
+		{
+			return x == rhs.x && y == rhs.y && z == rhs.z && w == rhs.w;
+		}
+	});
+
+	// Assembles a float4 from four individual components.
+	inline float4 vector(float x, float y, float z, float w)
+	{
+		float4 result;
+		result.w = w;
+		result.z = z;
+		result.y = y;
+		result.x = x;
+
+		return result;
+	}
+
+	// Broadcasts a single scalar into every component of a float4.
+	inline float4 replicate(float f)
+	{
+		float4 splat;
+		splat.w = f;
+		splat.z = f;
+		splat.y = f;
+		splat.x = f;
+
+		return splat;
+	}
+
+	#define OFFSET(s,m) (int)(size_t)&reinterpret_cast<const volatile char&>((((s*)0)->m))   // Byte offset of member m within type s, as an int.
+}
+
+#endif   // sw_Types_hpp
diff --git a/src/System/Version.h b/src/System/Version.h
new file mode 100644
index 0000000..72bd15d
--- /dev/null
+++ b/src/System/Version.h
@@ -0,0 +1,24 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define MAJOR_VERSION 4
+#define MINOR_VERSION 1
+#define BUILD_VERSION 0
+#define BUILD_REVISION 2
+
+#define STRINGIFY(x) #x
+#define MACRO_STRINGIFY(x) STRINGIFY(x)
+
+#define REVISION_STRING MACRO_STRINGIFY(BUILD_REVISION)
+#define VERSION_STRING MACRO_STRINGIFY(MAJOR_VERSION) "." MACRO_STRINGIFY(MINOR_VERSION) "." MACRO_STRINGIFY(BUILD_VERSION) "." MACRO_STRINGIFY(BUILD_REVISION)
diff --git a/src/WSI/FrameBuffer.cpp b/src/WSI/FrameBuffer.cpp
new file mode 100644
index 0000000..7a8ddc1
--- /dev/null
+++ b/src/WSI/FrameBuffer.cpp
@@ -0,0 +1,638 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBuffer.hpp"
+
+#include "Renderer/Surface.hpp"
+#include "Reactor/Reactor.hpp"
+#include "Common/Timer.hpp"
+#include "Common/Debug.hpp"
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define ASYNCHRONOUS_BLIT false   // FIXME: Currently leads to rare race conditions
+
+namespace sw
+{
+	extern bool forceWindowed;
+
+	FrameBuffer::Cursor FrameBuffer::cursor = {};
+	bool FrameBuffer::topLeftOrigin = false;
+
+	FrameBuffer::FrameBuffer(int width, int height, bool fullscreen, bool topLeftOrigin)
+	{
+		// topLeftOrigin is a static member: the most recently constructed FrameBuffer decides it.
+		this->topLeftOrigin = topLeftOrigin;
+		this->width = width;
+		this->height = height;
+		windowed = forceWindowed || !fullscreen;
+
+		// No native buffer is mapped yet; format and stride start out with default values.
+		framebuffer = nullptr;
+		stride = 0;
+		format = FORMAT_X8R8G8B8;
+
+		blitState = {};
+		blitRoutine = nullptr;
+		blitFunction = nullptr;
+
+		if(ASYNCHRONOUS_BLIT)
+		{
+			terminate = false;
+			FrameBuffer *self = this;
+			blitThread = new Thread(threadFunction, &self);   // threadFunction() expects a FrameBuffer**.
+		}
+	}
+
+	FrameBuffer::~FrameBuffer()   // Stops the blit thread (if one was started) and frees the generated routine.
+	{
+		if(ASYNCHRONOUS_BLIT)
+		{
+			terminate = true;      // Must be set before waking the thread so that it sees the exit request.
+			blitEvent.signal();    // Wakes threadFunction() out of blitEvent.wait().
+			blitThread->join();
+			delete blitThread;
+		}
+
+		delete blitRoutine;   // Still nullptr if copyLocked() never ran.
+	}
+
+	void FrameBuffer::setCursorImage(sw::Surface *cursorImage)
+	{
+		if(!cursorImage)
+		{
+			// No image: a zero-sized cursor makes copyRoutine() leave out the cursor blending pass.
+			cursor.width = 0;
+			cursor.height = 0;
+			return;
+		}
+
+		// The pixel pointer is retained even though the surface is unlocked again right away.
+		cursor.image = cursorImage->lockExternal(0, 0, 0, sw::LOCK_READONLY, sw::PUBLIC);
+		cursorImage->unlockExternal();
+		cursor.width = cursorImage->getWidth();
+		cursor.height = cursorImage->getHeight();
+	}
+
+	void FrameBuffer::setCursorOrigin(int x0, int y0)
+	{
+		cursor.hotspotY = y0;   // copy() subtracts the hotspot from the cursor position.
+		cursor.hotspotX = x0;
+	}
+
+	void FrameBuffer::setCursorPosition(int x, int y)
+	{
+		cursor.positionY = y;   // Only recorded here; copy() turns it into cursor.x / cursor.y.
+		cursor.positionX = x;
+	}
+
+	void FrameBuffer::copy(sw::Surface *source)
+	{
+		// Nothing to present, or the native buffer could not be locked.
+		if(!source || !lock())
+		{
+			return;
+		}
+
+		const int pitch = source->getInternalPitchB();
+
+		// Describe the blit that is needed; copyLocked() regenerates the routine when this changes.
+		updateState = {};
+		updateState.width = width;
+		updateState.height = height;
+		updateState.destFormat = format;
+		updateState.destStride = stride;
+		updateState.sourceFormat = source->getInternalFormat();
+		updateState.sourceStride = topLeftOrigin ? pitch : -pitch;
+		updateState.cursorWidth = cursor.width;
+		updateState.cursorHeight = cursor.height;
+
+		renderbuffer = source->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PUBLIC);
+
+		if(!topLeftOrigin)
+		{
+			// Bottom-left origin: start at the last row and walk upwards using the negative stride.
+			renderbuffer = (byte*)renderbuffer + (height - 1) * pitch;
+		}
+
+		// Top-left corner of the cursor image: its position corrected for the hotspot.
+		cursor.x = cursor.positionX - cursor.hotspotX;
+		cursor.y = cursor.positionY - cursor.hotspotY;
+
+		if(!ASYNCHRONOUS_BLIT)
+		{
+			copyLocked();
+		}
+		else
+		{
+			// Hand the blit to the worker thread and block until it reports completion.
+			blitEvent.signal();
+			syncEvent.wait();
+		}
+
+		source->unlockInternal();
+		unlock();
+
+		profiler.nextFrame();   // Assumes every copy() is a full frame
+	}
+
+	void FrameBuffer::copyLocked()
+	{
+		const bool stale = (memcmp(&updateState, &blitState, sizeof(BlitState)) != 0);   // Byte-wise change in the requested state.
+		if(stale)
+		{
+			delete blitRoutine;
+			blitState = updateState;
+			blitRoutine = copyRoutine(blitState);
+			blitFunction = (void(*)(void*, void*, Cursor*))blitRoutine->getEntry();
+		}
+
+		blitFunction(framebuffer, renderbuffer, &cursor);
+	}
+
+	Routine *FrameBuffer::copyRoutine(const BlitState &state)   // Generates a Reactor routine that copies/converts one frame and then blends the cursor.
+	{
+		const int width = state.width;
+		const int height = state.height;
+		const int dBytes = Surface::bytes(state.destFormat);
+		const int dStride = state.destStride;
+		const int sBytes = Surface::bytes(state.sourceFormat);
+		const int sStride = state.sourceStride;
+
+		Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)> function;   // Arguments: destination, source, Cursor structure.
+		{
+			Pointer<Byte> dst(function.Arg<0>());
+			Pointer<Byte> src(function.Arg<1>());
+			Pointer<Byte> cursor(function.Arg<2>());
+
+			For(Int y = 0, y < height, y++)
+			{
+				Pointer<Byte> d = dst + y * dStride;
+				Pointer<Byte> s = src + y * sStride;
+
+				Int x0 = 0;
+
+				switch(state.destFormat)
+				{
+				case FORMAT_X8R8G8B8:
+				case FORMAT_A8R8G8B8:
+					{
+						Int x = x0;
+
+						switch(state.sourceFormat)   // Vectorized part of the row; the scalar loop below handles the remainder.
+						{
+						case FORMAT_X8R8G8B8:
+						case FORMAT_A8R8G8B8:
+							For(, x < width - 3, x += 4)
+							{
+								*Pointer<Int4>(d, 1) = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+								s += 4 * sBytes;
+								d += 4 * dBytes;
+							}
+							break;
+						case FORMAT_X8B8G8R8:
+						case FORMAT_A8B8G8R8:
+							For(, x < width - 3, x += 4)
+							{
+								Int4 bgra = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+								*Pointer<Int4>(d, 1) = ((bgra & Int4(0x00FF0000)) >> 16) |
+								                       ((bgra & Int4(0x000000FF)) << 16) |
+								                       (bgra & Int4(0xFF00FF00));
+
+								s += 4 * sBytes;
+								d += 4 * dBytes;
+							}
+							break;
+						case FORMAT_A16B16G16R16:
+							For(, x < width - 1, x += 2)
+							{
+								Short4 c0 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 0), 0xC6)) >> 8;
+								Short4 c1 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 8), 0xC6)) >> 8;
+
+								*Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
+
+								s += 2 * sBytes;
+								d += 2 * dBytes;
+							}
+							break;
+						case FORMAT_R5G6B5:
+							For(, x < width - 3, x += 4)
+							{
+								Int4 rgb = Int4(*Pointer<Short4>(s));
+
+								*Pointer<Int4>(d) = (((rgb & Int4(0xF800)) << 8) | ((rgb & Int4(0xE01F)) << 3)) |
+								                    (((rgb & Int4(0x07E0)) << 5) | ((rgb & Int4(0x0600)) >> 1)) |
+								                    (((rgb & Int4(0x001C)) >> 2) | Int4(0xFF000000));
+
+								s += 4 * sBytes;
+								d += 4 * dBytes;
+							}
+							break;
+						default:
+							ASSERT(false);
+							break;
+						}
+
+						For(, x < width, x++)
+						{
+							switch(state.sourceFormat)
+							{
+							case FORMAT_X8R8G8B8:
+							case FORMAT_A8R8G8B8:
+								*Pointer<Int>(d) = *Pointer<Int>(s);
+								break;
+							case FORMAT_X8B8G8R8:
+							case FORMAT_A8B8G8R8:
+								{
+									Int rgba = *Pointer<Int>(s);
+
+									*Pointer<Int>(d) = ((rgba & Int(0x00FF0000)) >> 16) |
+									                   ((rgba & Int(0x000000FF)) << 16) |
+									                   (rgba & Int(0xFF00FF00));
+								}
+								break;
+							case FORMAT_A16B16G16R16:
+								{
+									Short4 c = As<UShort4>(Swizzle(*Pointer<Short4>(s), 0xC6)) >> 8;
+
+									*Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
+								}
+								break;
+							case FORMAT_R5G6B5:
+								{
+									Int rgb = Int(*Pointer<Short>(s));
+
+									*Pointer<Int>(d) = 0xFF000000 |
+									                   ((rgb & 0xF800) << 8) | ((rgb & 0xE01F) << 3) |
+								                       ((rgb & 0x07E0) << 5) | ((rgb & 0x0600) >> 1) |
+								                       ((rgb & 0x001C) >> 2);
+								}
+								break;
+							default:
+								ASSERT(false);
+								break;
+							}
+
+							s += sBytes;
+							d += dBytes;
+						}
+					}
+					break;
+				case FORMAT_X8B8G8R8:
+				case FORMAT_A8B8G8R8:
+				case FORMAT_SRGB8_X8:
+				case FORMAT_SRGB8_A8:
+					{
+						Int x = x0;
+
+						switch(state.sourceFormat)   // Vectorized part of the row; the scalar loop below handles the remainder.
+						{
+						case FORMAT_X8B8G8R8:
+						case FORMAT_A8B8G8R8:
+							For(, x < width - 3, x += 4)
+							{
+								*Pointer<Int4>(d, 1) = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+								s += 4 * sBytes;
+								d += 4 * dBytes;
+							}
+							break;
+						case FORMAT_X8R8G8B8:
+						case FORMAT_A8R8G8B8:
+							For(, x < width - 3, x += 4)
+							{
+								Int4 bgra = *Pointer<Int4>(s, sStride % 16 ? 1 : 16);
+
+								*Pointer<Int4>(d, 1) = ((bgra & Int4(0x00FF0000)) >> 16) |
+								                       ((bgra & Int4(0x000000FF)) << 16) |
+								                       (bgra & Int4(0xFF00FF00));
+
+								s += 4 * sBytes;
+								d += 4 * dBytes;
+							}
+							break;
+						case FORMAT_A16B16G16R16:
+							For(, x < width - 1, x += 2)
+							{
+								Short4 c0 = *Pointer<UShort4>(s + 0) >> 8;
+								Short4 c1 = *Pointer<UShort4>(s + 8) >> 8;
+
+								*Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
+
+								s += 2 * sBytes;
+								d += 2 * dBytes;
+							}
+							break;
+						case FORMAT_R5G6B5:
+							For(, x < width - 3, x += 4)
+							{
+								Int4 rgb = Int4(*Pointer<Short4>(s));
+
+								*Pointer<Int4>(d) = Int4(0xFF000000) |
+                                                    (((rgb & Int4(0x001F)) << 19) | ((rgb & Int4(0x001C)) << 14)) |
+								                    (((rgb & Int4(0x07E0)) << 5) | ((rgb & Int4(0x0600)) >> 1)) |
+								                    (((rgb & Int4(0xF800)) >> 8) | ((rgb & Int4(0xE000)) >> 13));
+
+								s += 4 * sBytes;
+								d += 4 * dBytes;
+							}
+							break;
+						default:
+							ASSERT(false);
+							break;
+						}
+
+						For(, x < width, x++)
+						{
+							switch(state.sourceFormat)
+							{
+							case FORMAT_X8B8G8R8:
+							case FORMAT_A8B8G8R8:
+								*Pointer<Int>(d) = *Pointer<Int>(s);
+								break;
+							case FORMAT_X8R8G8B8:
+							case FORMAT_A8R8G8B8:
+								{
+									Int bgra = *Pointer<Int>(s);
+									*Pointer<Int>(d) = ((bgra & Int(0x00FF0000)) >> 16) |
+									                   ((bgra & Int(0x000000FF)) << 16) |
+									                   (bgra & Int(0xFF00FF00));
+								}
+								break;
+							case FORMAT_A16B16G16R16:
+								{
+									Short4 c = *Pointer<UShort4>(s) >> 8;
+
+									*Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
+								}
+								break;
+							case FORMAT_R5G6B5:
+								{
+									Int rgb = Int(*Pointer<Short>(s));
+
+									*Pointer<Int>(d) = 0xFF000000 |
+									                   ((rgb & 0x001F) << 19) | ((rgb & 0x001C) << 14) |
+								                       ((rgb & 0x07E0) << 5) | ((rgb & 0x0600) >> 1) |
+								                       ((rgb & 0xF800) >> 8) | ((rgb & 0xE000) >> 13);
+								}
+								break;
+							default:
+								ASSERT(false);
+								break;
+							}
+
+							s += sBytes;
+							d += dBytes;
+						}
+					}
+					break;
+				case FORMAT_R8G8B8:
+					{
+						For(Int x = x0, x < width, x++)
+						{
+							switch(state.sourceFormat)
+							{
+							case FORMAT_X8R8G8B8:
+							case FORMAT_A8R8G8B8:
+								*Pointer<Byte>(d + 0) = *Pointer<Byte>(s + 0);
+								*Pointer<Byte>(d + 1) = *Pointer<Byte>(s + 1);
+								*Pointer<Byte>(d + 2) = *Pointer<Byte>(s + 2);
+								break;
+							case FORMAT_X8B8G8R8:
+							case FORMAT_A8B8G8R8:
+								*Pointer<Byte>(d + 0) = *Pointer<Byte>(s + 2);
+								*Pointer<Byte>(d + 1) = *Pointer<Byte>(s + 1);
+								*Pointer<Byte>(d + 2) = *Pointer<Byte>(s + 0);
+								break;
+							case FORMAT_A16B16G16R16:
+								*Pointer<Byte>(d + 0) = *Pointer<Byte>(s + 5);
+								*Pointer<Byte>(d + 1) = *Pointer<Byte>(s + 3);
+								*Pointer<Byte>(d + 2) = *Pointer<Byte>(s + 1);
+								break;
+							case FORMAT_R5G6B5:
+								{
+									Int rgb = Int(*Pointer<Short>(s));
+									// Each channel is expanded to 8 bits in the low byte, because Byte() keeps only bits 0-7.
+									*Pointer<Byte>(d + 0) = Byte(((rgb & 0x001F) << 3) | ((rgb & 0x001C) >> 2));    // Blue
+									*Pointer<Byte>(d + 1) = Byte(((rgb & 0x07E0) >> 3) | ((rgb & 0x0600) >> 9));    // Green
+									*Pointer<Byte>(d + 2) = Byte(((rgb & 0xF800) >> 8) | ((rgb & 0xE000) >> 13));   // Red
+								}
+								break;
+							default:
+								ASSERT(false);
+								break;
+							}
+
+							s += sBytes;
+							d += dBytes;
+						}
+					}
+					break;
+				case FORMAT_R5G6B5:
+					{
+						For(Int x = x0, x < width, x++)
+						{
+							switch(state.sourceFormat)
+							{
+							case FORMAT_X8R8G8B8:
+							case FORMAT_A8R8G8B8:
+								{
+									Int c = *Pointer<Int>(s);
+
+									*Pointer<Short>(d) = Short((c & 0x00F80000) >> 8 |
+									                           (c & 0x0000FC00) >> 5 |
+									                           (c & 0x000000F8) >> 3);
+								}
+								break;
+							case FORMAT_X8B8G8R8:
+							case FORMAT_A8B8G8R8:
+								{
+									Int c = *Pointer<Int>(s);
+
+									*Pointer<Short>(d) = Short((c & 0x00F80000) >> 19 |
+									                           (c & 0x0000FC00) >> 5 |
+									                           (c & 0x000000F8) << 8);
+								}
+								break;
+							case FORMAT_A16B16G16R16:
+								{
+									Short4 cc = *Pointer<UShort4>(s) >> 8;
+									Int c = Int(As<Int2>(PackUnsigned(cc, cc)));
+
+									*Pointer<Short>(d) = Short((c & 0x00F80000) >> 19 |
+									                           (c & 0x0000FC00) >> 5 |
+									                           (c & 0x000000F8) << 8);
+								}
+								break;
+							case FORMAT_R5G6B5:
+								*Pointer<Short>(d) = *Pointer<Short>(s);
+								break;
+							default:
+								ASSERT(false);
+								break;
+							}
+
+							s += sBytes;
+							d += dBytes;
+						}
+					}
+					break;
+				default:
+					ASSERT(false);
+					break;
+				}
+			}
+
+			if(state.cursorWidth > 0 && state.cursorHeight > 0)   // Cursor pass is only generated when a cursor image is set.
+			{
+				Int x0 = *Pointer<Int>(cursor + OFFSET(Cursor,x));
+				Int y0 = *Pointer<Int>(cursor + OFFSET(Cursor,y));
+
+				For(Int y1 = 0, y1 < state.cursorHeight, y1++)
+				{
+					Int y = y0 + y1;
+
+					If(y >= 0 && y < height)
+					{
+						Pointer<Byte> d = dst + y * dStride + x0 * dBytes;
+						Pointer<Byte> s = src + y * sStride + x0 * sBytes;
+						Pointer<Byte> c = *Pointer<Pointer<Byte>>(cursor + OFFSET(Cursor,image)) + y1 * state.cursorWidth * 4;
+
+						For(Int x1 = 0, x1 < state.cursorWidth, x1++)
+						{
+							Int x = x0 + x1;
+
+							If(x >= 0 && x < width)
+							{
+								blend(state, d, s, c);
+							}
+
+							c += 4;
+							s += sBytes;
+							d += dBytes;
+						}
+					}
+				}
+			}
+		}
+
+		return function(L"FrameBuffer");
+	}
+
+	void FrameBuffer::blend(const BlitState &state, const Pointer<Byte> &d, const Pointer<Byte> &s, const Pointer<Byte> &c)   // Emits code blending cursor pixel c over source pixel s into destination pixel d.
+	{
+		Short4 c1;   // Cursor color; holds the blended result at the end.
+		Short4 c2;   // Source (render target) color.
+
+		c1 = Unpack(*Pointer<Byte4>(c));   // The cursor image is read as 4 bytes per pixel.
+
+		switch(state.sourceFormat)   // Load the source pixel; the Swizzle(…, 0xC6) cases presumably swap red and blue - TODO confirm.
+		{
+		case FORMAT_X8R8G8B8:
+		case FORMAT_A8R8G8B8:
+			c2 = Unpack(*Pointer<Byte4>(s));
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8B8G8R8:
+			c2 = Swizzle(Unpack(*Pointer<Byte4>(s)), 0xC6);
+			break;
+		case FORMAT_A16B16G16R16:
+			c2 = Swizzle(*Pointer<Short4>(s), 0xC6);
+			break;
+		case FORMAT_R5G6B5:
+			{
+				Int rgb(*Pointer<Short>(s));
+				rgb = 0xFF000000 |   // Same 565 to 8888 expansion as in copyRoutine().
+				      ((rgb & 0xF800) << 8) | ((rgb & 0xE01F) << 3) |
+				      ((rgb & 0x07E0) << 5) | ((rgb & 0x0600) >> 1) |
+				      ((rgb & 0x001C) >> 2);
+				c2 = Unpack(As<Byte4>(rgb));
+			}
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		c1 = As<Short4>(As<UShort4>(c1) >> 9);   // Unsigned shift; presumably reduces 16-bit channels to 7 bits - TODO confirm Unpack's output range.
+		c2 = As<Short4>(As<UShort4>(c2) >> 9);
+
+		Short4 alpha = Swizzle(c1, 0xFF) & Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0x0000);   // Fourth lane is masked to zero.
+
+		c1 = (c1 - c2) * alpha;   // The next three statements compute c2 + (c1 - c2) * alpha / 128.
+		c1 = c1 >> 7;
+		c1 = c1 + c2;
+		c1 = c1 + c1;   // Doubles the result before packing.
+
+		switch(state.destFormat)   // Store in the destination's layout.
+		{
+		case FORMAT_X8R8G8B8:
+		case FORMAT_A8R8G8B8:
+			*Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
+			break;
+		case FORMAT_X8B8G8R8:
+		case FORMAT_A8B8G8R8:
+		case FORMAT_SRGB8_X8:
+		case FORMAT_SRGB8_A8:
+			{
+				c1 = Swizzle(c1, 0xC6);
+
+				*Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
+			}
+			break;
+		case FORMAT_R8G8B8:
+			{
+				Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
+
+				*Pointer<Byte>(d + 0) = Byte(c >> 0);
+				*Pointer<Byte>(d + 1) = Byte(c >> 8);
+				*Pointer<Byte>(d + 2) = Byte(c >> 16);
+			}
+			break;
+		case FORMAT_R5G6B5:
+			{
+				Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
+
+				*Pointer<Short>(d) = Short((c & 0x00F80000) >> 8 |
+				                           (c & 0x0000FC00) >> 5 |
+				                           (c & 0x000000F8) >> 3);
+			}
+			break;
+		default:
+			ASSERT(false);
+			break;
+		}
+	}
+
+	void FrameBuffer::threadFunction(void *parameters)
+	{
+		// The creator passes the address of a FrameBuffer pointer, not the object itself.
+		FrameBuffer *self = *static_cast<FrameBuffer**>(parameters);
+		while(!self->terminate)
+		{
+			self->blitEvent.wait();
+
+			// The destructor also signals blitEvent, purely to wake this thread up for exit.
+			if(!self->terminate)
+			{
+				self->copyLocked();
+				self->syncEvent.signal();
+			}
+		}
+	}
+}
diff --git a/src/WSI/FrameBuffer.hpp b/src/WSI/FrameBuffer.hpp
new file mode 100644
index 0000000..dd539e1
--- /dev/null
+++ b/src/WSI/FrameBuffer.hpp
@@ -0,0 +1,106 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef	sw_FrameBuffer_hpp
+#define	sw_FrameBuffer_hpp
+
+#include "Reactor/Reactor.hpp"
+#include "Renderer/Surface.hpp"
+#include "Common/Thread.hpp"
+
+namespace sw
+{
+	class Surface;
+
+	struct BlitState   // Parameters a generated blit routine is specialized for; FrameBuffer::copyLocked() compares it with memcmp.
+	{
+		int width;
+		int height;
+		Format destFormat;
+		Format sourceFormat;
+		int destStride;     // Bytes between destination rows.
+		int sourceStride;   // Bytes between source rows; FrameBuffer::copy() negates it for a bottom-left origin.
+		int cursorWidth;    // A cursor width or height of 0 disables cursor blending in copyRoutine().
+		int cursorHeight;
+	};
+
+	class [[clang::lto_visibility_public]] FrameBuffer   // Abstract base for presenting a rendered Surface into a native window buffer.
+	{
+	public:
+		FrameBuffer(int width, int height, bool fullscreen, bool topLeftOrigin);
+
+		virtual ~FrameBuffer() = 0;   // Pure virtual, but still defined in FrameBuffer.cpp.
+
+		virtual void flip(sw::Surface *source) = 0;
+		virtual void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) = 0;
+
+		virtual void *lock() = 0;    // copy() treats a null return as failure and skips the frame.
+		virtual void unlock() = 0;   // Called by copy() once the blit has finished.
+
+		static void setCursorImage(sw::Surface *cursor);   // The cursor state is static, i.e. shared by all FrameBuffers.
+		static void setCursorOrigin(int x0, int y0);
+		static void setCursorPosition(int x, int y);
+
+		static Routine *copyRoutine(const BlitState &state);   // Builds the blit routine for the given state.
+
+	protected:
+		void copy(sw::Surface *source);   // Locks, blits the surface into framebuffer, then unlocks.
+
+		bool windowed;
+
+		void *framebuffer;   // Native window buffer.
+		int width;
+		int height;
+		int stride;          // Passed to the blit routine as the destination stride.
+		Format format;       // Passed to the blit routine as the destination format.
+
+	private:
+		void copyLocked();   // Regenerates the routine if the state changed, then runs it.
+
+		static void threadFunction(void *parameters);   // Blit thread entry; parameters is a FrameBuffer**.
+
+		void *renderbuffer;   // Render target buffer.
+
+		struct Cursor   // Generated code reads image, x and y through OFFSET().
+		{
+			void *image;
+			int x;   // Position minus hotspot; computed in copy().
+			int y;
+			int width;
+			int height;
+			int hotspotX;
+			int hotspotY;
+			int positionX;
+			int positionY;
+		};
+
+		static Cursor cursor;
+
+		void (*blitFunction)(void *dst, void *src, Cursor *cursor);   // Entry point of blitRoutine.
+		Routine *blitRoutine;
+		BlitState blitState;     // State of the current blitRoutine.
+		BlitState updateState;   // State of the routine to be generated.
+
+		static void blend(const BlitState &state, const Pointer<Byte> &d, const Pointer<Byte> &s, const Pointer<Byte> &c);
+
+		Thread *blitThread;       // Only created when ASYNCHRONOUS_BLIT is enabled in FrameBuffer.cpp.
+		Event syncEvent;          // Signaled by the blit thread when a blit has completed.
+		Event blitEvent;          // Signaled to request a blit, and by the destructor to wake the thread.
+		volatile bool terminate;  // NOTE(review): volatile alone is not a synchronization primitive - confirm before enabling ASYNCHRONOUS_BLIT.
+
+		static bool topLeftOrigin;   // Static: overwritten by every constructed FrameBuffer.
+	};
+}
+
+#endif	 //	sw_FrameBuffer_hpp
diff --git a/src/WSI/FrameBufferAndroid.cpp b/src/WSI/FrameBufferAndroid.cpp
new file mode 100644
index 0000000..0ae5f09
--- /dev/null
+++ b/src/WSI/FrameBufferAndroid.cpp
@@ -0,0 +1,145 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferAndroid.hpp"
+
+#include "Common/GrallocAndroid.hpp"
+
+#include <system/window.h>
+
+namespace sw
+{
+	inline int dequeueBuffer(ANativeWindow* window, ANativeWindowBuffer** buffer)
+	{
+		#if ANDROID_PLATFORM_SDK_VERSION <= 16
+			return window->dequeueBuffer(window, buffer);   // SDK 16 and older: call the window hook directly.
+		#else
+			return native_window_dequeue_buffer_and_wait(window, buffer);
+		#endif
+	}
+
+	inline int queueBuffer(ANativeWindow* window, ANativeWindowBuffer* buffer, int fenceFd)
+	{
+		#if ANDROID_PLATFORM_SDK_VERSION <= 16
+			return window->queueBuffer(window, buffer);   // SDK 16 and older: the hook takes no fence, so fenceFd is unused.
+		#else
+			return window->queueBuffer(window, buffer, fenceFd);
+		#endif
+	}
+
+	inline int cancelBuffer(ANativeWindow* window, ANativeWindowBuffer* buffer, int fenceFd)
+	{
+		#if ANDROID_PLATFORM_SDK_VERSION <= 16
+			return window->cancelBuffer(window, buffer);   // SDK 16 and older: the hook takes no fence, so fenceFd is unused.
+		#else
+			return window->cancelBuffer(window, buffer, fenceFd);
+		#endif
+	}
+
+	FrameBufferAndroid::FrameBufferAndroid(ANativeWindow* window, int width, int height)
+		: FrameBuffer(width, height, false, false),   // fullscreen = false, topLeftOrigin = false.
+		  nativeWindow(window), buffer(nullptr)
+	{
+		nativeWindow->common.incRef(&nativeWindow->common);   // Reference released again in the destructor.
+		native_window_set_usage(nativeWindow, GRALLOC_USAGE_SW_READ_OFTEN | GRALLOC_USAGE_SW_WRITE_OFTEN);
+	}
+
+	FrameBufferAndroid::~FrameBufferAndroid()
+	{
+		nativeWindow->common.decRef(&nativeWindow->common);   // Balances the incRef() made in the constructor.
+	}
+
+	void FrameBufferAndroid::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		copy(source);   // sourceRect and destRect are not used; the whole surface is presented.
+
+		if(buffer)
+		{
+			if(framebuffer)   // copy() gave up while the buffer was still locked (lock() found it too small).
+			{
+				framebuffer = nullptr;
+				unlock();
+			}
+			queueBuffer(nativeWindow, buffer, -1);
+			buffer = nullptr;   // Ownership went back to the window; prevents queueing a stale buffer if the next dequeue fails.
+		}
+	}
+
+	void *FrameBufferAndroid::lock()   // Dequeues and maps the next window buffer; returns nullptr on any failure.
+	{
+		if(dequeueBuffer(nativeWindow, &buffer) != 0)
+		{
+			return nullptr;
+		}
+
+		if(GrallocModule::getInstance()->lock(buffer->handle,
+		                 GRALLOC_USAGE_SW_READ_OFTEN | GRALLOC_USAGE_SW_WRITE_OFTEN,
+		                 0, 0, buffer->width, buffer->height, &framebuffer) != 0)
+		{
+			TRACE("%s failed to lock buffer %p", __FUNCTION__, buffer);
+			return nullptr;   // The dequeued buffer stays in 'buffer'; blit() still queues it.
+		}
+
+		if((buffer->width < width) || (buffer->height < height))
+		{
+			TRACE("lock failed: buffer of %dx%d too small for window of %dx%d",
+			      buffer->width, buffer->height, width, height);
+			return nullptr;   // The gralloc lock is still held here; blit() sees framebuffer set and calls unlock().
+		}
+
+		switch(buffer->format)   // Map the HAL pixel format onto the matching sw::Format.
+		{
+		case HAL_PIXEL_FORMAT_RGB_565:   format = FORMAT_R5G6B5; break;
+		case HAL_PIXEL_FORMAT_RGBA_8888: format = FORMAT_A8B8G8R8; break;
+#if ANDROID_PLATFORM_SDK_VERSION > 16
+		case HAL_PIXEL_FORMAT_IMPLEMENTATION_DEFINED: format = FORMAT_X8B8G8R8; break;
+#endif
+		case HAL_PIXEL_FORMAT_RGBX_8888: format = FORMAT_X8B8G8R8; break;
+		case HAL_PIXEL_FORMAT_BGRA_8888: format = FORMAT_A8R8G8B8; break;
+		case HAL_PIXEL_FORMAT_RGB_888:
+			// Frame buffers are expected to have 16-bit or 32-bit colors, not 24-bit.
+			TRACE("Unsupported frame buffer format RGB_888"); ASSERT(false);
+			format = FORMAT_R8G8B8;   // Wrong component order.
+			break;
+		default:
+			TRACE("Unsupported frame buffer format %d", buffer->format); ASSERT(false);
+			format = FORMAT_NULL;
+			break;
+		}
+
+		stride = buffer->stride * Surface::bytes(format);   // Presumably buffer->stride is in pixels - TODO confirm.
+		return framebuffer;
+	}
+
+	void FrameBufferAndroid::unlock()   // Releases the gralloc mapping obtained by lock(); the buffer itself is queued by blit().
+	{
+		if(!buffer)
+		{
+			TRACE("%s: badness unlock with no active buffer", __FUNCTION__);
+			return;
+		}
+
+		framebuffer = nullptr;   // The mapped address must not be used after this point.
+
+		if(GrallocModule::getInstance()->unlock(buffer->handle) != 0)
+		{
+			TRACE("%s: badness unlock failed", __FUNCTION__);   // Failure is only traced, not reported to the caller.
+		}
+	}
+}
+
+sw::FrameBuffer *createFrameBuffer(void *display, ANativeWindow* window, int width, int height)   // Factory; 'display' is not used here.
+{
+	return new sw::FrameBufferAndroid(window, width, height);   // Returned with plain new: the caller owns the object.
+}
diff --git a/src/WSI/FrameBufferAndroid.hpp b/src/WSI/FrameBufferAndroid.hpp
new file mode 100644
index 0000000..b71c32b
--- /dev/null
+++ b/src/WSI/FrameBufferAndroid.hpp
@@ -0,0 +1,47 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferAndroid_hpp
+#define sw_FrameBufferAndroid_hpp
+
+#include "Main/FrameBuffer.hpp"
+#include "Common/Debug.hpp"
+
+struct ANativeWindow;
+struct ANativeWindowBuffer;
+
+namespace sw
+{
+	class FrameBufferAndroid : public FrameBuffer   // FrameBuffer that presents through an ANativeWindow.
+	{
+	public:
+		FrameBufferAndroid(ANativeWindow *window, int width, int height);
+
+		~FrameBufferAndroid() override;
+
+		void flip(sw::Surface *source) override {blit(source, nullptr, nullptr);};   // A flip is a blit without rectangles.
+		void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void *lock() override;     // Dequeues and maps a window buffer.
+		void unlock() override;    // Unmaps the current window buffer.
+
+		bool setSwapRectangle(int l, int t, int w, int h);   // NOTE(review): no definition in FrameBufferAndroid.cpp - confirm it is unused.
+
+	private:
+		ANativeWindow *nativeWindow;   // Reference is held between constructor and destructor.
+		ANativeWindowBuffer *buffer;   // Buffer obtained by the most recent lock().
+	};
+}
+
+#endif   // sw_FrameBufferAndroid_hpp
diff --git a/src/WSI/FrameBufferDD.cpp b/src/WSI/FrameBufferDD.cpp
new file mode 100644
index 0000000..46ed89f
--- /dev/null
+++ b/src/WSI/FrameBufferDD.cpp
@@ -0,0 +1,510 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferDD.hpp"
+
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	extern bool forceWindowed;
+
+	GUID secondaryDisplay = {0};
+
+	int __stdcall enumDisplayCallback(GUID* guid, char *driverDescription, char *driverName, void *context, HMONITOR monitor)
+	{
+		// Record the GUID of the device whose driver name identifies the second display.
+		const bool isSecondDisplay = (strcmp(driverName, "\\\\.\\DISPLAY2") == 0);
+
+		if(isSecondDisplay) secondaryDisplay = *guid;
+
+		return 1;   // Non-zero return value, regardless of whether a match was found
+	}
+
+	FrameBufferDD::FrameBufferDD(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin) : FrameBufferWin(windowHandle, width, height, fullscreen, topLeftOrigin)
+	{
+		backBuffer = nullptr;   // No DirectDraw objects exist yet
+		frontBuffer = nullptr;
+		directDraw = nullptr;
+		framebuffer = nullptr;
+
+		// DirectDraw is loaded at run time; look up the entry points this class calls.
+		ddraw = LoadLibrary("ddraw.dll");
+		DirectDrawEnumerateExA = (DIRECTDRAWENUMERATEEXA)GetProcAddress(ddraw, "DirectDrawEnumerateExA");
+		DirectDrawCreate = (DIRECTDRAWCREATE)GetProcAddress(ddraw, "DirectDrawCreate");
+
+		if(windowed)
+		{
+			initWindowed();
+		}
+		else
+		{
+			initFullscreen();
+		}
+	}
+
+	FrameBufferDD::~FrameBufferDD()
+	{
+		releaseAll();
+		// Unload ddraw.dll only after every DirectDraw object has been released.
+		FreeLibrary(ddraw);
+	}
+
+	void FrameBufferDD::createSurfaces()
+	{
+		if(backBuffer)   // Drop surfaces left over from a previous call
+		{
+			backBuffer->Release();
+			backBuffer = 0;
+		}
+
+		if(frontBuffer)
+		{
+			frontBuffer->Release();
+			frontBuffer = 0;
+		}
+
+		if(!windowed)   // Fullscreen: flippable primary surface with one attached back buffer
+		{
+			DDSURFACEDESC surfaceDescription = {0};
+			surfaceDescription.dwSize = sizeof(surfaceDescription);
+			surfaceDescription.dwFlags = DDSD_CAPS | DDSD_BACKBUFFERCOUNT;
+			surfaceDescription.ddsCaps.dwCaps = DDSCAPS_PRIMARYSURFACE | DDSCAPS_FLIP | DDSCAPS_COMPLEX;
+			surfaceDescription.dwBackBufferCount = 1;
+			directDraw->CreateSurface(&surfaceDescription, &frontBuffer, 0);
+
+			if(frontBuffer)
+			{
+				DDSCAPS surfaceCapabilties = {0};
+				surfaceCapabilties.dwCaps = DDSCAPS_BACKBUFFER;
+				frontBuffer->GetAttachedSurface(&surfaceCapabilties, &backBuffer);
+				if(backBuffer) backBuffer->AddRef();   // Still null here if the lookup failed
+			}
+		}
+		else   // Windowed: primary surface plus an off-screen back buffer, clipped to the window
+		{
+			IDirectDrawClipper *clipper = nullptr;
+			DDSURFACEDESC ddsd = {0};
+			ddsd.dwSize = sizeof(ddsd);
+			ddsd.dwFlags = DDSD_CAPS;
+			ddsd.ddsCaps.dwCaps	= DDSCAPS_PRIMARYSURFACE;
+
+			long result = directDraw->CreateSurface(&ddsd, &frontBuffer, 0);
+			directDraw->GetDisplayMode(&ddsd);
+
+			switch(ddsd.ddpfPixelFormat.dwRGBBitCount)   // Frame buffer format follows the display bit depth
+			{
+			case 32: format = FORMAT_X8R8G8B8; break;
+			case 24: format = FORMAT_R8G8B8;   break;
+			case 16: format = FORMAT_R5G6B5;   break;
+			default: format = FORMAT_NULL;     break;
+			}
+
+			if((result != DD_OK && result != DDERR_PRIMARYSURFACEALREADYEXISTS) || (format == FORMAT_NULL))
+			{
+				assert(!"Failed to initialize graphics: Incompatible display mode.");
+			}
+			else
+			{
+				ddsd.dwFlags = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT;
+				ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN;
+				ddsd.dwWidth = width;
+				ddsd.dwHeight = height;
+
+				directDraw->CreateSurface(&ddsd, &backBuffer, 0);
+				directDraw->CreateClipper(0, &clipper, 0);
+				if(clipper)   // Left null when CreateClipper fails
+				{
+					clipper->SetHWnd(0, windowHandle);
+					if(frontBuffer) frontBuffer->SetClipper(clipper);   // Can be null if the primary surface already existed
+					clipper->Release();
+				}
+			}
+		}
+	}
+
+	bool FrameBufferDD::readySurfaces()
+	{
+		if(!(frontBuffer && backBuffer))   // Create the surfaces when either one is missing
+		{
+			createSurfaces();
+		}
+
+		if(!frontBuffer || !backBuffer)
+		{
+			return false;   // Creation did not produce both surfaces
+		}
+
+		if(frontBuffer->IsLost() || backBuffer->IsLost())
+		{
+			restoreSurfaces();   // Can end up recreating the surfaces
+
+			if(!frontBuffer || !backBuffer)
+			{
+				return false;
+			}
+		}
+
+		// Ready only when neither surface reports being lost.
+		return !frontBuffer->IsLost() && !backBuffer->IsLost();
+	}
+
+	void FrameBufferDD::updateClipper(HWND windowOverride)
+	{
+		if(windowed && frontBuffer)   // A clipper is only attached to the windowed-mode front buffer
+		{
+			HWND window = windowOverride ? windowOverride : windowHandle;
+			IDirectDrawClipper *clipper = nullptr;
+			frontBuffer->GetClipper(&clipper);
+
+			if(clipper)   // Left null when GetClipper fails, e.g. no clipper attached
+			{
+				clipper->SetHWnd(0, window);
+				clipper->Release();
+			}
+		}
+	}
+
+	void FrameBufferDD::restoreSurfaces()
+	{
+		const long frontResult = frontBuffer->Restore();   // Both restores are always attempted
+		const long backResult = backBuffer->Restore();
+
+		if(!(frontResult == DD_OK && backResult == DD_OK))   // Surfaces could not be restored; recreate them
+		{
+			createSurfaces();
+		}
+	}
+
+	void FrameBufferDD::initFullscreen()
+	{
+		releaseAll();   // Start over: drop any existing surfaces and DirectDraw object
+
+		if(true)   // Render to primary display
+		{
+			DirectDrawCreate(0, &directDraw, 0);
+		}
+		else   // Render to secondary display
+		{
+			DirectDrawEnumerateEx(&enumDisplayCallback, 0, DDENUM_ATTACHEDSECONDARYDEVICES);
+			DirectDrawCreate(&secondaryDisplay, &directDraw, 0);
+		}
+
+		directDraw->SetCooperativeLevel(windowHandle, DDSCL_EXCLUSIVE | DDSCL_FULLSCREEN);
+
+		long result;
+		// Try 32, then 24, then 16 bits per pixel; 'format' tracks the depth last attempted.
+		do
+		{
+			format = FORMAT_X8R8G8B8;
+			result = directDraw->SetDisplayMode(width, height, 32);
+
+			if(result == DDERR_INVALIDMODE)
+			{
+				format = FORMAT_R8G8B8;
+				result = directDraw->SetDisplayMode(width, height, 24);
+
+				if(result == DDERR_INVALIDMODE)
+				{
+					format = FORMAT_R5G6B5;
+					result = directDraw->SetDisplayMode(width, height, 16);
+
+					if(result == DDERR_INVALIDMODE)
+					{
+						assert(!"Failed to initialize graphics: Display mode not supported.");
+					}
+				}
+			}
+
+			if(result != DD_OK)
+			{
+				Sleep(1);   // NOTE(review): every failure is retried without limit, including after the assert above
+			}
+		}
+		while(result != DD_OK);
+
+		createSurfaces();
+
+		updateBounds(windowHandle);
+	}
+
+	void FrameBufferDD::initWindowed()
+	{
+		releaseAll();   // Start over: drop any existing surfaces and DirectDraw object
+
+		DirectDrawCreate(0, &directDraw, 0);
+		directDraw->SetCooperativeLevel(windowHandle, DDSCL_NORMAL);   // No exclusive access in windowed mode
+
+		createSurfaces();
+
+		updateBounds(windowHandle);
+	}
+
+	void FrameBufferDD::flip(sw::Surface *source)
+	{
+		copy(source);
+
+		if(!readySurfaces())
+		{
+			return;
+		}
+
+		long status;
+
+		// Present the back buffer; repeat while DirectDraw reports that it was still drawing.
+		do
+		{
+			if(!windowed)
+			{
+				status = frontBuffer->Flip(0, DDFLIP_NOVSYNC);
+			}
+			else
+			{
+				status = frontBuffer->Blt(&bounds, backBuffer, 0, DDBLT_WAIT, 0);
+			}
+
+			if(status == DDERR_WASSTILLDRAWING)
+			{
+				Sleep(0);
+			}
+		}
+		while(status == DDERR_WASSTILLDRAWING);
+	}
+
+	void FrameBufferDD::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		copy(source);
+
+		if(!readySurfaces())
+		{
+			return;
+		}
+
+		// Destination corners before translation: the requested rectangle,
+		// or the full frame buffer dimensions when no rectangle is given.
+		const int x0 = destRect ? destRect->x0 : 0;
+		const int y0 = destRect ? destRect->y0 : 0;
+		const int x1 = destRect ? destRect->x1 : width;
+		const int y1 = destRect ? destRect->y1 : height;
+
+		// Offset by the top-left corner of 'bounds' to obtain the rectangle
+		// that is handed to DirectDraw.
+		RECT dRect;
+		dRect.left = bounds.left + x0;
+		dRect.top = bounds.top + y0;
+		dRect.right = bounds.left + x1;
+		dRect.bottom = bounds.top + y1;
+
+		long status;
+
+		// Repeat while DirectDraw reports that it was still drawing.
+		do
+		{
+			status = frontBuffer->Blt(&dRect, backBuffer, (LPRECT)sourceRect, DDBLT_WAIT, 0);
+
+			if(status == DDERR_WASSTILLDRAWING)
+			{
+				Sleep(0);
+			}
+		}
+		while(status == DDERR_WASSTILLDRAWING);
+	}
+
+	void FrameBufferDD::flip(HWND windowOverride, sw::Surface *source)
+	{
+		updateClipper(windowOverride);
+		updateBounds(windowOverride);
+		// Clipper and bounds now follow windowOverride (or the own window when it is null).
+		flip(source);
+	}
+
+	void FrameBufferDD::blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		updateClipper(windowOverride);
+		updateBounds(windowOverride);
+		// Clipper and bounds now follow windowOverride (or the own window when it is null).
+		blit(source, sourceRect, destRect);
+	}
+
+	void FrameBufferDD::screenshot(void *destBuffer)
+	{
+		if(!readySurfaces())
+		{
+			return;
+		}
+
+		DDSURFACEDESC DDSD = {0};   // Fully initialized; Lock() fills in the surface description
+		DDSD.dwSize = sizeof(DDSD);
+
+		long result = frontBuffer->Lock(0, &DDSD, DDLOCK_WAIT, 0);
+
+		if(result == DD_OK)
+		{
+			int width = DDSD.dwWidth;
+			int height = DDSD.dwHeight;
+			int stride = DDSD.lPitch;
+
+			const char *sourceRow = static_cast<const char*>(DDSD.lpSurface);
+			char *destRow = static_cast<char*>(destBuffer);   // Destination rows are written back to back
+			for(int y = 0; y < height; y++)
+			{
+				memcpy(destRow, sourceRow, width * 4);   // FIXME: Assumes 32-bit buffer
+
+				sourceRow += stride;
+				destRow += 4 * width;
+			}
+
+			frontBuffer->Unlock(0);
+		}
+	}
+
+	void FrameBufferDD::setGammaRamp(GammaRamp *gammaRamp, bool calibrate)
+	{
+		if(!frontBuffer)
+		{
+			return;
+		}
+
+		IDirectDrawGammaControl *control = 0;   // Stays null if the front buffer does not provide the interface
+		frontBuffer->QueryInterface(IID_IDirectDrawGammaControl, (void**)&control);
+
+		if(control)
+		{
+			control->SetGammaRamp(calibrate ? DDSGR_CALIBRATE : 0, (DDGAMMARAMP*)gammaRamp);
+			control->Release();
+		}
+	}
+
+	void FrameBufferDD::getGammaRamp(GammaRamp *gammaRamp)
+	{
+		if(!frontBuffer)
+		{
+			return;
+		}
+
+		IDirectDrawGammaControl *control = 0;   // Stays null if the front buffer does not provide the interface
+		frontBuffer->QueryInterface(IID_IDirectDrawGammaControl, (void**)&control);
+
+		if(control)
+		{
+			control->GetGammaRamp(0, (DDGAMMARAMP*)gammaRamp);
+			control->Release();
+		}
+	}
+
+	void *FrameBufferDD::lock()
+	{
+		// A lock that is still outstanding is handed out again unchanged.
+		if(framebuffer)
+		{
+			return framebuffer;
+		}
+
+		if(!readySurfaces())
+		{
+			return nullptr;
+		}
+
+		// Lock the whole back buffer and wait until it becomes available.
+		DDSURFACEDESC DDSD;
+		DDSD.dwSize = sizeof(DDSD);
+
+		if(backBuffer->Lock(0, &DDSD, DDLOCK_WAIT, 0) != DD_OK)
+		{
+			return nullptr;
+		}
+
+		// Adopt the dimensions and pitch reported for the locked surface.
+		width = DDSD.dwWidth;
+		height = DDSD.dwHeight;
+		stride = DDSD.lPitch;
+		framebuffer = DDSD.lpSurface;
+
+		return framebuffer;
+	}
+
+	void FrameBufferDD::unlock()
+	{
+		if(framebuffer && backBuffer)   // Nothing to do unless a lock is outstanding
+		{
+			backBuffer->Unlock(0);
+			framebuffer = nullptr;
+		}
+	}
+
+	void FrameBufferDD::drawText(int x, int y, const char *string, ...)
+	{
+		char buffer[256];
+		va_list arglist;
+
+		va_start(arglist, string);
+		vsnprintf(buffer, sizeof(buffer), string, arglist);   // Bounded: overlong text is truncated instead of overflowing
+		va_end(arglist);
+		buffer[sizeof(buffer) - 1] = '\0';   // Guarantee termination on runtimes that do not on truncation
+		HDC hdc = 0;
+
+		if(!backBuffer || backBuffer->GetDC(&hdc) != DD_OK) return;   // No surface, or no device context to draw with
+
+		SetBkColor(hdc, RGB(0, 0, 255));
+		SetTextColor(hdc, RGB(255, 255, 255));
+
+		TextOut(hdc, x, y, buffer, lstrlen(buffer));
+
+		backBuffer->ReleaseDC(hdc);
+	}
+
+	bool FrameBufferDD::getScanline(bool &inVerticalBlank, unsigned int &scanline)
+	{
+		// Returns false only when the query is unsupported; 'scanline' is written by DirectDraw.
+		switch(directDraw->GetScanLine((unsigned long*)&scanline))
+		{
+		case DD_OK:
+			inVerticalBlank = false;
+			break;
+		case DDERR_VERTICALBLANKINPROGRESS:
+			inVerticalBlank = true;
+			break;
+		case DDERR_UNSUPPORTED:
+			return false;
+		default:
+			ASSERT(false);
+			break;
+		}
+
+		return true;
+	}
+
+	void FrameBufferDD::releaseAll()
+	{
+		unlock();   // End an outstanding back buffer lock, if any, before releasing the surface
+
+		if(backBuffer)
+		{
+			backBuffer->Release();
+			backBuffer = 0;
+		}
+
+		if(frontBuffer)
+		{
+			frontBuffer->Release();
+			frontBuffer = 0;
+		}
+
+		if(directDraw)
+		{
+			directDraw->SetCooperativeLevel(0, DDSCL_NORMAL);   // Switch back to the normal cooperative level first
+			directDraw->Release();
+			directDraw = 0;
+		}
+	}
+}
diff --git a/src/WSI/FrameBufferDD.hpp b/src/WSI/FrameBufferDD.hpp
new file mode 100644
index 0000000..22d76c9
--- /dev/null
+++ b/src/WSI/FrameBufferDD.hpp
@@ -0,0 +1,69 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef	sw_FrameBufferDD_hpp
+#define	sw_FrameBufferDD_hpp
+
+#include "FrameBufferWin.hpp"
+
+#include <ddraw.h>
+
+namespace sw
+{
+	class FrameBufferDD : public FrameBufferWin   // Windows frame buffer presented through DirectDraw
+	{
+	public:
+		FrameBufferDD(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+		~FrameBufferDD() override;
+
+		void flip(sw::Surface *source) override;
+		void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void flip(HWND windowOverride, sw::Surface *source) override;   // Targets windowOverride when it is non-null
+		void blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void *lock() override;
+		void unlock() override;
+
+		void setGammaRamp(GammaRamp *gammaRamp, bool calibrate) override;
+		void getGammaRamp(GammaRamp *gammaRamp) override;
+
+		void screenshot(void *destBuffer) override;   // Copies the front buffer, assuming 32 bits per pixel
+		bool getScanline(bool &inVerticalBlank, unsigned int &scanline) override;
+
+		void drawText(int x, int y, const char *string, ...);   // printf-style text drawn onto the back buffer
+
+	private:
+		void initFullscreen();
+		void initWindowed();
+		void createSurfaces();
+		bool readySurfaces();   // True when both surfaces exist and neither is lost
+		void updateClipper(HWND windowOverride);
+		void restoreSurfaces();
+		void releaseAll();
+
+		HMODULE ddraw;   // Handle of ddraw.dll, loaded in the constructor
+		typedef HRESULT (WINAPI *DIRECTDRAWCREATE)( GUID FAR *lpGUID, LPDIRECTDRAW FAR *lplpDD, IUnknown FAR *pUnkOuter );
+		HRESULT (WINAPI *DirectDrawCreate)( GUID FAR *lpGUID, LPDIRECTDRAW FAR *lplpDD, IUnknown FAR *pUnkOuter );
+		typedef HRESULT (WINAPI *DIRECTDRAWENUMERATEEXA)( LPDDENUMCALLBACKEXA lpCallback, LPVOID lpContext, DWORD dwFlags);
+		HRESULT (WINAPI *DirectDrawEnumerateExA)( LPDDENUMCALLBACKEXA lpCallback, LPVOID lpContext, DWORD dwFlags);
+
+		IDirectDraw *directDraw;
+		IDirectDrawSurface *frontBuffer;   // Primary surface
+		IDirectDrawSurface *backBuffer;   // Surface that lock() exposes and flip()/blit() present
+	};
+}
+
+#endif	 //	sw_FrameBufferDD_hpp
diff --git a/src/WSI/FrameBufferGDI.cpp b/src/WSI/FrameBufferGDI.cpp
new file mode 100644
index 0000000..90a469e
--- /dev/null
+++ b/src/WSI/FrameBufferGDI.cpp
@@ -0,0 +1,162 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferGDI.hpp"
+
+#include "Common/Debug.hpp"
+
+namespace sw
+{
+	extern bool forceWindowed;
+
+	FrameBufferGDI::FrameBufferGDI(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin) : FrameBufferWin(windowHandle, width, height, fullscreen, topLeftOrigin)
+	{
+		if(!windowed)
+		{
+			SetWindowPos(windowHandle, HWND_TOPMOST, 0, 0, width, height, SWP_SHOWWINDOW);
+
+			DEVMODE deviceMode = {0};   // Zeroed so that dmDriverExtra and all unflagged fields are well defined
+			deviceMode.dmSize = sizeof(DEVMODE);
+			deviceMode.dmPelsWidth= width;
+			deviceMode.dmPelsHeight = height;
+			deviceMode.dmFields = DM_PELSWIDTH | DM_PELSHEIGHT;   // Only the resolution is being changed
+
+			ChangeDisplaySettings(&deviceMode, CDS_FULLSCREEN);
+		}
+
+		init(this->windowHandle);
+
+		format = FORMAT_X8R8G8B8;
+	}
+
+	FrameBufferGDI::~FrameBufferGDI()
+	{
+		release();
+
+		if(windowed) return;   // Only fullscreen mode changed the display settings and window placement
+
+		ChangeDisplaySettings(0, 0);
+
+		// Size the window so that its client area matches the frame buffer, and center it on the screen.
+		RECT client;
+		RECT outer;
+		GetClientRect(windowHandle, &client);
+		GetWindowRect(windowHandle, &outer);
+		const int windowWidth = width + (outer.right - outer.left) - (client.right - client.left);
+		const int windowHeight = height + (outer.bottom - outer.top) - (client.bottom - client.top);
+		const int left = GetSystemMetrics(SM_CXSCREEN) / 2 - windowWidth / 2;
+		const int top = GetSystemMetrics(SM_CYSCREEN) / 2 - windowHeight / 2;
+		SetWindowPos(windowHandle, HWND_TOP, left, top, windowWidth, windowHeight, SWP_SHOWWINDOW);
+	}
+
+	void *FrameBufferGDI::lock()
+	{
+		stride = width * 4;   // Matches the 32-bit bitmap created in init()
+
+		return framebuffer;
+	}
+
+	void FrameBufferGDI::unlock()
+	{   // Intentionally empty: lock() acquires nothing that needs to be given back
+	}
+
+	void FrameBufferGDI::flip(sw::Surface *source)
+	{
+		blit(source, nullptr, nullptr);   // No rectangles: blit() falls back to its defaults
+	}
+
+	void FrameBufferGDI::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		copy(source);
+		// Missing rectangles default to the whole bitmap (source) and the extent of 'bounds' (destination).
+		int sourceLeft = sourceRect ? sourceRect->x0 : 0;
+		int sourceTop = sourceRect ? sourceRect->y0 : 0;
+		int sourceWidth = sourceRect ? sourceRect->x1 - sourceRect->x0 : width;
+		int sourceHeight = sourceRect ? sourceRect->y1 - sourceRect->y0 : height;
+		int destLeft = destRect ? destRect->x0 : 0;
+		int destTop = destRect ? destRect->y0 : 0;
+		int destWidth = destRect ? destRect->x1 - destRect->x0 : bounds.right - bounds.left;
+		int destHeight = destRect ? destRect->y1 - destRect->y0 : bounds.bottom - bounds.top;
+		// Scale from the memory bitmap onto the window's device context.
+		StretchBlt(windowContext, destLeft, destTop, destWidth, destHeight, bitmapContext, sourceLeft, sourceTop, sourceWidth, sourceHeight, SRCCOPY);
+	}
+
+	void FrameBufferGDI::flip(HWND windowOverride, sw::Surface *source)
+	{
+		blit(windowOverride, source, nullptr, nullptr);   // No rectangles: blit() falls back to its defaults
+	}
+
+	void FrameBufferGDI::blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		const bool retarget = windowed && windowOverride != 0 && windowOverride != bitmapWindow;
+		if(retarget)   // Rebuild the GDI objects for the newly targeted window
+		{
+			release();
+			init(windowOverride);
+		}
+		blit(source, sourceRect, destRect);
+	}
+
+	void FrameBufferGDI::setGammaRamp(GammaRamp *gammaRamp, bool calibrate)
+	{
+		SetDeviceGammaRamp(windowContext, gammaRamp);   // 'calibrate' is not used here
+	}
+
+	void FrameBufferGDI::getGammaRamp(GammaRamp *gammaRamp)
+	{
+		GetDeviceGammaRamp(windowContext, gammaRamp);   // Result of the call is not checked
+	}
+
+	void FrameBufferGDI::screenshot(void *destBuffer)
+	{
+		UNIMPLEMENTED();   // destBuffer is left untouched
+	}
+
+	bool FrameBufferGDI::getScanline(bool &inVerticalBlank, unsigned int &scanline)
+	{
+		UNIMPLEMENTED();
+		// Neither output argument is written; false mirrors the DirectDraw path's "unsupported" result.
+		return false;
+	}
+
+	void FrameBufferGDI::init(HWND window)
+	{
+		bitmapWindow = window;
+
+		windowContext = GetDC(window);
+		bitmapContext = CreateCompatibleDC(windowContext);
+
+		BITMAPINFO bitmapInfo;
+		memset(&bitmapInfo, 0, sizeof(BITMAPINFO));
+		bitmapInfo.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);   // Size of the header alone, not of BITMAPINFO
+		bitmapInfo.bmiHeader.biBitCount = 32;
+		bitmapInfo.bmiHeader.biPlanes = 1;
+		bitmapInfo.bmiHeader.biHeight = -height;   // Negative height requests a top-down bitmap
+		bitmapInfo.bmiHeader.biWidth = width;
+		bitmapInfo.bmiHeader.biCompression = BI_RGB;
+
+		bitmap = CreateDIBSection(bitmapContext, &bitmapInfo, DIB_RGB_COLORS, &framebuffer, 0, 0);   // 'framebuffer' receives the pixel memory
+		SelectObject(bitmapContext, bitmap);
+
+		updateBounds(window);
+	}
+
+	void FrameBufferGDI::release()
+	{
+		// Destroy the memory DC first, so the bitmap is no longer selected into a DC when it is deleted.
+		DeleteDC(bitmapContext);
+		DeleteObject(bitmap);
+		ReleaseDC(bitmapWindow, windowContext);
+	}
+}
diff --git a/src/WSI/FrameBufferGDI.hpp b/src/WSI/FrameBufferGDI.hpp
new file mode 100644
index 0000000..add2504
--- /dev/null
+++ b/src/WSI/FrameBufferGDI.hpp
@@ -0,0 +1,56 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef	sw_FrameBufferGDI_hpp
+#define	sw_FrameBufferGDI_hpp
+
+#include "FrameBufferWin.hpp"
+
+namespace sw
+{
+	class FrameBufferGDI : public FrameBufferWin   // Windows frame buffer presented through GDI (StretchBlt)
+	{
+	public:
+		FrameBufferGDI(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+		~FrameBufferGDI() override;
+
+		void flip(sw::Surface *source) override;
+		void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void flip(HWND windowOverride, sw::Surface *source) override;
+		void blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void *lock() override;
+		void unlock() override;
+
+		void setGammaRamp(GammaRamp *gammaRamp, bool calibrate) override;
+		void getGammaRamp(GammaRamp *gammaRamp) override;
+
+		void screenshot(void *destBuffer) override;   // Unimplemented for GDI
+		bool getScanline(bool &inVerticalBlank, unsigned int &scanline) override;   // Unimplemented for GDI
+
+	private:
+		void init(HWND bitmapWindow);   // Creates the DCs and the bitmap for the given window
+		void release();   // Destroys what init() created
+
+		HDC windowContext;   // Device context of bitmapWindow
+		HDC bitmapContext;   // Memory DC the bitmap is selected into
+		HWND bitmapWindow;   // Window the current GDI objects were created for
+
+		HBITMAP bitmap;   // DIB section backing 'framebuffer'
+	};
+}
+
+#endif	 //	sw_FrameBufferGDI_hpp
diff --git a/src/WSI/FrameBufferOSX.hpp b/src/WSI/FrameBufferOSX.hpp
new file mode 100644
index 0000000..07f8d63
--- /dev/null
+++ b/src/WSI/FrameBufferOSX.hpp
@@ -0,0 +1,49 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferOSX_hpp
+#define sw_FrameBufferOSX_hpp
+
+#include "Main/FrameBuffer.hpp"
+
+#import <Cocoa/Cocoa.h>
+
+@class CALayer;
+
+namespace sw
+{
+	class FrameBufferOSX : public FrameBuffer   // Frame buffer presented as the contents of a CALayer
+	{
+	public:
+		FrameBufferOSX(CALayer *layer, int width, int height);
+		~FrameBufferOSX() override;
+
+		void flip(sw::Surface *source) override;
+		void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void *lock() override;
+		void unlock() override;
+
+	private:
+		int width;   // NOTE(review): FrameBuffer appears to have width/height members too — confirm the shadowing is intended
+		int height;
+		CALayer *layer;   // Layer whose contents are replaced on every blit
+		uint8_t *buffer;   // Pixel storage, 4 bytes per pixel; allocated and freed by this class
+		CGDataProviderRef provider;   // Wraps 'buffer' for CGImageCreate
+		CGColorSpaceRef colorspace;
+		CGImageRef currentImage;   // Image most recently handed to the layer
+	};
+}
+
+#endif   // sw_FrameBufferOSX_hpp
diff --git a/src/WSI/FrameBufferOSX.mm b/src/WSI/FrameBufferOSX.mm
new file mode 100644
index 0000000..6d58ae7
--- /dev/null
+++ b/src/WSI/FrameBufferOSX.mm
@@ -0,0 +1,103 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferOSX.hpp"
+
+#include "Common/Debug.hpp"
+
+#include <EGL/egl.h>
+#import <QuartzCore/QuartzCore.h>
+
+namespace sw {
+
+	FrameBufferOSX::FrameBufferOSX(CALayer* layer, int width, int height)
+		: FrameBuffer(width, height, false, false), width(width), height(height),
+		  layer(layer), buffer(nullptr), provider(nullptr), currentImage(nullptr)
+	{
+		format = sw::FORMAT_X8B8G8R8;
+		int bufferSize = width * height * 4 * sizeof(uint8_t);   // 4 bytes per pixel
+		buffer = new uint8_t[bufferSize];   // Freed in the destructor
+		provider = CGDataProviderCreateWithData(nullptr, buffer, bufferSize, nullptr);   // No release callback: 'buffer' stays owned here
+		colorspace = CGColorSpaceCreateDeviceRGB();
+	}
+
+	FrameBufferOSX::~FrameBufferOSX()
+	{
+		//[CATransaction begin];
+		//[layer setContents:nullptr];
+		//[CATransaction commit];
+		// NOTE(review): the layer contents are not cleared (code above is disabled) — confirm nothing reads 'buffer' afterwards
+		CGImageRelease(currentImage);
+		CGColorSpaceRelease(colorspace);
+		CGDataProviderRelease(provider);
+		// The pixel storage is freed last, after the Core Graphics objects referring to it.
+		delete[] buffer;
+	}
+
+	void FrameBufferOSX::flip(sw::Surface *source)
+	{
+		blit(source, nullptr, nullptr);   // blit() does not use the rectangles
+	}
+
+	void FrameBufferOSX::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		copy(source);   // sourceRect and destRect are not used by this implementation
+
+		int bytesPerRow = width * 4 * sizeof(uint8_t);
+		CGImageRef image = CGImageCreate(width, height, 8, 32, bytesPerRow, colorspace, kCGBitmapByteOrder32Big, provider, nullptr, false, kCGRenderingIntentDefault);
+		// A new image over the same data provider is set as the layer contents on every call.
+		[CATransaction begin];
+		[layer setContents:(id)image];
+		[CATransaction commit];
+		[CATransaction flush];
+
+		if(currentImage)   // Release the image from the previous call only after the new one is installed
+		{
+			CGImageRelease(currentImage);
+		}
+		currentImage = image;
+	}
+
+	void *FrameBufferOSX::lock()
+	{
+		stride = width * 4 * sizeof(uint8_t);   // Same row size that blit() passes to CGImageCreate
+		framebuffer = buffer;
+		return framebuffer;
+	}
+
+	void FrameBufferOSX::unlock()
+	{
+		framebuffer = nullptr;   // 'buffer' itself stays allocated until destruction
+	}
+}
+
+sw::FrameBuffer *createFrameBuffer(void *display, EGLNativeWindowType nativeWindow, int width, int height)
+{
+	NSObject *window = reinterpret_cast<NSObject*>(nativeWindow);   // Accepted kinds: NSView or CALayer; 'display' is unused
+	CALayer *layer = nullptr;
+
+	if([window isKindOfClass:[NSView class]])
+	{
+		NSView *view = reinterpret_cast<NSView*>(window);
+		[view setWantsLayer:YES];   // Requested before the view's layer is fetched
+		layer = [view layer];
+	}
+	else if([window isKindOfClass:[CALayer class]])
+	{
+		layer = reinterpret_cast<CALayer*>(window);
+	}
+	else ASSERT(0);   // Any other object kind: 'layer' stays null and is still passed on below
+
+	return new sw::FrameBufferOSX(layer, width, height);
+}
diff --git a/src/WSI/FrameBufferOzone.cpp b/src/WSI/FrameBufferOzone.cpp
new file mode 100644
index 0000000..95e0729
--- /dev/null
+++ b/src/WSI/FrameBufferOzone.cpp
@@ -0,0 +1,54 @@
+// Copyright 2017 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferOzone.hpp"
+
+namespace sw
+{
+	FrameBufferOzone::FrameBufferOzone(intptr_t display, intptr_t window, int width, int height) : FrameBuffer(width, height, false, false)
+	{   // 'display' and 'window' are not used; the frame buffer is backed by an ordinary surface
+		buffer = sw::Surface::create(width, height, 1, format, nullptr,
+		                             sw::Surface::pitchB(width, 0, format, true),
+		                             sw::Surface::sliceB(width, height, 0, format, true));
+	}
+
+	FrameBufferOzone::~FrameBufferOzone()
+	{
+		delete buffer;   // Surface created in the constructor
+	}
+
+	void *FrameBufferOzone::lock()
+	{
+		framebuffer = buffer->lockInternal(0, 0, 0, sw::LOCK_READWRITE, sw::PUBLIC);   // Must be paired with unlock()
+
+		return framebuffer;
+	}
+
+	void FrameBufferOzone::unlock()
+	{
+		buffer->unlockInternal();
+		// The pointer handed out by lock() must not be used past this point.
+		framebuffer = nullptr;
+	}
+
+	void FrameBufferOzone::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		copy(source);   // Rectangles are not used; nothing else is done with the result here
+	}
+}
+
+NO_SANITIZE_FUNCTION sw::FrameBuffer *createFrameBuffer(void* display, intptr_t window, int width, int height)
+{
+	return new sw::FrameBufferOzone((intptr_t)display, window, width, height);   // Display pointer is passed on as an integer
+}
diff --git a/src/WSI/FrameBufferOzone.hpp b/src/WSI/FrameBufferOzone.hpp
new file mode 100644
index 0000000..0dc9f60
--- /dev/null
+++ b/src/WSI/FrameBufferOzone.hpp
@@ -0,0 +1,40 @@
+// Copyright 2017 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferOzone_hpp
+#define sw_FrameBufferOzone_hpp
+
+#include "Main/FrameBuffer.hpp"
+
+namespace sw
+{
+	class FrameBufferOzone : public FrameBuffer   // Frame buffer backed by an sw::Surface
+	{
+	public:
+		FrameBufferOzone(intptr_t display, intptr_t window, int width, int height);
+
+		~FrameBufferOzone() override;
+
+		void flip(sw::Surface *source) override {blit(source, nullptr, nullptr);};   // A flip is a blit without rectangles
+		void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void *lock() override;
+		void unlock() override;
+
+	private:
+		sw::Surface* buffer;   // Backing surface; created in the constructor, deleted in the destructor
+	};
+}
+
+#endif   // sw_FrameBufferOzone_hpp
diff --git a/src/WSI/FrameBufferWin.cpp b/src/WSI/FrameBufferWin.cpp
new file mode 100644
index 0000000..cad8954
--- /dev/null
+++ b/src/WSI/FrameBufferWin.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferWin.hpp"
+
+namespace sw
+{
+	FrameBufferWin::FrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin) : FrameBuffer(width, height, fullscreen, topLeftOrigin), windowHandle(windowHandle)
+	{
+		if(!windowed)   // originalWindowStyle is only assigned (and later only read) in fullscreen mode
+		{
+			// Force fullscreen window style (no borders)
+			originalWindowStyle = GetWindowLong(windowHandle, GWL_STYLE);   // Saved so the destructor can restore it
+			SetWindowLong(windowHandle, GWL_STYLE, WS_POPUP);
+		}
+	}
+
+	FrameBufferWin::~FrameBufferWin()
+	{
+		if(!windowed && GetWindowLong(windowHandle, GWL_STYLE) == WS_POPUP)   // Skip if the style was changed in the meantime
+		{
+			SetWindowLong(windowHandle, GWL_STYLE, originalWindowStyle);   // Undo the constructor's style change
+		}
+	}
+
+	void FrameBufferWin::updateBounds(HWND windowOverride)
+	{
+		if(!windowed)
+		{
+			// Fullscreen: the bounds are the screen dimensions reported by the system.
+			SetRect(&bounds, 0, 0, GetSystemMetrics(SM_CXSCREEN), GetSystemMetrics(SM_CYSCREEN));
+			return;
+		}
+
+		HWND window = windowOverride ? windowOverride : windowHandle;
+		POINT *corners = (POINT*)&bounds;   // The RECT addressed as its two corner points
+		GetClientRect(window, &bounds);
+		ClientToScreen(window, &corners[0]);
+		ClientToScreen(window, &corners[1]);
+	}
+}
+
+#include "FrameBufferDD.hpp"
+#include "FrameBufferGDI.hpp"
+#include "Common/Configurator.hpp"
+
+sw::FrameBufferWin *createFrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin)
+{
+	// The "Testing" / "FrameBufferAPI" setting in SwiftShader.ini picks the backend: DirectDraw is used
+	// only for value 0 (the default) combined with a top-left origin; everything else uses GDI.
+	sw::Configurator ini("SwiftShader.ini");
+	const int api = ini.getInteger("Testing", "FrameBufferAPI", 0);
+
+	if(!(api == 0 && topLeftOrigin))
+	{
+		return new sw::FrameBufferGDI(windowHandle, width, height, fullscreen, topLeftOrigin);
+	}
+	else
+	{
+		return new sw::FrameBufferDD(windowHandle, width, height, fullscreen, topLeftOrigin);
+	}
+}
+
+sw::FrameBuffer *createFrameBuffer(void *display, HWND window, int width, int height)
+{
+	return createFrameBufferWin(window, width, height, false, false);   // Windowed, bottom-left origin; 'display' is unused
+}
diff --git a/src/WSI/FrameBufferWin.hpp b/src/WSI/FrameBufferWin.hpp
new file mode 100644
index 0000000..15c1e0e
--- /dev/null
+++ b/src/WSI/FrameBufferWin.hpp
@@ -0,0 +1,59 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef	sw_FrameBufferWin_hpp
+#define	sw_FrameBufferWin_hpp
+
+#include "FrameBuffer.hpp"
+
+namespace sw
+{
+	// Gamma lookup tables exchanged through setGammaRamp() / getGammaRamp():
+	// one 256-entry table of shorts per color channel.
+	struct GammaRamp
+	{
+		short red[256];
+		short green[256];
+		short blue[256];
+	};
+
+	// Common base for the Windows frame buffer implementations. It keeps the
+	// target window handle, the style the window had before a borderless style was
+	// forced on it, and a bounds rectangle; presenting, gamma, screenshot and
+	// scanline queries are left to subclasses through the pure virtual members.
+	class FrameBufferWin : public FrameBuffer
+	{
+	public:
+		FrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+		~FrameBufferWin() override;
+
+		void flip(sw::Surface *source) override = 0;
+		void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override = 0;
+
+		// Overloads taking an explicit window; presumably a null windowOverride
+		// means "use windowHandle", as updateBounds() does - confirm in subclasses.
+		virtual void flip(HWND windowOverride, sw::Surface *source) = 0;
+		virtual void blit(HWND windowOverride, sw::Surface *source, const Rect *sourceRect, const Rect *destRect) = 0;
+
+		virtual void setGammaRamp(GammaRamp *gammaRamp, bool calibrate) = 0;
+		virtual void getGammaRamp(GammaRamp *gammaRamp) = 0;
+
+		virtual void screenshot(void *destBuffer) = 0;
+		virtual bool getScanline(bool &inVerticalBlank, unsigned int &scanline) = 0;
+
+	protected:
+		// Recomputes 'bounds' for the given window (or windowHandle when null).
+		void updateBounds(HWND windowOverride);
+
+		HWND windowHandle;
+		DWORD originalWindowStyle;   // Only assigned by the constructor when not windowed.
+		RECT bounds;                 // Filled in by updateBounds().
+	};
+}
+
+// Factory for the Windows frame buffer; declared at global scope, outside namespace sw.
+sw::FrameBufferWin *createFrameBufferWin(HWND windowHandle, int width, int height, bool fullscreen, bool topLeftOrigin);
+
+#endif	 //	sw_FrameBufferWin_hpp
diff --git a/src/WSI/FrameBufferX11.cpp b/src/WSI/FrameBufferX11.cpp
new file mode 100644
index 0000000..b3ae3b4
--- /dev/null
+++ b/src/WSI/FrameBufferX11.cpp
@@ -0,0 +1,192 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "FrameBufferX11.hpp"
+
+#include "libX11.hpp"
+#include "Common/Timer.hpp"
+
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+
+namespace sw
+{
+	// Error handler that was installed before XShmErrorHandler; errors other than
+	// BadAccess are forwarded to it.
+	static int (*PreviousXErrorHandler)(Display *display, XErrorEvent *event) = 0;
+	// Raised by XShmErrorHandler on a BadAccess error; the FrameBufferX11
+	// constructor checks it after XShmAttach() and clears it again.
+	static bool shmBadAccess = false;
+
+	// Catches BadAccess errors so we can fall back to not using MIT-SHM.
+	// Any other error is passed on to the previously installed handler.
+	static int XShmErrorHandler(Display *display, XErrorEvent *event)
+	{
+		if(event->error_code != BadAccess)
+		{
+			return PreviousXErrorHandler(display, event);
+		}
+
+		shmBadAccess = true;
+
+		return 0;
+	}
+
+	// Sets up presentation to an X11 window.
+	//
+	// When 'display' is null a connection is opened here and closed again by the
+	// destructor (tracked by ownX11). The backing XImage is created through
+	// MIT-SHM when the extension is present, the image can be created and the
+	// server accepts the attach; in every other case it is an ordinary XImage
+	// backed by a zero-filled heap buffer of width * 4 bytes per line.
+	FrameBufferX11::FrameBufferX11(Display *display, Window window, int width, int height) : FrameBuffer(width, height, false, false), ownX11(!display), x_display(display), x_window(window)
+	{
+		if(!x_display)
+		{
+			x_display = libX11->XOpenDisplay(0);
+			assert(x_display);
+		}
+
+		int screen = DefaultScreen(x_display);
+		x_gc = libX11->XDefaultGC(x_display, screen);
+		int depth = libX11->XDefaultDepth(x_display, screen);
+
+		XVisualInfo x_visual;
+		Status status = libX11->XMatchVisualInfo(x_display, screen, 32, TrueColor, &x_visual);
+		bool match = (status != 0 && x_visual.blue_mask == 0xFF);   // Prefer X8R8G8B8
+		Visual *visual = match ? x_visual.visual : libX11->XDefaultVisual(x_display, screen);
+
+		// The function pointer itself is checked because the extension's symbols may not have been resolved.
+		mit_shm = (libX11->XShmQueryExtension && libX11->XShmQueryExtension(x_display) == True);
+
+		if(mit_shm)
+		{
+			x_image = libX11->XShmCreateImage(x_display, visual, depth, ZPixmap, 0, &shminfo, width, height);
+
+			if(!x_image)
+			{
+				// Without an image there is nothing to size the segment from;
+				// use the non-shared path below instead of dereferencing null.
+				mit_shm = false;
+			}
+		}
+
+		if(mit_shm)
+		{
+			shminfo.shmid = shmget(IPC_PRIVATE, x_image->bytes_per_line * x_image->height, IPC_CREAT | SHM_R | SHM_W);
+			shminfo.shmaddr = x_image->data = (char*)shmat(shminfo.shmid, 0, 0);
+			shminfo.readOnly = False;
+
+			// Install our handler only around the attach; XSync() makes sure any
+			// error it produces is delivered before the old handler is restored.
+			PreviousXErrorHandler = libX11->XSetErrorHandler(XShmErrorHandler);
+			libX11->XShmAttach(x_display, &shminfo);   // May produce a BadAccess error
+			libX11->XSync(x_display, False);
+			libX11->XSetErrorHandler(PreviousXErrorHandler);
+
+			if(shmBadAccess)
+			{
+				mit_shm = false;
+
+				XDestroyImage(x_image);
+				shmdt(shminfo.shmaddr);
+				shmctl(shminfo.shmid, IPC_RMID, 0);
+
+				shmBadAccess = false;
+			}
+		}
+
+		if(!mit_shm)
+		{
+			int bytes_per_line = width * 4;
+
+			// calloc() returns zeroed memory and reports failure (including an
+			// overflowing size) as null, so nothing is written through a bad pointer.
+			char *buffer = (char*)calloc((size_t)height, (size_t)bytes_per_line);
+
+			x_image = libX11->XCreateImage(x_display, visual, depth, ZPixmap, 0, buffer, width, height, 32, bytes_per_line);
+			assert(x_image);
+
+			if(!x_image)
+			{
+				free(buffer);   // No image was created to take ownership of the buffer.
+			}
+		}
+	}
+
+	// Releases the XImage and, on the MIT-SHM path, detaches and removes the
+	// shared memory segment. Closes the display only if this object opened it.
+	FrameBufferX11::~FrameBufferX11()
+	{
+		if(mit_shm)
+		{
+			libX11->XShmDetach(x_display, &shminfo);
+		}
+
+		// XDestroyImage() calls through the image's own function table, so it
+		// must not be handed a null image. The constructor can leave x_image null
+		// when XCreateImage() fails in a build where assert() is compiled out.
+		if(x_image)
+		{
+			XDestroyImage(x_image);
+		}
+
+		if(mit_shm)
+		{
+			shmdt(shminfo.shmaddr);
+			shmctl(shminfo.shmid, IPC_RMID, 0);
+		}
+
+		if(ownX11)
+		{
+			libX11->XCloseDisplay(x_display);
+		}
+	}
+
+	// Points 'framebuffer' and 'stride' at the XImage's pixel data and line pitch
+	// and returns the buffer. Without an image the current 'framebuffer' value is
+	// returned unchanged.
+	void *FrameBufferX11::lock()
+	{
+		if(!x_image)
+		{
+			return framebuffer;
+		}
+
+		framebuffer = x_image->data;
+		stride = x_image->bytes_per_line;
+
+		return framebuffer;
+	}
+
+	// Drops the pointer handed out by lock(); the XImage itself is left untouched.
+	void FrameBufferX11::unlock()
+	{
+		framebuffer = nullptr;
+	}
+
+	// Runs copy(source), then sends the whole XImage to the window at (0, 0),
+	// using the MIT-SHM or the plain Xlib put call depending on how the image was
+	// created, and waits for the server with XSync(). 'sourceRect' and 'destRect'
+	// are not used by this implementation.
+	void FrameBufferX11::blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect)
+	{
+		copy(source);
+
+		if(mit_shm)
+		{
+			libX11->XShmPutImage(x_display, x_window, x_gc, x_image, 0, 0, 0, 0, width, height, False);
+		}
+		else
+		{
+			libX11->XPutImage(x_display, x_window, x_gc, x_image, 0, 0, 0, 0, width, height);
+		}
+
+		libX11->XSync(x_display, False);
+
+		if(false)   // Draw the framerate on screen
+		{
+			// Debug aid, compiled but never executed: averages the frame rate over
+			// intervals of more than one second and remembers the highest value.
+			static double lastSample = sw::Timer::seconds();
+			static int frameCount = -1;
+
+			const double now = sw::Timer::seconds();
+			const double elapsed = now - lastSample;
+			++frameCount;
+
+			static double fps = 0.0;
+			static double peakFps = 0.0;
+
+			if(elapsed > 1.0)
+			{
+				fps = frameCount / elapsed;
+
+				lastSample = now;
+				frameCount = 0;
+
+				peakFps = (fps > peakFps) ? fps : peakFps;
+			}
+
+			char text[256];
+			sprintf(text, "FPS: %.2f (max: %.2f)", fps, peakFps);
+			libX11->XDrawString(x_display, x_window, x_gc, 50, 50, text, strlen(text));
+		}
+	}
+}
+
+// X11 implementation of the generic factory. 'display' is an untyped Xlib
+// Display pointer; null makes the frame buffer open its own connection.
+NO_SANITIZE_FUNCTION sw::FrameBuffer *createFrameBuffer(void *display, Window window, int width, int height)
+{
+	::Display *x_display = (::Display*)display;
+
+	return new sw::FrameBufferX11(x_display, window, width, height);
+}
diff --git a/src/WSI/FrameBufferX11.hpp b/src/WSI/FrameBufferX11.hpp
new file mode 100644
index 0000000..dc96331
--- /dev/null
+++ b/src/WSI/FrameBufferX11.hpp
@@ -0,0 +1,52 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef sw_FrameBufferX11_hpp
+#define sw_FrameBufferX11_hpp
+
+#include "Main/FrameBuffer.hpp"
+#include "Common/Debug.hpp"
+
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/extensions/XShm.h>
+
+namespace sw
+{
+	// Frame buffer that presents to an X11 window through an XImage, optionally
+	// backed by an MIT-SHM shared memory segment.
+	class FrameBufferX11 : public FrameBuffer
+	{
+	public:
+		// A null 'display' makes the object open (and later close) its own connection.
+		FrameBufferX11(Display *display, Window window, int width, int height);
+
+		~FrameBufferX11() override;
+
+		// flip() is a blit() with no rectangles.
+		void flip(sw::Surface *source) override { blit(source, nullptr, nullptr); }
+		void blit(sw::Surface *source, const Rect *sourceRect, const Rect *destRect) override;
+
+		void *lock() override;
+		void unlock() override;
+
+	private:
+		const bool ownX11;          // True when the constructor was given a null display.
+		Display *x_display;
+		const Window x_window;
+		XImage *x_image = nullptr;
+		GC x_gc;
+
+		bool mit_shm;               // True while x_image is backed by the shared memory segment in shminfo.
+		XShmSegmentInfo shminfo;    // Only filled in on the MIT-SHM path.
+	};
+}
+
+#endif   // sw_FrameBufferX11_hpp
diff --git a/src/WSI/libX11.cpp b/src/WSI/libX11.cpp
new file mode 100644
index 0000000..f3723ff
--- /dev/null
+++ b/src/WSI/libX11.cpp
@@ -0,0 +1,84 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "libX11.hpp"
+
+#include "Common/SharedLibrary.hpp"
+
+#define Bool int
+
+// Fills the export table by looking every entry point up by name with
+// getProcAddress(): the core Xlib functions in 'libX11', the XShm* functions in
+// 'libXext'. Each result is cast to the pointer type of the matching member;
+// no lookup result is checked here.
+LibX11exports::LibX11exports(void *libX11, void *libXext)
+{
+	// Core Xlib.
+	XOpenDisplay = (Display *(*)(char*))getProcAddress(libX11, "XOpenDisplay");
+	XGetWindowAttributes = (Status (*)(Display*, Window, XWindowAttributes*))getProcAddress(libX11, "XGetWindowAttributes");
+	XDefaultScreenOfDisplay = (Screen *(*)(Display*))getProcAddress(libX11, "XDefaultScreenOfDisplay");
+	XWidthOfScreen = (int (*)(Screen*))getProcAddress(libX11, "XWidthOfScreen");
+	XHeightOfScreen = (int (*)(Screen*))getProcAddress(libX11, "XHeightOfScreen");
+	XPlanesOfScreen = (int (*)(Screen*))getProcAddress(libX11, "XPlanesOfScreen");
+	XDefaultGC = (GC (*)(Display*, int))getProcAddress(libX11, "XDefaultGC");
+	XDefaultDepth = (int (*)(Display*, int))getProcAddress(libX11, "XDefaultDepth");
+	XMatchVisualInfo = (Status (*)(Display*, int, int, int, XVisualInfo*))getProcAddress(libX11, "XMatchVisualInfo");
+	XDefaultVisual = (Visual *(*)(Display*, int screen_number))getProcAddress(libX11, "XDefaultVisual");
+	XSetErrorHandler = (int (*(*)(int (*)(Display*, XErrorEvent*)))(Display*, XErrorEvent*))getProcAddress(libX11, "XSetErrorHandler");
+	XSync = (int (*)(Display*, Bool))getProcAddress(libX11, "XSync");
+	XCreateImage = (XImage *(*)(Display*, Visual*, unsigned int, int, int, char*, unsigned int, unsigned int, int, int))getProcAddress(libX11, "XCreateImage");
+	XCloseDisplay = (int (*)(Display*))getProcAddress(libX11, "XCloseDisplay");
+	XPutImage = (int (*)(Display*, Drawable, GC, XImage*, int, int, int, int, unsigned int, unsigned int))getProcAddress(libX11, "XPutImage");
+	XDrawString = (int (*)(Display*, Drawable, GC, int, int, char*, int))getProcAddress(libX11, "XDrawString");
+
+	// MIT-SHM extension.
+	XShmQueryExtension = (Bool (*)(Display*))getProcAddress(libXext, "XShmQueryExtension");
+	XShmCreateImage = (XImage *(*)(Display*, Visual*, unsigned int, int, char*, XShmSegmentInfo*, unsigned int, unsigned int))getProcAddress(libXext, "XShmCreateImage");
+	XShmAttach = (Bool (*)(Display*, XShmSegmentInfo*))getProcAddress(libXext, "XShmAttach");
+	XShmDetach = (Bool (*)(Display*, XShmSegmentInfo*))getProcAddress(libXext, "XShmDetach");
+	XShmPutImage = (int (*)(Display*, Drawable, GC, XImage*, int, int, int, int, unsigned int, unsigned int, bool))getProcAddress(libXext, "XShmPutImage");
+}
+
+// Gives access to the export table, loading it on first use. The result is
+// whatever loadExports() returns, which is null when no X11 library was found.
+LibX11exports *LibX11::operator->()
+{
+	return loadExports();
+}
+
+// Returns the export table, building it on the first call.
+//
+// Symbols already present in the process are preferred; otherwise libX11.so
+// and libXext.so are loaded. The local 'libX11' handle doubles as the "already
+// tried" marker: it is set to (void*)-1 when no library had to be (or could be)
+// loaded. Returns null when X11 is unavailable.
+LibX11exports *LibX11::loadExports()
+{
+	static void *libX11 = nullptr;
+	static void *libXext = nullptr;
+	static LibX11exports *libX11exports = nullptr;
+
+	if(libX11)
+	{
+		return libX11exports;   // Resolved, or failed, on an earlier call.
+	}
+
+	if(getProcAddress(RTLD_DEFAULT, "XOpenDisplay"))   // Search the global scope for pre-loaded X11 library.
+	{
+		libX11exports = new LibX11exports(RTLD_DEFAULT, RTLD_DEFAULT);
+		libX11 = (void*)-1;   // No need to load it.
+
+		return libX11exports;
+	}
+
+	libX11 = loadLibrary("libX11.so");
+
+	if(!libX11)
+	{
+		libX11 = (void*)-1;   // Don't attempt loading more than once.
+
+		return libX11exports;
+	}
+
+	libXext = loadLibrary("libXext.so");
+	libX11exports = new LibX11exports(libX11, libXext);
+
+	return libX11exports;
+}
+
+// The single global accessor object; callers write libX11->XFunction(...).
+LibX11 libX11;
diff --git a/src/WSI/libX11.hpp b/src/WSI/libX11.hpp
new file mode 100644
index 0000000..c188386
--- /dev/null
+++ b/src/WSI/libX11.hpp
@@ -0,0 +1,69 @@
+// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef libX11_hpp
+#define libX11_hpp
+
+#define Bool int
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/extensions/XShm.h>
+
+// Table of Xlib and MIT-SHM entry points that are looked up by name at run
+// time. Each member carries the name of the function it points to.
+struct LibX11exports
+{
+	// 'libX11' supplies the core Xlib functions, 'libXext' the XShm* functions.
+	LibX11exports(void *libX11, void *libXext);
+
+	Display *(*XOpenDisplay)(char *display_name);
+	Status (*XGetWindowAttributes)(Display *display, Window w, XWindowAttributes *window_attributes_return);
+	Screen *(*XDefaultScreenOfDisplay)(Display *display);
+	int (*XWidthOfScreen)(Screen *screen);
+	int (*XHeightOfScreen)(Screen *screen);
+	int (*XPlanesOfScreen)(Screen *screen);
+	GC (*XDefaultGC)(Display *display, int screen_number);
+	int (*XDefaultDepth)(Display *display, int screen_number);
+	Status (*XMatchVisualInfo)(Display *display, int screen, int depth, int screen_class, XVisualInfo *vinfo_return);
+	Visual *(*XDefaultVisual)(Display *display, int screen_number);
+	// Takes the new handler and returns the one that was installed before.
+	int (*(*XSetErrorHandler)(int (*handler)(Display*, XErrorEvent*)))(Display*, XErrorEvent*);
+	int (*XSync)(Display *display, Bool discard);
+	XImage *(*XCreateImage)(Display *display, Visual *visual, unsigned int depth, int format, int offset, char *data, unsigned int width, unsigned int height, int bitmap_pad, int bytes_per_line);
+	int (*XCloseDisplay)(Display *display);
+	int (*XPutImage)(Display *display, Drawable d, GC gc, XImage *image, int src_x, int src_y, int dest_x, int dest_y, unsigned int width, unsigned int height);
+	int (*XDrawString)(Display *display, Drawable d, GC gc, int x, int y, char *string, int length);
+
+	Bool (*XShmQueryExtension)(Display *display);
+	XImage *(*XShmCreateImage)(Display *display, Visual *visual, unsigned int depth, int format, char *data, XShmSegmentInfo *shminfo, unsigned int width, unsigned int height);
+	Bool (*XShmAttach)(Display *display, XShmSegmentInfo *shminfo);
+	Bool (*XShmDetach)(Display *display, XShmSegmentInfo *shminfo);
+	// NOTE(review): XShm.h declares send_event as Bool (int), not bool - confirm the mismatch is harmless on all target ABIs.
+	int (*XShmPutImage)(Display *display, Drawable d, GC gc, XImage *image, int src_x, int src_y, int dest_x, int dest_y, unsigned int width, unsigned int height, bool send_event);
+};
+
+#undef Bool
+
+// Accessor for the X11 export table. Both operators go through loadExports(),
+// so the lookup happens when the object is first used rather than at start-up.
+class LibX11
+{
+public:
+	// True when an export table could be obtained.
+	operator bool()
+	{
+		return loadExports() != nullptr;
+	}
+
+	// Access to the resolved entry points.
+	LibX11exports *operator->();
+
+private:
+	LibX11exports *loadExports();
+};
+
+// Global accessor object, defined in libX11.cpp.
+extern LibX11 libX11;
+
+#endif   // libX11_hpp