Update SwiftShader to April code dump.

April code dump from TransGaming. Adds new shader compiler.
diff --git a/src/Shader/PixelRoutine.cpp b/src/Shader/PixelRoutine.cpp
index f1ac5be..e1b735d 100644
--- a/src/Shader/PixelRoutine.cpp
+++ b/src/Shader/PixelRoutine.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -12,7 +12,6 @@
 #include "PixelRoutine.hpp"
 
 #include "Renderer.hpp"
-#include "PixelShader.hpp"
 #include "QuadRasterizer.hpp"
 #include "Surface.hpp"
 #include "Primitive.hpp"
@@ -30,17 +29,21 @@
 	extern bool complementaryDepthBuffer;
 	extern bool postBlendSRGB;
 	extern bool exactColorRounding;
+	extern bool booleanFaceRegister;
+	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
+	extern bool fullPixelPositionRegister;
 
-	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *pixelShader) : Rasterizer(state), pixelShader(pixelShader)
+	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : Rasterizer(state), shader(shader)
 	{
 		perturbate = false;
 		luminance = false;
 		previousScaling = false;
 
-		returns = false;
 		ifDepth = 0;
 		loopRepDepth = 0;
 		breakDepth = 0;
+		currentLabel = -1;
+		whileTest = false;
 
 		for(int i = 0; i < 2048; i++)
 		{
@@ -68,7 +71,7 @@
 		}
 
 		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
-		const bool integerPipeline = pixelShaderVersion() <= 0x0104;
+		const bool integerPipeline = shaderVersion() <= 0x0104;
 
 		Int zMask[4];   // Depth mask
 		Int sMask[4];   // Stencil mask
@@ -86,18 +89,19 @@
 
 		Float4 f;
 
-		Color4i &current = r.ri[0];
-		Color4i &diffuse = r.vi[0];
-		Color4i &specular = r.vi[1];
+		Vector4i &current = r.ri[0];
+		Vector4i &diffuse = r.vi[0];
+		Vector4i &specular = r.vi[1];
 
 		Float4 (&z)[4] = r.z;
+		Float4 &w = r.w;
 		Float4 &rhw = r.rhw;
 		Float4 rhwCentroid;
 
 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,xQuad), 16);
 		Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(r.primitive + OFFSET(Primitive,yQuad), 16);
 
-		if(state.depthTestActive || state.pixelFogActive())
+		if(interpolateZ())
 		{
 			for(unsigned int q = 0; q < state.multiSample; q++)
 			{
@@ -151,9 +155,10 @@
 				YYYY += yyyy;
 			}
 
-			if(state.perspective)
+			if(interpolateW())
 			{
-				rhw = reciprocal(interpolate(xxxx, r.Dw, rhw, r.primitive + OFFSET(Primitive,w), false, false));
+				w = interpolate(xxxx, r.Dw, rhw, r.primitive + OFFSET(Primitive,w), false, false);
+				rhw = reciprocal(w);
 
 				if(state.centroid)
 				{
@@ -165,27 +170,15 @@
 			{
 				for(int component = 0; component < 4; component++)
 				{
-					Array<Float4> *pv;
-
-					switch(component)
-					{
-					case 0: pv = &r.vx; break;
-					case 1: pv = &r.vy; break;
-					case 2: pv = &r.vz; break;
-					case 3: pv = &r.vw; break;
-					}
-
-					Array<Float4> &v = *pv;
-
 					if(state.interpolant[interpolant].component & (1 << component))
 					{
 						if(!state.interpolant[interpolant].centroid)
 						{
-							v[interpolant] = interpolate(xxxx, r.Dv[interpolant][component], rhw, r.primitive + OFFSET(Primitive,V[interpolant][component]), state.interpolant[interpolant].flat & (1 << component), state.perspective);
+							r.vf[interpolant][component] = interpolate(xxxx, r.Dv[interpolant][component], rhw, r.primitive + OFFSET(Primitive,V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
 						}
 						else
 						{
-							v[interpolant] = interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive,V[interpolant][component]), state.interpolant[interpolant].flat & (1 << component), state.perspective);
+							r.vf[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, r.primitive + OFFSET(Primitive,V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
 						}
 					}
 				}
@@ -197,19 +190,19 @@
 				case 0:
 					break;
 				case 1:
-					rcp = reciprocal(Float4(r.vy[interpolant]));
-					r.vx[interpolant] = r.vx[interpolant] * rcp;
+					rcp = reciprocal(r.vf[interpolant].y);
+					r.vf[interpolant].x = r.vf[interpolant].x * rcp;
 					break;
 				case 2:
-					rcp = reciprocal(Float4(r.vz[interpolant]));
-					r.vx[interpolant] = r.vx[interpolant] * rcp;
-					r.vy[interpolant] = r.vy[interpolant] * rcp;
+					rcp = reciprocal(r.vf[interpolant].z);
+					r.vf[interpolant].x = r.vf[interpolant].x * rcp;
+					r.vf[interpolant].y = r.vf[interpolant].y * rcp;
 					break;
 				case 3:
-					rcp = reciprocal(Float4(r.vw[interpolant]));
-					r.vx[interpolant] = r.vx[interpolant] * rcp;
-					r.vy[interpolant] = r.vy[interpolant] * rcp;
-					r.vz[interpolant] = r.vz[interpolant] * rcp;
+					rcp = reciprocal(r.vf[interpolant].w);
+					r.vf[interpolant].x = r.vf[interpolant].x * rcp;
+					r.vf[interpolant].y = r.vf[interpolant].y * rcp;
+					r.vf[interpolant].z = r.vf[interpolant].z * rcp;
 					break;
 				}
 			}
@@ -221,32 +214,47 @@
 
 			if(integerPipeline)
 			{
-				if(state.color[0].component & 0x1) diffuse.x = convertFixed12(Float4(r.vx[0])); else diffuse.x = Short4(0x1000, 0x1000, 0x1000, 0x1000);
-				if(state.color[0].component & 0x2) diffuse.y = convertFixed12(Float4(r.vy[0])); else diffuse.y = Short4(0x1000, 0x1000, 0x1000, 0x1000);
-				if(state.color[0].component & 0x4) diffuse.z = convertFixed12(Float4(r.vz[0])); else diffuse.z = Short4(0x1000, 0x1000, 0x1000, 0x1000);
-				if(state.color[0].component & 0x8) diffuse.w = convertFixed12(Float4(r.vw[0])); else diffuse.w = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+				if(state.color[0].component & 0x1) diffuse.x = convertFixed12(r.vf[0].x); else diffuse.x = Short4(0x1000);
+				if(state.color[0].component & 0x2) diffuse.y = convertFixed12(r.vf[0].y); else diffuse.y = Short4(0x1000);
+				if(state.color[0].component & 0x4) diffuse.z = convertFixed12(r.vf[0].z); else diffuse.z = Short4(0x1000);
+				if(state.color[0].component & 0x8) diffuse.w = convertFixed12(r.vf[0].w); else diffuse.w = Short4(0x1000);
 
-				if(state.color[1].component & 0x1) specular.x = convertFixed12(Float4(r.vx[1])); else specular.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-				if(state.color[1].component & 0x2) specular.y = convertFixed12(Float4(r.vy[1])); else specular.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-				if(state.color[1].component & 0x4) specular.z = convertFixed12(Float4(r.vz[1])); else specular.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-				if(state.color[1].component & 0x8) specular.w = convertFixed12(Float4(r.vw[1])); else specular.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+				if(state.color[1].component & 0x1) specular.x = convertFixed12(r.vf[1].x); else specular.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+				if(state.color[1].component & 0x2) specular.y = convertFixed12(r.vf[1].y); else specular.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+				if(state.color[1].component & 0x4) specular.z = convertFixed12(r.vf[1].z); else specular.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+				if(state.color[1].component & 0x8) specular.w = convertFixed12(r.vf[1].w); else specular.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 			}
-			else if(pixelShaderVersion() >= 0x0300)
+			else if(shaderVersion() >= 0x0300)
 			{
-				if(pixelShader->vPosDeclared)
+				if(shader->vPosDeclared)
 				{
-					r.vPos.x = Float4(Float(x)) + Float4(0, 1, 0, 1);
-					r.vPos.y = Float4(Float(y)) + Float4(0, 0, 1, 1);
+					if(!halfIntegerCoordinates)
+					{
+						r.vPos.x = Float4(Float(x)) + Float4(0, 1, 0, 1);
+						r.vPos.y = Float4(Float(y)) + Float4(0, 0, 1, 1);
+					}
+					else
+					{
+						r.vPos.x = Float4(Float(x)) + Float4(0.5f, 1.5f, 0.5f, 1.5f);
+						r.vPos.y = Float4(Float(y)) + Float4(0.5f, 0.5f, 1.5f, 1.5f);
+					}
+
+					if(fullPixelPositionRegister)
+					{
+						r.vPos.z = z[0];   // FIXME: Centroid?
+						r.vPos.w = w;      // FIXME: Centroid?
+					}
 				}
 
-				if(pixelShader->vFaceDeclared)
+				if(shader->vFaceDeclared)
 				{
 					Float4 area = *Pointer<Float>(r.primitive + OFFSET(Primitive,area));
-					
-					r.vFace.x = area;
-					r.vFace.y = area;
-					r.vFace.z = area;
-					r.vFace.w = area;
+					Float4 face = booleanFaceRegister ? As<Float4>(CmpNLT(area, Float4(0.0f))) : area;
+
+					r.vFace.x = face;
+					r.vFace.y = face;
+					r.vFace.z = face;
+					r.vFace.w = face;
 				}
 			}
 
@@ -262,11 +270,11 @@
 					Long shaderTime = Ticks();
 				#endif
 
-				if(pixelShader)
+				if(shader)
 				{
-				//	pixelShader->print("PixelShader-%0.16llX.txt", state.shaderHash);
+				//	shader->print("PixelShader-%0.8X.txt", state.shaderID);
 
-					if(pixelShader->getVersion() <= 0x0104)
+					if(shader->getVersion() <= 0x0104)
 					{
 						ps_1_x(r, cMask);
 					}
@@ -278,7 +286,7 @@
 				else
 				{
 					current = diffuse;
-					Color4i temp(0x0000, 0x0000, 0x0000, 0x0000);
+					Vector4i temp(0x0000, 0x0000, 0x0000, 0x0000);
 
 					for(int stage = 0; stage < 8; stage++)
 					{
@@ -287,7 +295,7 @@
 							break;
 						}
 
-						Color4i texture;
+						Vector4i texture;
 
 						if(state.textureStage[stage].usesTexture)
 						{
@@ -306,10 +314,10 @@
 
 				if(integerPipeline)
 				{
-					current.r = Min(current.r, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.r = Max(current.r, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-					current.g = Min(current.g, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.g = Max(current.g, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-					current.b = Min(current.b, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.b = Max(current.b, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-					current.a = Min(current.a, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.a = Max(current.a, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+					current.x = Min(current.x, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.x = Max(current.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+					current.y = Min(current.y, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.y = Max(current.y, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+					current.z = Min(current.z, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.z = Max(current.z, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+					current.w = Min(current.w, Short4(0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF)); current.w = Max(current.w, Short4(0x0000, 0x0000, 0x0000, 0x0000));
 
 					alphaPass = alphaTest(r, cMask, current);
 				}
@@ -320,7 +328,7 @@
 					alphaPass = alphaTest(r, cMask, r.oC[0]);
 				}
 
-				if((pixelShader && pixelShader->containsTexkill()) || state.alphaTestActive())
+				if((shader && shader->containsKill()) || state.alphaTestActive())
 				{
 					for(unsigned int q = 0; q < state.multiSample; q++)
 					{
@@ -528,11 +536,11 @@
 
 		Float4 Z = z;
 
-		if(pixelShader && pixelShader->depthOverride())
+		if(shader && shader->depthOverride())
 		{
 			if(complementaryDepthBuffer)
 			{
-				Z = Float4(1, 1, 1, 1) - r.oDepth;
+				Z = Float4(1.0f) - r.oDepth;
 			}
 			else
 			{
@@ -655,15 +663,15 @@
 		return zMask != 0;
 	}
 
-	void PixelRoutine::blendTexture(Registers &r, Color4i &current, Color4i &temp, Color4i &texture, int stage)
+	void PixelRoutine::blendTexture(Registers &r, Vector4i &current, Vector4i &temp, Vector4i &texture, int stage)
 	{
-		Color4i *arg1;
-		Color4i *arg2;
-		Color4i *arg3;
-		Color4i res;
+		Vector4i *arg1;
+		Vector4i *arg2;
+		Vector4i *arg3;
+		Vector4i res;
 
-		Color4i constant;
-		Color4i tfactor;
+		Vector4i constant;
+		Vector4i tfactor;
 
 		const TextureStage::State &textureStage = state.textureStage[stage];
 
@@ -674,10 +682,10 @@
 		   textureStage.thirdArgument == TextureStage::SOURCE_CONSTANT ||
 		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_CONSTANT)
 		{
-			constant.r = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[0]));
-			constant.g = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[1]));
-			constant.b = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[2]));
-			constant.a = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[3]));
+			constant.x = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[0]));
+			constant.y = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[1]));
+			constant.z = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[2]));
+			constant.w = *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].constantColor4[3]));
 		}
 
 		if(textureStage.firstArgument == TextureStage::SOURCE_TFACTOR ||
@@ -687,10 +695,10 @@
 		   textureStage.thirdArgument == TextureStage::SOURCE_TFACTOR ||
 		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_TFACTOR)
 		{
-			tfactor.r = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[0]));
-			tfactor.g = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[1]));
-			tfactor.b = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[2]));
-			tfactor.a = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]));
+			tfactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[0]));
+			tfactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[1]));
+			tfactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[2]));
+			tfactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]));
 		}
 
 		// Premodulate
@@ -698,22 +706,22 @@
 		{
 			if(state.textureStage[stage - 1].stageOperation == TextureStage::STAGE_PREMODULATE)
 			{
-				current.r = MulHigh(current.r, texture.r) << 4;
-				current.g = MulHigh(current.g, texture.g) << 4;
-				current.b = MulHigh(current.b, texture.b) << 4;
+				current.x = MulHigh(current.x, texture.x) << 4;
+				current.y = MulHigh(current.y, texture.y) << 4;
+				current.z = MulHigh(current.z, texture.z) << 4;
 			}
 
 			if(state.textureStage[stage - 1].stageOperationAlpha == TextureStage::STAGE_PREMODULATE)
 			{
-				current.a = MulHigh(current.a, texture.a) << 4;
+				current.w = MulHigh(current.w, texture.w) << 4;
 			}
 		}
 
 		if(luminance)
 		{
-			texture.r = MulHigh(texture.r, r.L) << 4;
-			texture.g = MulHigh(texture.g, r.L) << 4;
-			texture.b = MulHigh(texture.b, r.L) << 4;
+			texture.x = MulHigh(texture.x, r.L) << 4;
+			texture.y = MulHigh(texture.y, r.L) << 4;
+			texture.z = MulHigh(texture.z, r.L) << 4;
 
 			luminance = false;
 		}
@@ -757,9 +765,9 @@
 			ASSERT(false);
 		}
 
-		Color4i mod1;
-		Color4i mod2;
-		Color4i mod3;
+		Vector4i mod1;
+		Vector4i mod2;
+		Vector4i mod3;
 
 		switch(textureStage.firstModifier)
 		{
@@ -767,30 +775,30 @@
 			break;
 		case TextureStage::MODIFIER_INVCOLOR:
 			{
-				mod1.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->r);
-				mod1.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->g);
-				mod1.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->b);
-				mod1.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->a);
+				mod1.x = SubSat(Short4(0x1000), arg1->x);
+				mod1.y = SubSat(Short4(0x1000), arg1->y);
+				mod1.z = SubSat(Short4(0x1000), arg1->z);
+				mod1.w = SubSat(Short4(0x1000), arg1->w);
 
 				arg1 = &mod1;
 			}
 			break;
 		case TextureStage::MODIFIER_ALPHA:
 			{
-				mod1.r = arg1->a;
-				mod1.g = arg1->a;
-				mod1.b = arg1->a;
-				mod1.a = arg1->a;
+				mod1.x = arg1->w;
+				mod1.y = arg1->w;
+				mod1.z = arg1->w;
+				mod1.w = arg1->w;
 
 				arg1 = &mod1;
 			}
 			break;
 		case TextureStage::MODIFIER_INVALPHA:
 			{
-				mod1.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->a);
-				mod1.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->a);
-				mod1.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->a);
-				mod1.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->a);
+				mod1.x = SubSat(Short4(0x1000), arg1->w);
+				mod1.y = SubSat(Short4(0x1000), arg1->w);
+				mod1.z = SubSat(Short4(0x1000), arg1->w);
+				mod1.w = SubSat(Short4(0x1000), arg1->w);
 
 				arg1 = &mod1;
 			}
@@ -805,30 +813,30 @@
 			break;
 		case TextureStage::MODIFIER_INVCOLOR:
 			{
-				mod2.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->r);
-				mod2.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->g);
-				mod2.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->b);
-				mod2.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->a);
+				mod2.x = SubSat(Short4(0x1000), arg2->x);
+				mod2.y = SubSat(Short4(0x1000), arg2->y);
+				mod2.z = SubSat(Short4(0x1000), arg2->z);
+				mod2.w = SubSat(Short4(0x1000), arg2->w);
 
 				arg2 = &mod2;
 			}
 			break;
 		case TextureStage::MODIFIER_ALPHA:
 			{
-				mod2.r = arg2->a;
-				mod2.g = arg2->a;
-				mod2.b = arg2->a;
-				mod2.a = arg2->a;
+				mod2.x = arg2->w;
+				mod2.y = arg2->w;
+				mod2.z = arg2->w;
+				mod2.w = arg2->w;
 
 				arg2 = &mod2;
 			}
 			break;
 		case TextureStage::MODIFIER_INVALPHA:
 			{
-				mod2.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->a);
-				mod2.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->a);
-				mod2.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->a);
-				mod2.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->a);
+				mod2.x = SubSat(Short4(0x1000), arg2->w);
+				mod2.y = SubSat(Short4(0x1000), arg2->w);
+				mod2.z = SubSat(Short4(0x1000), arg2->w);
+				mod2.w = SubSat(Short4(0x1000), arg2->w);
 
 				arg2 = &mod2;
 			}
@@ -843,30 +851,30 @@
 			break;
 		case TextureStage::MODIFIER_INVCOLOR:
 			{
-				mod3.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->r);
-				mod3.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->g);
-				mod3.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->b);
-				mod3.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->a);
+				mod3.x = SubSat(Short4(0x1000), arg3->x);
+				mod3.y = SubSat(Short4(0x1000), arg3->y);
+				mod3.z = SubSat(Short4(0x1000), arg3->z);
+				mod3.w = SubSat(Short4(0x1000), arg3->w);
 
 				arg3 = &mod3;
 			}
 			break;
 		case TextureStage::MODIFIER_ALPHA:
 			{
-				mod3.r = arg3->a;
-				mod3.g = arg3->a;
-				mod3.b = arg3->a;
-				mod3.a = arg3->a;
+				mod3.x = arg3->w;
+				mod3.y = arg3->w;
+				mod3.z = arg3->w;
+				mod3.w = arg3->w;
 
 				arg3 = &mod3;
 			}
 			break;
 		case TextureStage::MODIFIER_INVALPHA:
 			{
-				mod3.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->a);
-				mod3.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->a);
-				mod3.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->a);
-				mod3.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->a);
+				mod3.x = SubSat(Short4(0x1000), arg3->w);
+				mod3.y = SubSat(Short4(0x1000), arg3->w);
+				mod3.z = SubSat(Short4(0x1000), arg3->w);
+				mod3.w = SubSat(Short4(0x1000), arg3->w);
 
 				arg3 = &mod3;
 			}
@@ -881,211 +889,211 @@
 			break;
 		case TextureStage::STAGE_SELECTARG1:					// Arg1
 			{
-				res.r = arg1->r;
-				res.g = arg1->g;
-				res.b = arg1->b;
+				res.x = arg1->x;
+				res.y = arg1->y;
+				res.z = arg1->z;
 			}
 			break;
 		case TextureStage::STAGE_SELECTARG2:					// Arg2
 			{
-				res.r = arg2->r;
-				res.g = arg2->g;
-				res.b = arg2->b;
+				res.x = arg2->x;
+				res.y = arg2->y;
+				res.z = arg2->z;
 			}
 			break;
 		case TextureStage::STAGE_SELECTARG3:					// Arg3
 			{
-				res.r = arg3->r;
-				res.g = arg3->g;
-				res.b = arg3->b;
+				res.x = arg3->x;
+				res.y = arg3->y;
+				res.z = arg3->z;
 			}
 			break;
 		case TextureStage::STAGE_MODULATE:					// Arg1 * Arg2
 			{
-				res.r = MulHigh(arg1->r, arg2->r) << 4;
-				res.g = MulHigh(arg1->g, arg2->g) << 4;
-				res.b = MulHigh(arg1->b, arg2->b) << 4;
+				res.x = MulHigh(arg1->x, arg2->x) << 4;
+				res.y = MulHigh(arg1->y, arg2->y) << 4;
+				res.z = MulHigh(arg1->z, arg2->z) << 4;
 			}
 			break;
 		case TextureStage::STAGE_MODULATE2X:					// Arg1 * Arg2 * 2
 			{
-				res.r = MulHigh(arg1->r, arg2->r) << 5;
-				res.g = MulHigh(arg1->g, arg2->g) << 5;
-				res.b = MulHigh(arg1->b, arg2->b) << 5;
+				res.x = MulHigh(arg1->x, arg2->x) << 5;
+				res.y = MulHigh(arg1->y, arg2->y) << 5;
+				res.z = MulHigh(arg1->z, arg2->z) << 5;
 			}
 			break;
 		case TextureStage::STAGE_MODULATE4X:					// Arg1 * Arg2 * 4
 			{
-				res.r = MulHigh(arg1->r, arg2->r) << 6;
-				res.g = MulHigh(arg1->g, arg2->g) << 6;
-				res.b = MulHigh(arg1->b, arg2->b) << 6;
+				res.x = MulHigh(arg1->x, arg2->x) << 6;
+				res.y = MulHigh(arg1->y, arg2->y) << 6;
+				res.z = MulHigh(arg1->z, arg2->z) << 6;
 			}
 			break;
 		case TextureStage::STAGE_ADD:						// Arg1 + Arg2
 			{
-				res.r = AddSat(arg1->r, arg2->r);
-				res.g = AddSat(arg1->g, arg2->g);
-				res.b = AddSat(arg1->b, arg2->b);
+				res.x = AddSat(arg1->x, arg2->x);
+				res.y = AddSat(arg1->y, arg2->y);
+				res.z = AddSat(arg1->z, arg2->z);
 			}
 			break;
 		case TextureStage::STAGE_ADDSIGNED:					// Arg1 + Arg2 - 0.5
 			{
-				res.r = AddSat(arg1->r, arg2->r);
-				res.g = AddSat(arg1->g, arg2->g);
-				res.b = AddSat(arg1->b, arg2->b);
+				res.x = AddSat(arg1->x, arg2->x);
+				res.y = AddSat(arg1->y, arg2->y);
+				res.z = AddSat(arg1->z, arg2->z);
 
-				res.r = SubSat(res.r, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-				res.g = SubSat(res.g, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-				res.b = SubSat(res.b, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+				res.x = SubSat(res.x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+				res.y = SubSat(res.y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+				res.z = SubSat(res.z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
 			}
 			break;
 		case TextureStage::STAGE_ADDSIGNED2X:				// (Arg1 + Arg2 - 0.5) << 1
 			{
-				res.r = AddSat(arg1->r, arg2->r);
-				res.g = AddSat(arg1->g, arg2->g);
-				res.b = AddSat(arg1->b, arg2->b);
+				res.x = AddSat(arg1->x, arg2->x);
+				res.y = AddSat(arg1->y, arg2->y);
+				res.z = AddSat(arg1->z, arg2->z);
 
-				res.r = SubSat(res.r, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-				res.g = SubSat(res.g, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-				res.b = SubSat(res.b, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+				res.x = SubSat(res.x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+				res.y = SubSat(res.y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+				res.z = SubSat(res.z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
 
-				res.r = AddSat(res.r, res.r);
-				res.g = AddSat(res.g, res.g);
-				res.b = AddSat(res.b, res.b);
+				res.x = AddSat(res.x, res.x);
+				res.y = AddSat(res.y, res.y);
+				res.z = AddSat(res.z, res.z);
 			}
 			break;
 		case TextureStage::STAGE_SUBTRACT:					// Arg1 - Arg2
 			{
-				res.r = SubSat(arg1->r, arg2->r);
-				res.g = SubSat(arg1->g, arg2->g);
-				res.b = SubSat(arg1->b, arg2->b);
+				res.x = SubSat(arg1->x, arg2->x);
+				res.y = SubSat(arg1->y, arg2->y);
+				res.z = SubSat(arg1->z, arg2->z);
 			}
 			break;
 		case TextureStage::STAGE_ADDSMOOTH:					// Arg1 + Arg2 - Arg1 * Arg2
 			{
 				Short4 tmp;
 
-				tmp = MulHigh(arg1->r, arg2->r) << 4; res.r = AddSat(arg1->r, arg2->r); res.r = SubSat(res.r, tmp);
-				tmp = MulHigh(arg1->g, arg2->g) << 4; res.g = AddSat(arg1->g, arg2->g); res.g = SubSat(res.g, tmp);
-				tmp = MulHigh(arg1->b, arg2->b) << 4; res.b = AddSat(arg1->b, arg2->b); res.b = SubSat(res.b, tmp);
+				tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(arg1->x, arg2->x); res.x = SubSat(res.x, tmp);
+				tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(arg1->y, arg2->y); res.y = SubSat(res.y, tmp);
+				tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(arg1->z, arg2->z); res.z = SubSat(res.z, tmp);
 			}
 			break;
 		case TextureStage::STAGE_MULTIPLYADD:				// Arg3 + Arg1 * Arg2
 			{
-				res.r = MulHigh(arg1->r, arg2->r) << 4; res.r = AddSat(res.r, arg3->r);
-				res.g = MulHigh(arg1->g, arg2->g) << 4; res.g = AddSat(res.g, arg3->g);
-				res.b = MulHigh(arg1->b, arg2->b) << 4; res.b = AddSat(res.b, arg3->b);
+				res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg3->x);
+				res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg3->y);
+				res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg3->z);
 			}
 			break;
 		case TextureStage::STAGE_LERP:						// Arg3 * (Arg1 - Arg2) + Arg2
 			{
-				res.r = SubSat(arg1->r, arg2->r); res.r = MulHigh(res.r, arg3->r) << 4; res.r = AddSat(res.r, arg2->r);
-				res.g = SubSat(arg1->g, arg2->g); res.g = MulHigh(res.g, arg3->g) << 4; res.g = AddSat(res.g, arg2->g);
-				res.b = SubSat(arg1->b, arg2->b); res.b = MulHigh(res.b, arg3->b) << 4; res.b = AddSat(res.b, arg2->b);
+				res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, arg3->x) << 4; res.x = AddSat(res.x, arg2->x);
+				res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, arg3->y) << 4; res.y = AddSat(res.y, arg2->y);
+				res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, arg3->z) << 4; res.z = AddSat(res.z, arg2->z);
 			}
 			break;
-		case TextureStage::STAGE_DOT3:						// 2 * (Arg1.r - 0.5) * 2 * (Arg2.r - 0.5) + 2 * (Arg1.g - 0.5) * 2 * (Arg2.g - 0.5) + 2 * (Arg1.b - 0.5) * 2 * (Arg2.b - 0.5)
+		case TextureStage::STAGE_DOT3:						// 2 * (Arg1.x - 0.5) * 2 * (Arg2.x - 0.5) + 2 * (Arg1.y - 0.5) * 2 * (Arg2.y - 0.5) + 2 * (Arg1.z - 0.5) * 2 * (Arg2.z - 0.5)
 			{
 				Short4 tmp;
 
-				res.r = SubSat(arg1->r, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->r, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.r = MulHigh(res.r, tmp);
-				res.g = SubSat(arg1->g, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->g, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.g = MulHigh(res.g, tmp);
-				res.b = SubSat(arg1->b, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->b, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.b = MulHigh(res.b, tmp);
+				res.x = SubSat(arg1->x, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->x, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.x = MulHigh(res.x, tmp);
+				res.y = SubSat(arg1->y, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->y, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.y = MulHigh(res.y, tmp);
+				res.z = SubSat(arg1->z, Short4(0x0800, 0x0800, 0x0800, 0x0800)); tmp = SubSat(arg2->z, Short4(0x0800, 0x0800, 0x0800, 0x0800)); res.z = MulHigh(res.z, tmp);
 
-				res.r = res.r << 6;
-				res.g = res.g << 6;
-				res.b = res.b << 6;
+				res.x = res.x << 6;
+				res.y = res.y << 6;
+				res.z = res.z << 6;
 
-				res.r = AddSat(res.r, res.g);
-				res.r = AddSat(res.r, res.b);
+				res.x = AddSat(res.x, res.y);
+				res.x = AddSat(res.x, res.z);
 
 				// Clamp to [0, 1]
-				res.r = Max(res.r, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-				res.r = Min(res.r, Short4(0x1000, 0x1000, 0x1000, 0x1000));
+				res.x = Max(res.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+				res.x = Min(res.x, Short4(0x1000));
 
-				res.g = res.r;
-				res.b = res.r;
-				res.a = res.r;
+				res.y = res.x;
+				res.z = res.x;
+				res.w = res.x;
 			}
 			break;
 		case TextureStage::STAGE_BLENDCURRENTALPHA:			// Alpha * (Arg1 - Arg2) + Arg2
 			{
-				res.r = SubSat(arg1->r, arg2->r); res.r = MulHigh(res.r, current.a) << 4; res.r = AddSat(res.r, arg2->r);
-				res.g = SubSat(arg1->g, arg2->g); res.g = MulHigh(res.g, current.a) << 4; res.g = AddSat(res.g, arg2->g);
-				res.b = SubSat(arg1->b, arg2->b); res.b = MulHigh(res.b, current.a) << 4; res.b = AddSat(res.b, arg2->b);
+				res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, current.w) << 4; res.x = AddSat(res.x, arg2->x);
+				res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, current.w) << 4; res.y = AddSat(res.y, arg2->y);
+				res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, current.w) << 4; res.z = AddSat(res.z, arg2->z);
 			}
 			break;
 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:			// Alpha * (Arg1 - Arg2) + Arg2
 			{
-				res.r = SubSat(arg1->r, arg2->r); res.r = MulHigh(res.r, r.diffuse.a) << 4; res.r = AddSat(res.r, arg2->r);
-				res.g = SubSat(arg1->g, arg2->g); res.g = MulHigh(res.g, r.diffuse.a) << 4; res.g = AddSat(res.g, arg2->g);
-				res.b = SubSat(arg1->b, arg2->b); res.b = MulHigh(res.b, r.diffuse.a) << 4; res.b = AddSat(res.b, arg2->b);
+				res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, r.diffuse.w) << 4; res.x = AddSat(res.x, arg2->x);
+				res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, r.diffuse.w) << 4; res.y = AddSat(res.y, arg2->y);
+				res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, r.diffuse.w) << 4; res.z = AddSat(res.z, arg2->z);
 			}
 			break;
 		case TextureStage::STAGE_BLENDFACTORALPHA:			// Alpha * (Arg1 - Arg2) + Arg2
 			{
-				res.r = SubSat(arg1->r, arg2->r); res.r = MulHigh(res.r, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.r = AddSat(res.r, arg2->r);
-				res.g = SubSat(arg1->g, arg2->g); res.g = MulHigh(res.g, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.g = AddSat(res.g, arg2->g);
-				res.b = SubSat(arg1->b, arg2->b); res.b = MulHigh(res.b, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.b = AddSat(res.b, arg2->b);
+				res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.x = AddSat(res.x, arg2->x);
+				res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.y = AddSat(res.y, arg2->y);
+				res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.z = AddSat(res.z, arg2->z);
 			}
 			break;
 		case TextureStage::STAGE_BLENDTEXTUREALPHA:			// Alpha * (Arg1 - Arg2) + Arg2
 			{
-				res.r = SubSat(arg1->r, arg2->r); res.r = MulHigh(res.r, texture.a) << 4; res.r = AddSat(res.r, arg2->r);
-				res.g = SubSat(arg1->g, arg2->g); res.g = MulHigh(res.g, texture.a) << 4; res.g = AddSat(res.g, arg2->g);
-				res.b = SubSat(arg1->b, arg2->b); res.b = MulHigh(res.b, texture.a) << 4; res.b = AddSat(res.b, arg2->b);
+				res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, texture.w) << 4; res.x = AddSat(res.x, arg2->x);
+				res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, texture.w) << 4; res.y = AddSat(res.y, arg2->y);
+				res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, texture.w) << 4; res.z = AddSat(res.z, arg2->z);
 			}
 			break;
 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:		// Arg1 + Arg2 * (1 - Alpha)
 			{
-				res.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), texture.a); res.r = MulHigh(res.r, arg2->r) << 4; res.r = AddSat(res.r, arg1->r);
-				res.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), texture.a); res.g = MulHigh(res.g, arg2->g) << 4; res.g = AddSat(res.g, arg1->g);
-				res.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), texture.a); res.b = MulHigh(res.b, arg2->b) << 4; res.b = AddSat(res.b, arg1->b);
+				res.x = SubSat(Short4(0x1000), texture.w); res.x = MulHigh(res.x, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
+				res.y = SubSat(Short4(0x1000), texture.w); res.y = MulHigh(res.y, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
+				res.z = SubSat(Short4(0x1000), texture.w); res.z = MulHigh(res.z, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
 			}
 			break;
 		case TextureStage::STAGE_PREMODULATE:
 			{
-				res.r = arg1->r;
-				res.g = arg1->g;
-				res.b = arg1->b;
+				res.x = arg1->x;
+				res.y = arg1->y;
+				res.z = arg1->z;
 			}
 			break;
-		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:		// Arg1 + Arg1.a * Arg2
+		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:		// Arg1 + Arg1.w * Arg2
 			{
-				res.r = MulHigh(arg1->a, arg2->r) << 4; res.r = AddSat(res.r, arg1->r);
-				res.g = MulHigh(arg1->a, arg2->g) << 4; res.g = AddSat(res.g, arg1->g);
-				res.b = MulHigh(arg1->a, arg2->b) << 4; res.b = AddSat(res.b, arg1->b);
+				res.x = MulHigh(arg1->w, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
+				res.y = MulHigh(arg1->w, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
+				res.z = MulHigh(arg1->w, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
 			}
 			break;
-		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:		// Arg1 * Arg2 + Arg1.a
+		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:		// Arg1 * Arg2 + Arg1.w
 			{
-				res.r = MulHigh(arg1->r, arg2->r) << 4; res.r = AddSat(res.r, arg1->a);
-				res.g = MulHigh(arg1->g, arg2->g) << 4; res.g = AddSat(res.g, arg1->a);
-				res.b = MulHigh(arg1->b, arg2->b) << 4; res.b = AddSat(res.b, arg1->a);
+				res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg1->w);
+				res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg1->w);
+				res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg1->w);
 			}
 			break;
-		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:	// (1 - Arg1.a) * Arg2 + Arg1
+		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:	// (1 - Arg1.w) * Arg2 + Arg1
 			{
 				Short4 tmp;
 
-				res.r = AddSat(arg1->r, arg2->r); tmp = MulHigh(arg1->a, arg2->r) << 4; res.r = SubSat(res.r, tmp);
-				res.g = AddSat(arg1->g, arg2->g); tmp = MulHigh(arg1->a, arg2->g) << 4; res.g = SubSat(res.g, tmp);
-				res.b = AddSat(arg1->b, arg2->b); tmp = MulHigh(arg1->a, arg2->b) << 4; res.b = SubSat(res.b, tmp);
+				res.x = AddSat(arg1->x, arg2->x); tmp = MulHigh(arg1->w, arg2->x) << 4; res.x = SubSat(res.x, tmp);
+				res.y = AddSat(arg1->y, arg2->y); tmp = MulHigh(arg1->w, arg2->y) << 4; res.y = SubSat(res.y, tmp);
+				res.z = AddSat(arg1->z, arg2->z); tmp = MulHigh(arg1->w, arg2->z) << 4; res.z = SubSat(res.z, tmp);
 			}
 			break;
-		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:	// (1 - Arg1) * Arg2 + Arg1.a
+		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:	// (1 - Arg1) * Arg2 + Arg1.w
 			{
 				Short4 tmp;
 
-				res.r = AddSat(arg1->a, arg2->r); tmp = MulHigh(arg1->r, arg2->r) << 4; res.r = SubSat(res.r, tmp);
-				res.g = AddSat(arg1->a, arg2->g); tmp = MulHigh(arg1->g, arg2->g) << 4; res.g = SubSat(res.g, tmp);
-				res.b = AddSat(arg1->a, arg2->b); tmp = MulHigh(arg1->b, arg2->b) << 4; res.b = SubSat(res.b, tmp);
+				res.x = AddSat(arg1->w, arg2->x); tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = SubSat(res.x, tmp);
+				res.y = AddSat(arg1->w, arg2->y); tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = SubSat(res.y, tmp);
+				res.z = AddSat(arg1->w, arg2->z); tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = SubSat(res.z, tmp);
 			}
 			break;
 		case TextureStage::STAGE_BUMPENVMAP:
 			{
-				r.du = Float4(texture.r) * Float4(1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0);
-				r.dv = Float4(texture.g) * Float4(1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0);
+				r.du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
+				r.dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
 			
 				Float4 du2;
 				Float4 dv2;
@@ -1101,16 +1109,16 @@
 
 				perturbate = true;
 
-				res.r = r.current.r;
-				res.g = r.current.g;
-				res.b = r.current.b;
-				res.a = r.current.a;
+				res.x = r.current.x;
+				res.y = r.current.y;
+				res.z = r.current.z;
+				res.w = r.current.w;
 			}
 			break;
 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
 			{
-				r.du = Float4(texture.r) * Float4(1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0);
-				r.dv = Float4(texture.g) * Float4(1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0, 1.0f / 0x0FE0);
+				r.du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
+				r.dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
 			
 				Float4 du2;
 				Float4 dv2;
@@ -1127,19 +1135,19 @@
 
 				perturbate = true;
 
-				r.L = texture.b;
+				r.L = texture.z;
 				r.L = MulHigh(r.L, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].luminanceScale4)));
 				r.L = r.L << 4;
 				r.L = AddSat(r.L, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].luminanceOffset4)));
 				r.L = Max(r.L, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-				r.L = Min(r.L, Short4(0x1000, 0x1000, 0x1000, 0x1000));
+				r.L = Min(r.L, Short4(0x1000));
 
 				luminance = true;
 
-				res.r = r.current.r;
-				res.g = r.current.g;
-				res.b = r.current.b;
-				res.a = r.current.a;
+				res.x = r.current.x;
+				res.y = r.current.y;
+				res.z = r.current.z;
+				res.w = r.current.w;
 			}
 			break;
 		default:
@@ -1193,7 +1201,7 @@
 				break;
 			case TextureStage::MODIFIER_INVCOLOR:
 				{
-					mod1.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->a);
+					mod1.w = SubSat(Short4(0x1000), arg1->w);
 
 					arg1 = &mod1;
 				}
@@ -1205,7 +1213,7 @@
 				break;
 			case TextureStage::MODIFIER_INVALPHA:
 				{
-					mod1.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg1->a);
+					mod1.w = SubSat(Short4(0x1000), arg1->w);
 
 					arg1 = &mod1;
 				}
@@ -1220,7 +1228,7 @@
 				break;
 			case TextureStage::MODIFIER_INVCOLOR:
 				{
-					mod2.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->a);
+					mod2.w = SubSat(Short4(0x1000), arg2->w);
 
 					arg2 = &mod2;
 				}
@@ -1232,7 +1240,7 @@
 				break;
 			case TextureStage::MODIFIER_INVALPHA:
 				{
-					mod2.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg2->a);
+					mod2.w = SubSat(Short4(0x1000), arg2->w);
 
 					arg2 = &mod2;
 				}
@@ -1247,7 +1255,7 @@
 				break;
 			case TextureStage::MODIFIER_INVCOLOR:
 				{
-					mod3.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->a);
+					mod3.w = SubSat(Short4(0x1000), arg3->w);
 
 					arg3 = &mod3;
 				}
@@ -1259,7 +1267,7 @@
 				break;
 			case TextureStage::MODIFIER_INVALPHA:
 				{
-					mod3.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), arg3->a);
+					mod3.w = SubSat(Short4(0x1000), arg3->w);
 
 					arg3 = &mod3;
 				}
@@ -1274,104 +1282,104 @@
 				break;
 			case TextureStage::STAGE_SELECTARG1:					// Arg1
 				{
-					res.a = arg1->a;
+					res.w = arg1->w;
 				}
 				break;
 			case TextureStage::STAGE_SELECTARG2:					// Arg2
 				{
-					res.a = arg2->a;
+					res.w = arg2->w;
 				}
 				break;
 			case TextureStage::STAGE_SELECTARG3:					// Arg3
 				{
-					res.a = arg3->a;
+					res.w = arg3->w;
 				}
 				break;
 			case TextureStage::STAGE_MODULATE:					// Arg1 * Arg2
 				{
-					res.a = MulHigh(arg1->a, arg2->a) << 4;
+					res.w = MulHigh(arg1->w, arg2->w) << 4;
 				}
 				break;
 			case TextureStage::STAGE_MODULATE2X:					// Arg1 * Arg2 * 2
 				{
-					res.a = MulHigh(arg1->a, arg2->a) << 5;
+					res.w = MulHigh(arg1->w, arg2->w) << 5;
 				}
 				break;
 			case TextureStage::STAGE_MODULATE4X:					// Arg1 * Arg2 * 4
 				{
-					res.a = MulHigh(arg1->a, arg2->a) << 6;
+					res.w = MulHigh(arg1->w, arg2->w) << 6;
 				}
 				break;
 			case TextureStage::STAGE_ADD:						// Arg1 + Arg2
 				{
-					res.a = AddSat(arg1->a, arg2->a);
+					res.w = AddSat(arg1->w, arg2->w);
 				}
 				break;
 			case TextureStage::STAGE_ADDSIGNED:					// Arg1 + Arg2 - 0.5
 				{
-					res.a = AddSat(arg1->a, arg2->a);
-					res.a = SubSat(res.a, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+					res.w = AddSat(arg1->w, arg2->w);
+					res.w = SubSat(res.w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
 				}
 				break;
 			case TextureStage::STAGE_ADDSIGNED2X:					// (Arg1 + Arg2 - 0.5) << 1
 				{
-					res.a = AddSat(arg1->a, arg2->a);
-					res.a = SubSat(res.a, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-					res.a = AddSat(res.a, res.a);
+					res.w = AddSat(arg1->w, arg2->w);
+					res.w = SubSat(res.w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+					res.w = AddSat(res.w, res.w);
 				}
 				break;
 			case TextureStage::STAGE_SUBTRACT:					// Arg1 - Arg2
 				{
-					res.a = SubSat(arg1->a, arg2->a);
+					res.w = SubSat(arg1->w, arg2->w);
 				}
 				break;
 			case TextureStage::STAGE_ADDSMOOTH:					// Arg1 + Arg2 - Arg1 * Arg2
 				{
 					Short4 tmp;
 
-					tmp = MulHigh(arg1->a, arg2->a) << 4; res.a = AddSat(arg1->a, arg2->a); res.a = SubSat(res.a, tmp);
+					tmp = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(arg1->w, arg2->w); res.w = SubSat(res.w, tmp);
 				}
 				break;
 			case TextureStage::STAGE_MULTIPLYADD:				// Arg3 + Arg1 * Arg2
 				{
-					res.a = MulHigh(arg1->a, arg2->a) << 4; res.a = AddSat(res.a, arg3->a);
+					res.w = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(res.w, arg3->w);
 				}
 				break;
 			case TextureStage::STAGE_LERP:						// Arg3 * (Arg1 - Arg2) + Arg2
 				{
-					res.a = SubSat(arg1->a, arg2->a); res.a = MulHigh(res.a, arg3->a) << 4; res.a = AddSat(res.a, arg2->a);
+					res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, arg3->w) << 4; res.w = AddSat(res.w, arg2->w);
 				}
 				break;
 			case TextureStage::STAGE_DOT3:
 				break;   // Already computed in color channel
 			case TextureStage::STAGE_BLENDCURRENTALPHA:			// Alpha * (Arg1 - Arg2) + Arg2
 				{
-					res.a = SubSat(arg1->a, arg2->a); res.a = MulHigh(res.a, current.a) << 4; res.a = AddSat(res.a, arg2->a);
+					res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, current.w) << 4; res.w = AddSat(res.w, arg2->w);
 				}
 				break;
 			case TextureStage::STAGE_BLENDDIFFUSEALPHA:			// Arg1 * (Alpha) + Arg2 * (1 - Alpha)
 				{
-					res.a = SubSat(arg1->a, arg2->a); res.a = MulHigh(res.a, r.diffuse.a) << 4; res.a = AddSat(res.a, arg2->a);
+					res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, r.diffuse.w) << 4; res.w = AddSat(res.w, arg2->w);
 				}
 				break;
 			case TextureStage::STAGE_BLENDFACTORALPHA:
 				{
-					res.a = SubSat(arg1->a, arg2->a); res.a = MulHigh(res.a, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.a = AddSat(res.a, arg2->a);
+					res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, *Pointer<Short4>(r.data + OFFSET(DrawData,factor.textureFactor4[3]))) << 4; res.w = AddSat(res.w, arg2->w);
 				}
 				break;
 			case TextureStage::STAGE_BLENDTEXTUREALPHA:			// Arg1 * (Alpha) + Arg2 * (1 - Alpha)
 				{
-					res.a = SubSat(arg1->a, arg2->a); res.a = MulHigh(res.a, texture.a) << 4; res.a = AddSat(res.a, arg2->a);
+					res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, texture.w) << 4; res.w = AddSat(res.w, arg2->w);
 				}
 				break;
 			case TextureStage::STAGE_BLENDTEXTUREALPHAPM:		// Arg1 + Arg2 * (1 - Alpha)
 				{
-					res.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), texture.a); res.a = MulHigh(res.a, arg2->a) << 4; res.a = AddSat(res.a, arg1->a);
+					res.w = SubSat(Short4(0x1000), texture.w); res.w = MulHigh(res.w, arg2->w) << 4; res.w = AddSat(res.w, arg1->w);
 				}
 				break;
 			case TextureStage::STAGE_PREMODULATE:
 				{
-					res.a = arg1->a;
+					res.w = arg1->w;
 				}
 				break;
 			case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
@@ -1421,9 +1429,9 @@
 		case TextureStage::STAGE_ADDSIGNED2X:
 		case TextureStage::STAGE_SUBTRACT:
 		case TextureStage::STAGE_ADDSMOOTH:
-			res.r = Max(res.r, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-			res.g = Max(res.g, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-			res.b = Max(res.b, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+			res.x = Max(res.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+			res.y = Max(res.y, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+			res.z = Max(res.z, Short4(0x0000, 0x0000, 0x0000, 0x0000));
 			break;
 		default:
 			ASSERT(false);
@@ -1462,7 +1470,7 @@
 		case TextureStage::STAGE_ADDSIGNED2X:
 		case TextureStage::STAGE_SUBTRACT:
 		case TextureStage::STAGE_ADDSMOOTH:
-			res.a = Max(res.a, Short4(0x0000, 0x0000, 0x0000, 0x0000));
+			res.w = Max(res.w, Short4(0x0000, 0x0000, 0x0000, 0x0000));
 			break;
 		default:
 			ASSERT(false);
@@ -1498,9 +1506,9 @@
 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
-			res.r = Min(res.r, Short4(0x1000, 0x1000, 0x1000, 0x1000));
-			res.g = Min(res.g, Short4(0x1000, 0x1000, 0x1000, 0x1000));
-			res.b = Min(res.b, Short4(0x1000, 0x1000, 0x1000, 0x1000));
+			res.x = Min(res.x, Short4(0x1000));
+			res.y = Min(res.y, Short4(0x1000));
+			res.z = Min(res.z, Short4(0x1000));
 			break;
 		default:
 			ASSERT(false);
@@ -1536,7 +1544,7 @@
 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
-			res.a = Min(res.a, Short4(0x1000, 0x1000, 0x1000, 0x1000));
+			res.w = Min(res.w, Short4(0x1000));
 			break;
 		default:
 			ASSERT(false);
@@ -1545,16 +1553,16 @@
 		switch(textureStage.destinationArgument)
 		{
 		case TextureStage::DESTINATION_CURRENT:
-			current.r = res.r;
-			current.g = res.g;
-			current.b = res.b;
-			current.a = res.a;
+			current.x = res.x;
+			current.y = res.y;
+			current.z = res.z;
+			current.w = res.w;
 			break;
 		case TextureStage::DESTINATION_TEMP:
-			temp.r = res.r;
-			temp.g = res.g;
-			temp.b = res.b;
-			temp.a = res.a;
+			temp.x = res.x;
+			temp.y = res.y;
+			temp.z = res.z;
+			temp.w = res.w;
 			break;
 		default:
 			ASSERT(false);
@@ -1623,7 +1631,7 @@
 		cMask[3] &= aMask3;
 	}
 
-	Bool PixelRoutine::alphaTest(Registers &r, Int cMask[4], Color4i &current)
+	Bool PixelRoutine::alphaTest(Registers &r, Int cMask[4], Vector4i &current)
 	{
 		if(!state.alphaTestActive())
 		{
@@ -1634,7 +1642,7 @@
 
 		if(state.transparencyAntialiasing == Context::TRANSPARENCY_NONE)
 		{
-			alphaTest(r, aMask, current.a);
+			alphaTest(r, aMask, current.w);
 
 			for(unsigned int q = 0; q < state.multiSample; q++)
 			{
@@ -1643,7 +1651,7 @@
 		}
 		else if(state.transparencyAntialiasing == Context::TRANSPARENCY_ALPHA_TO_COVERAGE)
 		{
-			Float4 alpha = Float4(current.a) * Float4(1.0f / 0x1000);
+			Float4 alpha = Float4(current.w) * Float4(1.0f / 0x1000);
 
 			alphaToCoverage(r, cMask, alpha);
 		}
@@ -1659,7 +1667,7 @@
 		return pass != 0x0;
 	}
 
-	Bool PixelRoutine::alphaTest(Registers &r, Int cMask[4], Color4f &c0)
+	Bool PixelRoutine::alphaTest(Registers &r, Int cMask[4], Vector4f &c0)
 	{
 		if(!state.alphaTestActive())
 		{
@@ -1670,7 +1678,7 @@
 
 		if(state.transparencyAntialiasing == Context::TRANSPARENCY_NONE)
 		{
-			Short4 alpha = RoundShort4(c0.a * Float4(0x1000, 0x1000, 0x1000, 0x1000));
+			Short4 alpha = RoundShort4(c0.w * Float4(0x1000));
 
 			alphaTest(r, aMask, alpha);
 
@@ -1681,7 +1689,7 @@
 		}
 		else if(state.transparencyAntialiasing == Context::TRANSPARENCY_ALPHA_TO_COVERAGE)
 		{
-			alphaToCoverage(r, cMask, c0.a);
+			alphaToCoverage(r, cMask, c0.w);
 		}
 		else ASSERT(false);
 
@@ -1695,7 +1703,7 @@
 		return pass != 0x0;
 	}
 
-	void PixelRoutine::fogBlend(Registers &r, Color4i &current, Float4 &f, Float4 &z, Float4 &rhw)
+	void PixelRoutine::fogBlend(Registers &r, Vector4i &current, Float4 &f, Float4 &z, Float4 &rhw)
 	{
 		if(!state.fogActive)
 		{
@@ -1709,18 +1717,18 @@
 		
 		UShort4 fog = convertFixed16(f, true);
 
-		current.r = As<Short4>(MulHigh(As<UShort4>(current.r), fog));
-		current.g = As<Short4>(MulHigh(As<UShort4>(current.g), fog));
-		current.b = As<Short4>(MulHigh(As<UShort4>(current.b), fog));
+		current.x = As<Short4>(MulHigh(As<UShort4>(current.x), fog));
+		current.y = As<Short4>(MulHigh(As<UShort4>(current.y), fog));
+		current.z = As<Short4>(MulHigh(As<UShort4>(current.z), fog));
 
-		UShort4 invFog = UShort4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - fog;
+		UShort4 invFog = UShort4(0xFFFFu) - fog;
 
-		current.r += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(r.data + OFFSET(DrawData,fog.color4[0]))));
-		current.g += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(r.data + OFFSET(DrawData,fog.color4[1]))));
-		current.b += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(r.data + OFFSET(DrawData,fog.color4[2]))));
+		current.x += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(r.data + OFFSET(DrawData,fog.color4[0]))));
+		current.y += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(r.data + OFFSET(DrawData,fog.color4[1]))));
+		current.z += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(r.data + OFFSET(DrawData,fog.color4[2]))));
 	}
 
-	void PixelRoutine::fogBlend(Registers &r, Color4f &c0, Float4 &fog, Float4 &z, Float4 &rhw)
+	void PixelRoutine::fogBlend(Registers &r, Vector4f &c0, Float4 &fog, Float4 &z, Float4 &rhw)
 	{
 		if(!state.fogActive)
 		{
@@ -1731,21 +1739,21 @@
 		{
 			pixelFog(r, fog, z, rhw);
 
-			fog = Min(fog, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-			fog = Max(fog, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+			fog = Min(fog, Float4(1.0f));
+			fog = Max(fog, Float4(0.0f));
 		}
 
-		c0.r -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
-		c0.g -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
-		c0.b -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
+		c0.x -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
+		c0.y -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
+		c0.z -= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
 
-		c0.r *= fog;
-		c0.g *= fog;
-		c0.b *= fog;
+		c0.x *= fog;
+		c0.y *= fog;
+		c0.z *= fog;
 
-		c0.r += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
-		c0.g += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
-		c0.b += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
+		c0.x += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[0]));
+		c0.y += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[1]));
+		c0.z += *Pointer<Float4>(r.data + OFFSET(DrawData,fog.colorF[2]));
 	}
 
 	void PixelRoutine::pixelFog(Registers &r, Float4 &visibility, Float4 &z, Float4 &rhw)
@@ -1762,7 +1770,7 @@
 			{
 				if(complementaryDepthBuffer)
 				{
-					zw = Float4(1.0f, 1.0f, 1.0f, 1.0f) - z;
+					zw = Float4(1.0f) - z;
 				}
 				else
 				{
@@ -1781,12 +1789,12 @@
 			break;
 		case Context::FOG_EXP:
 			zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE));
-			zw = exponential(zw, true);
+			zw = exponential2(zw, true);
 			break;
 		case Context::FOG_EXP2:
 			zw *= *Pointer<Float4>(r.data + OFFSET(DrawData,fog.densityE2));
 			zw *= zw;
-			zw = exponential(zw, true);
+			zw = exponential2(zw, true);
 			zw = Rcp_pp(zw);
 			break;
 		default:
@@ -1794,16 +1802,16 @@
 		}
 	}
 
-	void PixelRoutine::specularPixel(Color4i &current, Color4i &specular)
+	void PixelRoutine::specularPixel(Vector4i &current, Vector4i &specular)   // Adds specular.x/y/z into current.x/y/z via AddSat when state.specularAdd is set; returns early otherwise
 	{
 		if(!state.specularAdd)
 		{
 			return;
 		}
 
-		current.r = AddSat(current.r, specular.r);
-		current.g = AddSat(current.g, specular.g);
-		current.b = AddSat(current.b, specular.b);
+		current.x = AddSat(current.x, specular.x);   // x takes the place of the former .r member
+		current.y = AddSat(current.y, specular.y);   // y takes the place of the former .g member
+		current.z = AddSat(current.z, specular.z);   // z takes the place of the former .b member; w is not modified by this function
 	}
 
 	void PixelRoutine::writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
@@ -1815,11 +1823,11 @@
 
 		Float4 Z = z;
 
-		if(pixelShader && pixelShader->depthOverride())
+		if(shader && shader->depthOverride())
 		{
 			if(complementaryDepthBuffer)
 			{
-				Z = Float4(1, 1, 1, 1) - r.oDepth;
+				Z = Float4(1.0f) - r.oDepth;
 			}
 			else
 			{
@@ -2009,12 +2017,12 @@
 		}
 	}
 
-	void PixelRoutine::sampleTexture(Registers &r, Color4i &c, int coordinates, int stage, bool project)
+	void PixelRoutine::sampleTexture(Registers &r, Vector4i &c, int coordinates, int stage, bool project)
 	{
-		Float4 u = r.vx[2 + coordinates];
-		Float4 v = r.vy[2 + coordinates];
-		Float4 w = r.vz[2 + coordinates];
-		Float4 q = r.vw[2 + coordinates];
+		Float4 u = r.vf[2 + coordinates].x;
+		Float4 v = r.vf[2 + coordinates].y;
+		Float4 w = r.vf[2 + coordinates].z;
+		Float4 q = r.vf[2 + coordinates].w;
 
 		if(perturbate)
 		{
@@ -2027,15 +2035,15 @@
 		sampleTexture(r, c, stage, u, v, w, q, project);
 	}
 
-	void PixelRoutine::sampleTexture(Registers &r, Color4i &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project, bool bias, bool fixed12)
+	void PixelRoutine::sampleTexture(Registers &r, Vector4i &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project, bool bias, bool fixed12)   // Convenience overload: forwards to the overload taking dsx/dsy, passing false for its 'gradients' argument
 	{
-		Color4f dsx;
-		Color4f dsy;
+		Vector4f dsx;   // Not assigned in this function; only passed through to satisfy the forwarded-to signature
+		Vector4f dsy;   // Same as dsx: passed through without being assigned here
 
 		sampleTexture(r, c, stage, u, v, w, q, dsx, dsy, project, bias, fixed12, false);
 	}
 
-	void PixelRoutine::sampleTexture(Registers &r, Color4i &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool project, bool bias, bool fixed12, bool gradients, bool lodProvided)
+	void PixelRoutine::sampleTexture(Registers &r, Vector4i &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project, bool bias, bool fixed12, bool gradients, bool lodProvided)
 	{
 		#if PERF_PROFILE
 			Long texTime = Ticks();
@@ -2063,7 +2071,31 @@
 		#endif
 	}
 
-	void PixelRoutine::sampleTexture(Registers &r, Color4f &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool project, bool bias, bool gradients, bool lodProvided)
+	void PixelRoutine::sampleTexture(Registers &r, Vector4f &c, const Src &sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project, bool bias, bool gradients, bool lodProvided)   // Picks the sampler stage from a shader source parameter, then forwards to the int-stage overload with all other arguments unchanged
+	{
+		if(sampler.type == Shader::PARAMETER_SAMPLER && sampler.rel.type == Shader::PARAMETER_VOID)   // Sampler parameter whose rel.type is VOID (presumably: no relative addressing - confirm): sampler.index is used directly as the stage
+		{	
+			sampleTexture(r, c, sampler.index, u, v, w, q, dsx, dsy, project, bias, gradients, lodProvided);	
+		}
+		else   // Any other parameter kind: the stage number is read from a register value
+		{
+			Int index = As<Int>(Float(reg(r, sampler).x.x));   // Takes .x.x of the register returned by reg(); NOTE(review): As<Int> presumably reinterprets the bits rather than converting the value - confirm
+
+			for(int i = 0; i < 16; i++)   // Candidate stages 0..15; NOTE(review): the bound 16 is hard-coded here - confirm it matches the supported sampler count
+			{
+				if(shader->usesSampler(i))   // Plain C++ test: stages the shader reports as unused get no sampling code at all
+				{
+					If(index == i)   // NOTE(review): capitalized If (vs. the C++ if above) looks like a conditional on the value of 'index' - confirm
+					{
+						sampleTexture(r, c, i, u, v, w, q, dsx, dsy, project, bias, gradients, lodProvided);   // If index equals no used stage, no call is made and c is not written by this function
+						// FIXME: When the sampler states are the same, we could use one sampler and just index the texture
+					}
+				}
+			}
+		}
+	}
+
+	void PixelRoutine::sampleTexture(Registers &r, Vector4f &c, int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project, bool bias, bool gradients, bool lodProvided)
 	{
 		#if PERF_PROFILE
 			Long texTime = Ticks();
@@ -2091,7 +2123,7 @@
 		#endif
 	}
 
-	void PixelRoutine::clampColor(Color4f oC[4])
+	void PixelRoutine::clampColor(Vector4f oC[4])
 	{
 		for(int index = 0; index < 4; index++)
 		{
@@ -2108,10 +2140,10 @@
 			case FORMAT_A8R8G8B8:
 			case FORMAT_X8R8G8B8:
 			case FORMAT_G16R16:
-				oC[index].r = Max(oC[index].r, Float4(0.0f, 0.0f, 0.0f, 0.0f)); oC[index].r = Min(oC[index].r, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-				oC[index].g = Max(oC[index].g, Float4(0.0f, 0.0f, 0.0f, 0.0f)); oC[index].g = Min(oC[index].g, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-				oC[index].b = Max(oC[index].b, Float4(0.0f, 0.0f, 0.0f, 0.0f)); oC[index].b = Min(oC[index].b, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-				oC[index].a = Max(oC[index].a, Float4(0.0f, 0.0f, 0.0f, 0.0f)); oC[index].a = Min(oC[index].a, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+				oC[index].x = Max(oC[index].x, Float4(0.0f)); oC[index].x = Min(oC[index].x, Float4(1.0f));
+				oC[index].y = Max(oC[index].y, Float4(0.0f)); oC[index].y = Min(oC[index].y, Float4(1.0f));
+				oC[index].z = Max(oC[index].z, Float4(0.0f)); oC[index].z = Min(oC[index].z, Float4(1.0f));
+				oC[index].w = Max(oC[index].w, Float4(0.0f)); oC[index].w = Min(oC[index].w, Float4(1.0f));
 				break;
 			case FORMAT_R32F:
 			case FORMAT_G32R32F:
@@ -2123,14 +2155,14 @@
 		}
 	}
 
-	void PixelRoutine::rasterOperation(Color4i &current, Registers &r, Float4 &fog, Pointer<Byte> &cBuffer, Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
+	void PixelRoutine::rasterOperation(Vector4i &current, Registers &r, Float4 &fog, Pointer<Byte> &cBuffer, Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
 	{
 		if(!state.colorWriteActive(0))
 		{
 			return;
 		}
 
-		Color4f oC;
+		Vector4f oC;
 
 		switch(state.targetFormat[0])
 		{
@@ -2144,10 +2176,10 @@
 			}
 			else
 			{
-				current.r <<= 4;
-				current.g <<= 4;
-				current.b <<= 4;
-				current.a <<= 4;
+				current.x <<= 4;
+				current.y <<= 4;
+				current.z <<= 4;
+				current.w <<= 4;
 			}
 
 			fogBlend(r, current, fog, r.z[0], r.rhw);
@@ -2155,7 +2187,7 @@
 			for(unsigned int q = 0; q < state.multiSample; q++)
 			{
 				Pointer<Byte> buffer = cBuffer + q * *Pointer<Int>(r.data + OFFSET(DrawData,colorSliceB[0]));
-				Color4i color = current;
+				Vector4i color = current;
 
 				if(state.multiSampleMask & (1 << q))
 				{
@@ -2173,7 +2205,7 @@
 			for(unsigned int q = 0; q < state.multiSample; q++)
 			{
 				Pointer<Byte> buffer = cBuffer + q * *Pointer<Int>(r.data + OFFSET(DrawData,colorSliceB[0]));
-				Color4f color = oC;
+				Vector4f color = oC;
 
 				if(state.multiSampleMask & (1 << q))
 				{
@@ -2187,7 +2219,7 @@
 		}
 	}
 
-	void PixelRoutine::rasterOperation(Color4f oC[4], Registers &r, Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
+	void PixelRoutine::rasterOperation(Vector4f oC[4], Registers &r, Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
 	{
 		for(int index = 0; index < 4; index++)
 		{
@@ -2198,9 +2230,9 @@
 
 			if(!postBlendSRGB && state.writeSRGB)
 			{
-				oC[index].r = linearToSRGB(oC[index].r);
-				oC[index].g = linearToSRGB(oC[index].g);
-				oC[index].b = linearToSRGB(oC[index].b);
+				oC[index].x = linearToSRGB(oC[index].x);
+				oC[index].y = linearToSRGB(oC[index].y);
+				oC[index].z = linearToSRGB(oC[index].z);
 			}
 
 			if(index == 0)
@@ -2217,12 +2249,12 @@
 				for(unsigned int q = 0; q < state.multiSample; q++)
 				{
 					Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(r.data + OFFSET(DrawData,colorSliceB[index]));
-					Color4i color;
+					Vector4i color;
 
-					color.r = convertFixed16(oC[index].r, false);
-					color.g = convertFixed16(oC[index].g, false);
-					color.b = convertFixed16(oC[index].b, false);
-					color.a = convertFixed16(oC[index].a, false);
+					color.x = convertFixed16(oC[index].x, false);
+					color.y = convertFixed16(oC[index].y, false);
+					color.z = convertFixed16(oC[index].z, false);
+					color.w = convertFixed16(oC[index].w, false);
 
 					if(state.multiSampleMask & (1 << q))
 					{
@@ -2237,7 +2269,7 @@
 				for(unsigned int q = 0; q < state.multiSample; q++)
 				{
 					Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(r.data + OFFSET(DrawData,colorSliceB[index]));
-					Color4f color = oC[index];
+					Vector4f color = oC[index];
 
 					if(state.multiSampleMask & (1 << q))
 					{
@@ -2252,7 +2284,7 @@
 		}
 	}
 
-	void PixelRoutine::blendFactor(Registers &r, const Color4i &blendFactor, const Color4i &current, const Color4i &pixel, Context::BlendFactor blendFactorActive)
+	void PixelRoutine::blendFactor(Registers &r, const Vector4i &blendFactor, const Vector4i &current, const Vector4i &pixel, Context::BlendFactor blendFactorActive)
 	{
 		switch(blendFactorActive)
 		{
@@ -2263,77 +2295,77 @@
 			// Optimized
 			break;
 		case Context::BLEND_SOURCE:
-			blendFactor.r = current.r;
-			blendFactor.g = current.g;
-			blendFactor.b = current.b;
+			blendFactor.x = current.x;
+			blendFactor.y = current.y;
+			blendFactor.z = current.z;
 			break;
 		case Context::BLEND_INVSOURCE:
-			blendFactor.r = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.r;
-			blendFactor.g = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.g;
-			blendFactor.b = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.b;
+			blendFactor.x = Short4(0xFFFFu) - current.x;
+			blendFactor.y = Short4(0xFFFFu) - current.y;
+			blendFactor.z = Short4(0xFFFFu) - current.z;
 			break;
 		case Context::BLEND_DEST:
-			blendFactor.r = pixel.r;
-			blendFactor.g = pixel.g;
-			blendFactor.b = pixel.b;
+			blendFactor.x = pixel.x;
+			blendFactor.y = pixel.y;
+			blendFactor.z = pixel.z;
 			break;
 		case Context::BLEND_INVDEST:
-			blendFactor.r = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.r;
-			blendFactor.g = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.g;
-			blendFactor.b = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.b;
+			blendFactor.x = Short4(0xFFFFu) - pixel.x;
+			blendFactor.y = Short4(0xFFFFu) - pixel.y;
+			blendFactor.z = Short4(0xFFFFu) - pixel.z;
 			break;
 		case Context::BLEND_SOURCEALPHA:
-			blendFactor.r = current.a;
-			blendFactor.g = current.a;
-			blendFactor.b = current.a;
+			blendFactor.x = current.w;
+			blendFactor.y = current.w;
+			blendFactor.z = current.w;
 			break;
 		case Context::BLEND_INVSOURCEALPHA:
-			blendFactor.r = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.a;
-			blendFactor.g = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.a;
-			blendFactor.b = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.a;
+			blendFactor.x = Short4(0xFFFFu) - current.w;
+			blendFactor.y = Short4(0xFFFFu) - current.w;
+			blendFactor.z = Short4(0xFFFFu) - current.w;
 			break;
 		case Context::BLEND_DESTALPHA:
-			blendFactor.r = pixel.a;
-			blendFactor.g = pixel.a;
-			blendFactor.b = pixel.a;
+			blendFactor.x = pixel.w;
+			blendFactor.y = pixel.w;
+			blendFactor.z = pixel.w;
 			break;
 		case Context::BLEND_INVDESTALPHA:
-			blendFactor.r = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.a;
-			blendFactor.g = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.a;
-			blendFactor.b = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.a;
+			blendFactor.x = Short4(0xFFFFu) - pixel.w;
+			blendFactor.y = Short4(0xFFFFu) - pixel.w;
+			blendFactor.z = Short4(0xFFFFu) - pixel.w;
 			break;
 		case Context::BLEND_SRCALPHASAT:
-			blendFactor.r = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.a;
-			blendFactor.r = Min(As<UShort4>(blendFactor.r), As<UShort4>(current.a));
-			blendFactor.g = blendFactor.r;
-			blendFactor.b = blendFactor.r;
+			blendFactor.x = Short4(0xFFFFu) - pixel.w;
+			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
+			blendFactor.y = blendFactor.x;
+			blendFactor.z = blendFactor.x;
 			break;
 		case Context::BLEND_CONSTANT:
-			blendFactor.r = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
-			blendFactor.g = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
-			blendFactor.b = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
+			blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[0]));
+			blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[1]));
+			blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[2]));
 			break;
 		case Context::BLEND_INVCONSTANT:
-			blendFactor.r = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
-			blendFactor.g = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
-			blendFactor.b = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
+			blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
+			blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
+			blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
 			break;
 		case Context::BLEND_CONSTANTALPHA:
-			blendFactor.r = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
-			blendFactor.g = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
-			blendFactor.b = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
+			blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
+			blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
+			blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
 			break;
 		case Context::BLEND_INVCONSTANTALPHA:
-			blendFactor.r = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-			blendFactor.g = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
-			blendFactor.b = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			blendFactor.x = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			blendFactor.y = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			blendFactor.z = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
 			break;
 		default:
 			ASSERT(false);
 		}
 	}
 	
-	void PixelRoutine::blendFactorAlpha(Registers &r, const Color4i &blendFactor, const Color4i &current, const Color4i &pixel, Context::BlendFactor blendFactorAlphaActive)
+	void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4i &blendFactor, const Vector4i &current, const Vector4i &pixel, Context::BlendFactor blendFactorAlphaActive)
 	{
 		switch(blendFactorAlphaActive)
 		{
@@ -2344,46 +2376,46 @@
 			// Optimized
 			break;
 		case Context::BLEND_SOURCE:
-			blendFactor.a = current.a;
+			blendFactor.w = current.w;
 			break;
 		case Context::BLEND_INVSOURCE:
-			blendFactor.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.a;
+			blendFactor.w = Short4(0xFFFFu) - current.w;
 			break;
 		case Context::BLEND_DEST:
-			blendFactor.a = pixel.a;
+			blendFactor.w = pixel.w;
 			break;
 		case Context::BLEND_INVDEST:
-			blendFactor.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.a;
+			blendFactor.w = Short4(0xFFFFu) - pixel.w;
 			break;
 		case Context::BLEND_SOURCEALPHA:
-			blendFactor.a = current.a;
+			blendFactor.w = current.w;
 			break;
 		case Context::BLEND_INVSOURCEALPHA:
-			blendFactor.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - current.a;
+			blendFactor.w = Short4(0xFFFFu) - current.w;
 			break;
 		case Context::BLEND_DESTALPHA:
-			blendFactor.a = pixel.a;
+			blendFactor.w = pixel.w;
 			break;
 		case Context::BLEND_INVDESTALPHA:
-			blendFactor.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF) - pixel.a;
+			blendFactor.w = Short4(0xFFFFu) - pixel.w;
 			break;
 		case Context::BLEND_SRCALPHASAT:
-			blendFactor.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+			blendFactor.w = Short4(0xFFFFu);
 			break;
 		case Context::BLEND_CONSTANT:
 		case Context::BLEND_CONSTANTALPHA:
-			blendFactor.a = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
+			blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.blendConstant4W[3]));
 			break;
 		case Context::BLEND_INVCONSTANT:
 		case Context::BLEND_INVCONSTANTALPHA:
-			blendFactor.a = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
+			blendFactor.w = *Pointer<Short4>(r.data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
 			break;
 		default:
 			ASSERT(false);
 		}
 	}
 
-	void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Color4i &current, Int &x)
+	void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4i &current, Int &x)
 	{
 		if(!state.alphaBlendActive)
 		{
@@ -2392,7 +2424,7 @@
 		 
 		Pointer<Byte> buffer;
 
-		Color4i pixel;
+		Vector4i pixel;
 		Short4 c01;
 		Short4 c23;
 
@@ -2404,74 +2436,74 @@
 			c01 = *Pointer<Short4>(buffer);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 			c23 = *Pointer<Short4>(buffer);
-			pixel.b = c01;
-			pixel.g = c01;
-			pixel.b = UnpackLow(As<Byte8>(pixel.b), As<Byte8>(c23));
-			pixel.g = UnpackHigh(As<Byte8>(pixel.g), As<Byte8>(c23));
-			pixel.r = pixel.b;
-			pixel.b = UnpackLow(As<Byte8>(pixel.b), As<Byte8>(pixel.g));
-			pixel.r = UnpackHigh(As<Byte8>(pixel.r), As<Byte8>(pixel.g));
-			pixel.g = pixel.b;
-			pixel.a = pixel.r;
-			pixel.r = UnpackLow(As<Byte8>(pixel.r), As<Byte8>(pixel.r));
-			pixel.g = UnpackHigh(As<Byte8>(pixel.g), As<Byte8>(pixel.g));
-			pixel.b = UnpackLow(As<Byte8>(pixel.b), As<Byte8>(pixel.b));
-			pixel.a = UnpackHigh(As<Byte8>(pixel.a), As<Byte8>(pixel.a));
+			pixel.z = c01;
+			pixel.y = c01;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+			pixel.x = pixel.z;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+			pixel.y = pixel.z;
+			pixel.w = pixel.x;
+			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
 			break;
 		case FORMAT_X8R8G8B8:
 			buffer = cBuffer + 4 * x;
 			c01 = *Pointer<Short4>(buffer);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 			c23 = *Pointer<Short4>(buffer);
-			pixel.b = c01;
-			pixel.g = c01;
-			pixel.b = UnpackLow(As<Byte8>(pixel.b), As<Byte8>(c23));
-			pixel.g = UnpackHigh(As<Byte8>(pixel.g), As<Byte8>(c23));
-			pixel.r = pixel.b;
-			pixel.b = UnpackLow(As<Byte8>(pixel.b), As<Byte8>(pixel.g));
-			pixel.r = UnpackHigh(As<Byte8>(pixel.r), As<Byte8>(pixel.g));
-			pixel.g = pixel.b;
-			pixel.r = UnpackLow(As<Byte8>(pixel.r), As<Byte8>(pixel.r));
-			pixel.g = UnpackHigh(As<Byte8>(pixel.g), As<Byte8>(pixel.g));
-			pixel.b = UnpackLow(As<Byte8>(pixel.b), As<Byte8>(pixel.b));
-			pixel.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+			pixel.z = c01;
+			pixel.y = c01;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
+			pixel.x = pixel.z;
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
+			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
+			pixel.y = pixel.z;
+			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
+			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
+			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
+			pixel.w = Short4(0xFFFFu);
 			break;
 		case FORMAT_A8G8R8B8Q:
 			UNIMPLEMENTED();
-		//	pixel.b = UnpackLow(As<Byte8>(pixel.b), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-		//	pixel.r = UnpackHigh(As<Byte8>(pixel.r), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-		//	pixel.g = UnpackLow(As<Byte8>(pixel.g), *Pointer<Byte8>(cBuffer + 8 * x + 8));
-		//	pixel.a = UnpackHigh(As<Byte8>(pixel.a), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
 			break;
 		case FORMAT_X8G8R8B8Q:
 			UNIMPLEMENTED();
-		//	pixel.b = UnpackLow(As<Byte8>(pixel.b), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-		//	pixel.r = UnpackHigh(As<Byte8>(pixel.r), *Pointer<Byte8>(cBuffer + 8 * x + 0));
-		//	pixel.g = UnpackLow(As<Byte8>(pixel.g), *Pointer<Byte8>(cBuffer + 8 * x + 8));
-		//	pixel.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
+		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
+		//	pixel.w = Short4(0xFFFFu);
 			break;
 		case FORMAT_A16B16G16R16:
 			buffer  = cBuffer;
-			pixel.r = *Pointer<Short4>(buffer + 8 * x);
-			pixel.g = *Pointer<Short4>(buffer + 8 * x + 8);
+			pixel.x = *Pointer<Short4>(buffer + 8 * x);
+			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.b = *Pointer<Short4>(buffer + 8 * x);
-			pixel.a = *Pointer<Short4>(buffer + 8 * x + 8);
-			transpose4x4(pixel.r, pixel.g, pixel.b, pixel.a);
+			pixel.z = *Pointer<Short4>(buffer + 8 * x);
+			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
+			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
 			break;
 		case FORMAT_G16R16:
 			buffer = cBuffer;
-			pixel.r = *Pointer<Short4>(buffer  + 4 * x);
+			pixel.x = *Pointer<Short4>(buffer  + 4 * x);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.g = *Pointer<Short4>(buffer  + 4 * x);
-			pixel.b = pixel.r;
-			pixel.r = As<Short4>(UnpackLow(pixel.r, pixel.g));
-			pixel.b = As<Short4>(UnpackHigh(pixel.b, pixel.g));
-			pixel.g = pixel.b;
-			pixel.r = As<Short4>(UnpackLow(pixel.r, pixel.b));
-			pixel.g = As<Short4>(UnpackHigh(pixel.g, pixel.b));
-			pixel.b = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
-			pixel.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+			pixel.y = *Pointer<Short4>(buffer  + 4 * x);
+			pixel.z = pixel.x;
+			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
+			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
+			pixel.y = pixel.z;
+			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
+			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
+			pixel.z = Short4(0xFFFFu);
+			pixel.w = Short4(0xFFFFu);
 			break;
 		default:
 			ASSERT(false);
@@ -2483,65 +2515,65 @@
 		}
 
 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
-		Color4i sourceFactor;
-		Color4i destFactor;
+		Vector4i sourceFactor;
+		Vector4i destFactor;
 
 		blendFactor(r, sourceFactor, current, pixel, (Context::BlendFactor)state.sourceBlendFactor);
 		blendFactor(r, destFactor, current, pixel, (Context::BlendFactor)state.destBlendFactor);
 
 		if(state.sourceBlendFactor != Context::BLEND_ONE && state.sourceBlendFactor != Context::BLEND_ZERO)
 		{
-			current.r = MulHigh(As<UShort4>(current.r), As<UShort4>(sourceFactor.r));
-			current.g = MulHigh(As<UShort4>(current.g), As<UShort4>(sourceFactor.g));
-			current.b = MulHigh(As<UShort4>(current.b), As<UShort4>(sourceFactor.b));
+			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
+			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
+			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
 		}
 	
 		if(state.destBlendFactor != Context::BLEND_ONE && state.destBlendFactor != Context::BLEND_ZERO)
 		{
-			pixel.r = MulHigh(As<UShort4>(pixel.r), As<UShort4>(destFactor.r));
-			pixel.g = MulHigh(As<UShort4>(pixel.g), As<UShort4>(destFactor.g));
-			pixel.b = MulHigh(As<UShort4>(pixel.b), As<UShort4>(destFactor.b));
+			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
+			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
+			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
 		}
 
 		switch(state.blendOperation)
 		{
 		case Context::BLENDOP_ADD:
-			current.r = AddSat(As<UShort4>(current.r), As<UShort4>(pixel.r));
-			current.g = AddSat(As<UShort4>(current.g), As<UShort4>(pixel.g));
-			current.b = AddSat(As<UShort4>(current.b), As<UShort4>(pixel.b));
+			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
 			break;
 		case Context::BLENDOP_SUB:
-			current.r = SubSat(As<UShort4>(current.r), As<UShort4>(pixel.r));
-			current.g = SubSat(As<UShort4>(current.g), As<UShort4>(pixel.g));
-			current.b = SubSat(As<UShort4>(current.b), As<UShort4>(pixel.b));
+			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
 			break;
 		case Context::BLENDOP_INVSUB:
-			current.r = SubSat(As<UShort4>(pixel.r), As<UShort4>(current.r));
-			current.g = SubSat(As<UShort4>(pixel.g), As<UShort4>(current.g));
-			current.b = SubSat(As<UShort4>(pixel.b), As<UShort4>(current.b));
+			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
+			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
+			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
 			break;
 		case Context::BLENDOP_MIN:
-			current.r = Min(As<UShort4>(current.r), As<UShort4>(pixel.r));
-			current.g = Min(As<UShort4>(current.g), As<UShort4>(pixel.g));
-			current.b = Min(As<UShort4>(current.b), As<UShort4>(pixel.b));
+			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
 			break;
 		case Context::BLENDOP_MAX:
-			current.r = Max(As<UShort4>(current.r), As<UShort4>(pixel.r));
-			current.g = Max(As<UShort4>(current.g), As<UShort4>(pixel.g));
-			current.b = Max(As<UShort4>(current.b), As<UShort4>(pixel.b));
+			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
+			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
+			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
 			break;
 		case Context::BLENDOP_SOURCE:
 			// No operation
 			break;
 		case Context::BLENDOP_DEST:
-			current.r = pixel.r;
-			current.g = pixel.g;
-			current.b = pixel.b;
+			current.x = pixel.x;
+			current.y = pixel.y;
+			current.z = pixel.z;
 			break;
 		case Context::BLENDOP_NULL:
-			current.r = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-			current.g = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-			current.b = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			current.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			current.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			current.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 			break;
 		default:
 			ASSERT(false);
@@ -2552,46 +2584,46 @@
 
 		if(state.sourceBlendFactorAlpha != Context::BLEND_ONE && state.sourceBlendFactorAlpha != Context::BLEND_ZERO)
 		{
-			current.a = MulHigh(As<UShort4>(current.a), As<UShort4>(sourceFactor.a));
+			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
 		}
 	
 		if(state.destBlendFactorAlpha != Context::BLEND_ONE && state.destBlendFactorAlpha != Context::BLEND_ZERO)
 		{
-			pixel.a = MulHigh(As<UShort4>(pixel.a), As<UShort4>(destFactor.a));
+			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
 		}
 
 		switch(state.blendOperationAlpha)
 		{
 		case Context::BLENDOP_ADD:
-			current.a = AddSat(As<UShort4>(current.a), As<UShort4>(pixel.a));
+			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
 			break;
 		case Context::BLENDOP_SUB:
-			current.a = SubSat(As<UShort4>(current.a), As<UShort4>(pixel.a));
+			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
 			break;
 		case Context::BLENDOP_INVSUB:
-			current.a = SubSat(As<UShort4>(pixel.a), As<UShort4>(current.a));
+			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
 			break;
 		case Context::BLENDOP_MIN:
-			current.a = Min(As<UShort4>(current.a), As<UShort4>(pixel.a));
+			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
 			break;
 		case Context::BLENDOP_MAX:
-			current.a = Max(As<UShort4>(current.a), As<UShort4>(pixel.a));
+			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
 			break;
 		case Context::BLENDOP_SOURCE:
 			// No operation
 			break;
 		case Context::BLENDOP_DEST:
-			current.a = pixel.a;
+			current.w = pixel.w;
 			break;
 		case Context::BLENDOP_NULL:
-			current.a = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			current.w = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 			break;
 		default:
 			ASSERT(false);
 		}
 	}
 
-	void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Color4i &current, Int &sMask, Int &zMask, Int &cMask)
+	void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4i &current, Int &sMask, Int &zMask, Int &cMask)
 	{
 		if(!state.colorWriteActive(index))
 		{
@@ -2612,10 +2644,10 @@
 			case FORMAT_X8R8G8B8:
 			case FORMAT_A8R8G8B8:
 				{
-					current.r = current.r - As<Short4>(As<UShort4>(current.r) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
-					current.g = current.g - As<Short4>(As<UShort4>(current.g) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
-					current.b = current.b - As<Short4>(As<UShort4>(current.b) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
-					current.a = current.a - As<Short4>(As<UShort4>(current.a) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
+					current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
+					current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
+					current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
+					current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080, 0x0080, 0x0080, 0x0080);
 				}
 				break;
 			}
@@ -2629,78 +2661,78 @@
 		{
 		case FORMAT_X8G8R8B8Q:
 			UNIMPLEMENTED();
-		//	current.r = As<Short4>(As<UShort4>(current.r) >> 8);
-		//	current.g = As<Short4>(As<UShort4>(current.g) >> 8);
-		//	current.b = As<Short4>(As<UShort4>(current.b) >> 8);
+		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
 
-		//	current.b = As<Short4>(Pack(As<UShort4>(current.b), As<UShort4>(current.r)));
-		//	current.g = As<Short4>(Pack(As<UShort4>(current.g), As<UShort4>(current.g)));
+		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
 			break;
 		case FORMAT_A8G8R8B8Q:
 			UNIMPLEMENTED();
-		//	current.r = As<Short4>(As<UShort4>(current.r) >> 8);
-		//	current.g = As<Short4>(As<UShort4>(current.g) >> 8);
-		//	current.b = As<Short4>(As<UShort4>(current.b) >> 8);
-		//	current.a = As<Short4>(As<UShort4>(current.a) >> 8);
+		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
 
-		//	current.b = As<Short4>(Pack(As<UShort4>(current.b), As<UShort4>(current.r)));
-		//	current.g = As<Short4>(Pack(As<UShort4>(current.g), As<UShort4>(current.a)));
+		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
 			break;
 		case FORMAT_X8R8G8B8:
 		case FORMAT_A8R8G8B8:
 			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
 			{
-				current.r = As<Short4>(As<UShort4>(current.r) >> 8);
-				current.g = As<Short4>(As<UShort4>(current.g) >> 8);
-				current.b = As<Short4>(As<UShort4>(current.b) >> 8);
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
 
-				current.b = As<Short4>(Pack(As<UShort4>(current.b), As<UShort4>(current.r)));
-				current.g = As<Short4>(Pack(As<UShort4>(current.g), As<UShort4>(current.g)));
+				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
 
-				current.r = current.b;
-				current.b = UnpackLow(As<Byte8>(current.b), As<Byte8>(current.g));
-				current.r = UnpackHigh(As<Byte8>(current.r), As<Byte8>(current.g));
-				current.g = current.b;
-				current.b = As<Short4>(UnpackLow(current.b, current.r));
-				current.g = As<Short4>(UnpackHigh(current.g, current.r));
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
 			}
 			else
 			{
-				current.r = As<Short4>(As<UShort4>(current.r) >> 8);
-				current.g = As<Short4>(As<UShort4>(current.g) >> 8);
-				current.b = As<Short4>(As<UShort4>(current.b) >> 8);
-				current.a = As<Short4>(As<UShort4>(current.a) >> 8);
+				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
 
-				current.b = As<Short4>(Pack(As<UShort4>(current.b), As<UShort4>(current.r)));
-				current.g = As<Short4>(Pack(As<UShort4>(current.g), As<UShort4>(current.a)));
+				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
+				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
 
-				current.r = current.b;
-				current.b = UnpackLow(As<Byte8>(current.b), As<Byte8>(current.g));
-				current.r = UnpackHigh(As<Byte8>(current.r), As<Byte8>(current.g));
-				current.g = current.b;
-				current.b = As<Short4>(UnpackLow(current.b, current.r));
-				current.g = As<Short4>(UnpackHigh(current.g, current.r));
+				current.x = current.z;
+				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+				current.y = current.z;
+				current.z = As<Short4>(UnpackLow(current.z, current.x));
+				current.y = As<Short4>(UnpackHigh(current.y, current.x));
 			}
 			break;
 		case FORMAT_G16R16:
-			current.b = current.r;
-			current.r = As<Short4>(UnpackLow(current.r, current.g));
-			current.b = As<Short4>(UnpackHigh(current.b, current.g));
-			current.g = current.b;
+			current.z = current.x;
+			current.x = As<Short4>(UnpackLow(current.x, current.y));
+			current.z = As<Short4>(UnpackHigh(current.z, current.y));
+			current.y = current.z;
 			break;
 		case FORMAT_A16B16G16R16:
-			transpose4x4(current.r, current.g, current.b, current.a);
+			transpose4x4(current.x, current.y, current.z, current.w);
 			break;
 		case FORMAT_R32F:
 		case FORMAT_G32R32F:
 		case FORMAT_A32B32G32R32F:
 			{
-				Color4f oC;
+				Vector4f oC;
 
-				oC.r = convertUnsigned16(UShort4(current.r));
-				oC.g = convertUnsigned16(UShort4(current.g));
-				oC.b = convertUnsigned16(UShort4(current.b));
-				oC.a = convertUnsigned16(UShort4(current.a));
+				oC.x = convertUnsigned16(UShort4(current.x));
+				oC.y = convertUnsigned16(UShort4(current.y));
+				oC.z = convertUnsigned16(UShort4(current.z));
+				oC.w = convertUnsigned16(UShort4(current.w));
 
 				writeColor(r, index, cBuffer, x, oC, sMask, zMask, cMask);
 			}
@@ -2709,8 +2741,8 @@
 			ASSERT(false);
 		}
 
-		Short4 c01 = current.b;
-		Short4 c23 = current.g;
+		Short4 c01 = current.z;
+		Short4 c23 = current.y;
 
 		Int xMask;   // Combination of all masks
 
@@ -2816,15 +2848,15 @@
 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
 			{
 				Short4 masked = value;
-				current.r &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+				current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
 				masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
-				current.r |= masked;
+				current.x |= masked;
 			}
 
-			current.r &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
+			current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD01Q) + xMask * 8);
 			value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
-			current.r |= value;
-			*Pointer<Short4>(buffer) = current.r;
+			current.x |= value;
+			*Pointer<Short4>(buffer) = current.x;
 
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 
@@ -2833,15 +2865,15 @@
 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
 			{
 				Short4 masked = value;
-				current.g &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
+				current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
 				masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
-				current.g |= masked;
+				current.y |= masked;
 			}
 
-			current.g &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
+			current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskD23Q) + xMask * 8);
 			value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
-			current.g |= value;
-			*Pointer<Short4>(buffer) = current.g;
+			current.y |= value;
+			*Pointer<Short4>(buffer) = current.y;
 			break;
 		case FORMAT_A16B16G16R16:
 			buffer = cBuffer + 8 * x;
@@ -2852,15 +2884,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{
 					Short4 masked = value;
-					current.r &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
 					masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-					current.r |= masked;
+					current.x |= masked;
 				}
 
-				current.r &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
+				current.x &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
 				value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
-				current.r |= value;
-				*Pointer<Short4>(buffer) = current.r;
+				current.x |= value;
+				*Pointer<Short4>(buffer) = current.x;
 			}
 
 			{
@@ -2869,15 +2901,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{
 					Short4 masked = value;
-					current.g &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
 					masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-					current.g |= masked;
+					current.y |= masked;
 				}
 
-				current.g &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
+				current.y &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
 				value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
-				current.g |= value;
-				*Pointer<Short4>(buffer + 8) = current.g;
+				current.y |= value;
+				*Pointer<Short4>(buffer + 8) = current.y;
 			}
 
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
@@ -2888,15 +2920,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{
 					Short4 masked = value;
-					current.b &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
 					masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-					current.b |= masked;
+					current.z |= masked;
 				}
 
-				current.b &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
+				current.z &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
 				value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
-				current.b |= value;
-				*Pointer<Short4>(buffer) = current.b;
+				current.z |= value;
+				*Pointer<Short4>(buffer) = current.z;
 			}
 
 			{
@@ -2905,15 +2937,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{
 					Short4 masked = value;
-					current.a &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
+					current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
 					masked &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
-					current.a |= masked;
+					current.w |= masked;
 				}
 
-				current.a &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
+				current.w &= *Pointer<Short4>(r.constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
 				value &= *Pointer<Short4>(r.constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
-				current.a |= value;
-				*Pointer<Short4>(buffer + 8) = current.a;
+				current.w |= value;
+				*Pointer<Short4>(buffer + 8) = current.w;
 			}
 			break;
 		default:
@@ -2921,7 +2953,7 @@
 		}
 	}
 
-	void PixelRoutine::blendFactor(Registers &r, const Color4f &blendFactor, const Color4f &oC, const Color4f &pixel, Context::BlendFactor blendFactorActive) 
+	void PixelRoutine::blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, Context::BlendFactor blendFactorActive) 
 	{
 		switch(blendFactorActive)
 		{
@@ -2932,67 +2964,67 @@
 			// Optimized
 			break;
 		case Context::BLEND_SOURCE:
-			blendFactor.r = oC.r;
-			blendFactor.g = oC.g;
-			blendFactor.b = oC.b;
+			blendFactor.x = oC.x;
+			blendFactor.y = oC.y;
+			blendFactor.z = oC.z;
 			break;
 		case Context::BLEND_INVSOURCE:
-			blendFactor.r = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.r;
-			blendFactor.g = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.g;
-			blendFactor.b = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.b;
+			blendFactor.x = Float4(1.0f) - oC.x;
+			blendFactor.y = Float4(1.0f) - oC.y;
+			blendFactor.z = Float4(1.0f) - oC.z;
 			break;
 		case Context::BLEND_DEST:
-			blendFactor.r = pixel.r;
-			blendFactor.g = pixel.g;
-			blendFactor.b = pixel.b;
+			blendFactor.x = pixel.x;
+			blendFactor.y = pixel.y;
+			blendFactor.z = pixel.z;
 			break;
 		case Context::BLEND_INVDEST:
-			blendFactor.r = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.r;
-			blendFactor.g = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.g;
-			blendFactor.b = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.b;
+			blendFactor.x = Float4(1.0f) - pixel.x;
+			blendFactor.y = Float4(1.0f) - pixel.y;
+			blendFactor.z = Float4(1.0f) - pixel.z;
 			break;
 		case Context::BLEND_SOURCEALPHA:
-			blendFactor.r = oC.a;
-			blendFactor.g = oC.a;
-			blendFactor.b = oC.a;
+			blendFactor.x = oC.w;
+			blendFactor.y = oC.w;
+			blendFactor.z = oC.w;
 			break;
 		case Context::BLEND_INVSOURCEALPHA:
-			blendFactor.r = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.a;
-			blendFactor.g = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.a;
-			blendFactor.b = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.a;
+			blendFactor.x = Float4(1.0f) - oC.w;
+			blendFactor.y = Float4(1.0f) - oC.w;
+			blendFactor.z = Float4(1.0f) - oC.w;
 			break;
 		case Context::BLEND_DESTALPHA:
-			blendFactor.r = pixel.a;
-			blendFactor.g = pixel.a;
-			blendFactor.b = pixel.a;
+			blendFactor.x = pixel.w;
+			blendFactor.y = pixel.w;
+			blendFactor.z = pixel.w;
 			break;
 		case Context::BLEND_INVDESTALPHA:
-			blendFactor.r = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.a;
-			blendFactor.g = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.a;
-			blendFactor.b = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.a;
+			blendFactor.x = Float4(1.0f) - pixel.w;
+			blendFactor.y = Float4(1.0f) - pixel.w;
+			blendFactor.z = Float4(1.0f) - pixel.w;
 			break;
 		case Context::BLEND_SRCALPHASAT:
-			blendFactor.r = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.a;
-			blendFactor.r = Min(blendFactor.r, oC.a);
-			blendFactor.g = blendFactor.r;
-			blendFactor.b = blendFactor.r;
+			blendFactor.x = Float4(1.0f) - pixel.w;
+			blendFactor.x = Min(blendFactor.x, oC.w);
+			blendFactor.y = blendFactor.x;
+			blendFactor.z = blendFactor.x;
 			break;
 		case Context::BLEND_CONSTANT:
-			blendFactor.r = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
-			blendFactor.g = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
-			blendFactor.b = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
+			blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[0]));
+			blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[1]));
+			blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[2]));
 			break;
 		case Context::BLEND_INVCONSTANT:
-			blendFactor.r = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
-			blendFactor.g = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
-			blendFactor.b = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
+			blendFactor.x = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
+			blendFactor.y = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
+			blendFactor.z = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
 			break;
 		default:
 			ASSERT(false);
 		}
 	}
 
-	void PixelRoutine::blendFactorAlpha(Registers &r, const Color4f &blendFactor, const Color4f &oC, const Color4f &pixel, Context::BlendFactor blendFactorAlphaActive) 
+	void PixelRoutine::blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, Context::BlendFactor blendFactorAlphaActive) 
 	{
 		switch(blendFactorAlphaActive)
 		{
@@ -3003,44 +3035,44 @@
 			// Optimized
 			break;
 		case Context::BLEND_SOURCE:
-			blendFactor.a = oC.a;
+			blendFactor.w = oC.w;
 			break;
 		case Context::BLEND_INVSOURCE:
-			blendFactor.a = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.a;
+			blendFactor.w = Float4(1.0f) - oC.w;
 			break;
 		case Context::BLEND_DEST:
-			blendFactor.a = pixel.a;
+			blendFactor.w = pixel.w;
 			break;
 		case Context::BLEND_INVDEST:
-			blendFactor.a = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.a;
+			blendFactor.w = Float4(1.0f) - pixel.w;
 			break;
 		case Context::BLEND_SOURCEALPHA:
-			blendFactor.a = oC.a;
+			blendFactor.w = oC.w;
 			break;
 		case Context::BLEND_INVSOURCEALPHA:
-			blendFactor.a = Float4(1.0f, 1.0f, 1.0f, 1.0f) - oC.a;
+			blendFactor.w = Float4(1.0f) - oC.w;
 			break;
 		case Context::BLEND_DESTALPHA:
-			blendFactor.a = pixel.a;
+			blendFactor.w = pixel.w;
 			break;
 		case Context::BLEND_INVDESTALPHA:
-			blendFactor.a = Float4(1.0f, 1.0f, 1.0f, 1.0f) - pixel.a;
+			blendFactor.w = Float4(1.0f) - pixel.w;
 			break;
 		case Context::BLEND_SRCALPHASAT:
-			blendFactor.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+			blendFactor.w = Float4(1.0f);
 			break;
 		case Context::BLEND_CONSTANT:
-			blendFactor.a = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
+			blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.blendConstant4F[3]));
 			break;
 		case Context::BLEND_INVCONSTANT:
-			blendFactor.a = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
+			blendFactor.w = *Pointer<Float4>(r.data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
 			break;
 		default:
 			ASSERT(false);
 		}
 	}
 
-	void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Color4f &oC, Int &x)
+	void PixelRoutine::alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
 	{
 		if(!state.alphaBlendActive)
 		{
@@ -3048,9 +3080,9 @@
 		}
 
 		Pointer<Byte> buffer;
-		Color4f pixel;
+		Vector4f pixel;
 
-		Color4i color;
+		Vector4i color;
 		Short4 c01;
 		Short4 c23;
 
@@ -3062,126 +3094,126 @@
 			c01 = *Pointer<Short4>(buffer);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 			c23 = *Pointer<Short4>(buffer);
-			color.b = c01;
-			color.g = c01;
-			color.b = UnpackLow(As<Byte8>(color.b), As<Byte8>(c23));
-			color.g = UnpackHigh(As<Byte8>(color.g), As<Byte8>(c23));
-			color.r = color.b;
-			color.b = UnpackLow(As<Byte8>(color.b), As<Byte8>(color.g));
-			color.r = UnpackHigh(As<Byte8>(color.r), As<Byte8>(color.g));
-			color.g = color.b;
-			color.a = color.r;
-			color.r = UnpackLow(As<Byte8>(color.r), As<Byte8>(color.r));
-			color.g = UnpackHigh(As<Byte8>(color.g), As<Byte8>(color.g));
-			color.b = UnpackLow(As<Byte8>(color.b), As<Byte8>(color.b));
-			color.a = UnpackHigh(As<Byte8>(color.a), As<Byte8>(color.a));
+			color.z = c01;
+			color.y = c01;
+			color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(c23));
+			color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(c23));
+			color.x = color.z;
+			color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.y));
+			color.x = UnpackHigh(As<Byte8>(color.x), As<Byte8>(color.y));
+			color.y = color.z;
+			color.w = color.x;
+			color.x = UnpackLow(As<Byte8>(color.x), As<Byte8>(color.x));
+			color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(color.y));
+			color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.z));
+			color.w = UnpackHigh(As<Byte8>(color.w), As<Byte8>(color.w));
 
-			pixel.r = convertUnsigned16(As<UShort4>(color.r));
-			pixel.g = convertUnsigned16(As<UShort4>(color.g));
-			pixel.b = convertUnsigned16(As<UShort4>(color.b));
-			pixel.a = convertUnsigned16(As<UShort4>(color.a));
+			pixel.x = convertUnsigned16(As<UShort4>(color.x));
+			pixel.y = convertUnsigned16(As<UShort4>(color.y));
+			pixel.z = convertUnsigned16(As<UShort4>(color.z));
+			pixel.w = convertUnsigned16(As<UShort4>(color.w));
 			break;
 		case FORMAT_X8R8G8B8:
 			buffer = cBuffer + 4 * x;
 			c01 = *Pointer<Short4>(buffer);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 			c23 = *Pointer<Short4>(buffer);
-			color.b = c01;
-			color.g = c01;
-			color.b = UnpackLow(As<Byte8>(color.b), As<Byte8>(c23));
-			color.g = UnpackHigh(As<Byte8>(color.g), As<Byte8>(c23));
-			color.r = color.b;
-			color.b = UnpackLow(As<Byte8>(color.b), As<Byte8>(color.g));
-			color.r = UnpackHigh(As<Byte8>(color.r), As<Byte8>(color.g));
-			color.g = color.b;
-			color.r = UnpackLow(As<Byte8>(color.r), As<Byte8>(color.r));
-			color.g = UnpackHigh(As<Byte8>(color.g), As<Byte8>(color.g));
-			color.b = UnpackLow(As<Byte8>(color.b), As<Byte8>(color.b));
+			color.z = c01;
+			color.y = c01;
+			color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(c23));
+			color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(c23));
+			color.x = color.z;
+			color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.y));
+			color.x = UnpackHigh(As<Byte8>(color.x), As<Byte8>(color.y));
+			color.y = color.z;
+			color.x = UnpackLow(As<Byte8>(color.x), As<Byte8>(color.x));
+			color.y = UnpackHigh(As<Byte8>(color.y), As<Byte8>(color.y));
+			color.z = UnpackLow(As<Byte8>(color.z), As<Byte8>(color.z));
 
-			pixel.r = convertUnsigned16(As<UShort4>(color.r));
-			pixel.g = convertUnsigned16(As<UShort4>(color.g));
-			pixel.b = convertUnsigned16(As<UShort4>(color.b));
-			pixel.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+			pixel.x = convertUnsigned16(As<UShort4>(color.x));
+			pixel.y = convertUnsigned16(As<UShort4>(color.y));
+			pixel.z = convertUnsigned16(As<UShort4>(color.z));
+			pixel.w = Float4(1.0f);
 			break;
 		case FORMAT_A8G8R8B8Q:
 UNIMPLEMENTED();
-		//	UnpackLow(pixel.b, qword_ptr [cBuffer+8*x+0]);
-		//	UnpackHigh(pixel.r, qword_ptr [cBuffer+8*x+0]);
-		//	UnpackLow(pixel.g, qword_ptr [cBuffer+8*x+8]);
-		//	UnpackHigh(pixel.a, qword_ptr [cBuffer+8*x+8]);
+		//	UnpackLow(pixel.z, qword_ptr [cBuffer+8*x+0]);
+		//	UnpackHigh(pixel.x, qword_ptr [cBuffer+8*x+0]);
+		//	UnpackLow(pixel.y, qword_ptr [cBuffer+8*x+8]);
+		//	UnpackHigh(pixel.w, qword_ptr [cBuffer+8*x+8]);
 			break;
 		case FORMAT_X8G8R8B8Q:
 UNIMPLEMENTED();
-		//	UnpackLow(pixel.b, qword_ptr [cBuffer+8*x+0]);
-		//	UnpackHigh(pixel.r, qword_ptr [cBuffer+8*x+0]);
-		//	UnpackLow(pixel.g, qword_ptr [cBuffer+8*x+8]);
-		//	pixel.a = Short4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+		//	UnpackLow(pixel.z, qword_ptr [cBuffer+8*x+0]);
+		//	UnpackHigh(pixel.x, qword_ptr [cBuffer+8*x+0]);
+		//	UnpackLow(pixel.y, qword_ptr [cBuffer+8*x+8]);
+		//	pixel.w = Short4(0xFFFFu);
 			break;
 		case FORMAT_A16B16G16R16:
 			buffer  = cBuffer;
-			color.r = *Pointer<Short4>(buffer + 8 * x);
-			color.g = *Pointer<Short4>(buffer + 8 * x + 8);
+			color.x = *Pointer<Short4>(buffer + 8 * x);
+			color.y = *Pointer<Short4>(buffer + 8 * x + 8);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-			color.b = *Pointer<Short4>(buffer + 8 * x);
-			color.a = *Pointer<Short4>(buffer + 8 * x + 8);
+			color.z = *Pointer<Short4>(buffer + 8 * x);
+			color.w = *Pointer<Short4>(buffer + 8 * x + 8);
 			
-			transpose4x4(color.r, color.g, color.b, color.a);
+			transpose4x4(color.x, color.y, color.z, color.w);
 
-			pixel.r = convertUnsigned16(As<UShort4>(color.r));
-			pixel.g = convertUnsigned16(As<UShort4>(color.g));
-			pixel.b = convertUnsigned16(As<UShort4>(color.b));
-			pixel.a = convertUnsigned16(As<UShort4>(color.a));
+			pixel.x = convertUnsigned16(As<UShort4>(color.x));
+			pixel.y = convertUnsigned16(As<UShort4>(color.y));
+			pixel.z = convertUnsigned16(As<UShort4>(color.z));
+			pixel.w = convertUnsigned16(As<UShort4>(color.w));
 			break;
 		case FORMAT_G16R16:
 			buffer = cBuffer;
-			color.r = *Pointer<Short4>(buffer  + 4 * x);
+			color.x = *Pointer<Short4>(buffer  + 4 * x);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-			color.g = *Pointer<Short4>(buffer  + 4 * x);
-			color.b = color.r;
-			color.r = As<Short4>(UnpackLow(color.r, color.g));
-			color.b = As<Short4>(UnpackHigh(color.b, color.g));
-			color.g = color.b;
-			color.r = As<Short4>(UnpackLow(color.r, color.b));
-			color.g = As<Short4>(UnpackHigh(color.g, color.b));
+			color.y = *Pointer<Short4>(buffer  + 4 * x);
+			color.z = color.x;
+			color.x = As<Short4>(UnpackLow(color.x, color.y));
+			color.z = As<Short4>(UnpackHigh(color.z, color.y));
+			color.y = color.z;
+			color.x = As<Short4>(UnpackLow(color.x, color.z));
+			color.y = As<Short4>(UnpackHigh(color.y, color.z));
 			
-			pixel.r = convertUnsigned16(As<UShort4>(color.r));
-			pixel.g = convertUnsigned16(As<UShort4>(color.g));
-			pixel.b = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-			pixel.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+			pixel.x = convertUnsigned16(As<UShort4>(color.x));
+			pixel.y = convertUnsigned16(As<UShort4>(color.y));
+			pixel.z = Float4(1.0f);
+			pixel.w = Float4(1.0f);
 			break;
 		case FORMAT_R32F:
 			buffer = cBuffer;
 			// FIXME: movlps
-			pixel.r.x = *Pointer<Float>(buffer + 4 * x + 0);
-			pixel.r.y = *Pointer<Float>(buffer + 4 * x + 4);
+			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
+			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 			// FIXME: movhps
-			pixel.r.z = *Pointer<Float>(buffer + 4 * x + 0);
-			pixel.r.w = *Pointer<Float>(buffer + 4 * x + 4);
-			pixel.g = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-			pixel.b = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-			pixel.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
+			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
+			pixel.y = Float4(1.0f);
+			pixel.z = Float4(1.0f);
+			pixel.w = Float4(1.0f);
 			break;
 		case FORMAT_G32R32F:
 			buffer = cBuffer;
-			pixel.r = *Pointer<Float4>(buffer + 8 * x, 16);
+			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.g = *Pointer<Float4>(buffer + 8 * x, 16);
-			pixel.b = pixel.r;
-			pixel.r = ShuffleLowHigh(pixel.r, pixel.g, 0x88);
-			pixel.b = ShuffleLowHigh(pixel.b, pixel.g, 0xDD);
-			pixel.g = pixel.b;
-			pixel.b = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-			pixel.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
+			pixel.z = pixel.x;
+			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
+			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
+			pixel.y = pixel.z;
+			pixel.z = Float4(1.0f);
+			pixel.w = Float4(1.0f);
 			break;
 		case FORMAT_A32B32G32R32F:
 			buffer = cBuffer;
-			pixel.r = *Pointer<Float4>(buffer + 16 * x, 16);
-			pixel.g = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
+			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
-			pixel.b = *Pointer<Float4>(buffer + 16 * x, 16);
-			pixel.a = *Pointer<Float4>(buffer + 16 * x + 16, 16);
-			transpose4x4(pixel.r, pixel.g, pixel.b, pixel.a);
+			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
+			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
+			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
 			break;
 		default:
 			ASSERT(false);
@@ -3189,71 +3221,71 @@
 
 		if(postBlendSRGB && state.writeSRGB)
 		{
-			sRGBtoLinear(pixel.r);
-			sRGBtoLinear(pixel.g);
-			sRGBtoLinear(pixel.b);
+			sRGBtoLinear(pixel.x);
+			sRGBtoLinear(pixel.y);
+			sRGBtoLinear(pixel.z);
 		}
 
 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
-		Color4f sourceFactor;
-		Color4f destFactor;
+		Vector4f sourceFactor;
+		Vector4f destFactor;
 
 		blendFactor(r, sourceFactor, oC, pixel, (Context::BlendFactor)state.sourceBlendFactor);
 		blendFactor(r, destFactor, oC, pixel, (Context::BlendFactor)state.destBlendFactor);
 
 		if(state.sourceBlendFactor != Context::BLEND_ONE && state.sourceBlendFactor != Context::BLEND_ZERO)
 		{
-			oC.r *= sourceFactor.r;
-			oC.g *= sourceFactor.g;
-			oC.b *= sourceFactor.b;
+			oC.x *= sourceFactor.x;
+			oC.y *= sourceFactor.y;
+			oC.z *= sourceFactor.z;
 		}
 	
 		if(state.destBlendFactor != Context::BLEND_ONE && state.destBlendFactor != Context::BLEND_ZERO)
 		{
-			pixel.r *= destFactor.r;
-			pixel.g *= destFactor.g;
-			pixel.b *= destFactor.b;
+			pixel.x *= destFactor.x;
+			pixel.y *= destFactor.y;
+			pixel.z *= destFactor.z;
 		}
 
 		switch(state.blendOperation)
 		{
 		case Context::BLENDOP_ADD:
-			oC.r += pixel.r;
-			oC.g += pixel.g;
-			oC.b += pixel.b;
+			oC.x += pixel.x;
+			oC.y += pixel.y;
+			oC.z += pixel.z;
 			break;
 		case Context::BLENDOP_SUB:
-			oC.r -= pixel.r;
-			oC.g -= pixel.g;
-			oC.b -= pixel.b;
+			oC.x -= pixel.x;
+			oC.y -= pixel.y;
+			oC.z -= pixel.z;
 			break;
 		case Context::BLENDOP_INVSUB:
-			oC.r = pixel.r - oC.r;
-			oC.g = pixel.g - oC.g;
-			oC.b = pixel.b - oC.b;
+			oC.x = pixel.x - oC.x;
+			oC.y = pixel.y - oC.y;
+			oC.z = pixel.z - oC.z;
 			break;
 		case Context::BLENDOP_MIN:
-			oC.r = Min(oC.r, pixel.r);
-			oC.g = Min(oC.g, pixel.g);
-			oC.b = Min(oC.b, pixel.b);
+			oC.x = Min(oC.x, pixel.x);
+			oC.y = Min(oC.y, pixel.y);
+			oC.z = Min(oC.z, pixel.z);
 			break;
 		case Context::BLENDOP_MAX:
-			oC.r = Max(oC.r, pixel.r);
-			oC.g = Max(oC.g, pixel.g);
-			oC.b = Max(oC.b, pixel.b);
+			oC.x = Max(oC.x, pixel.x);
+			oC.y = Max(oC.y, pixel.y);
+			oC.z = Max(oC.z, pixel.z);
 			break;
 		case Context::BLENDOP_SOURCE:
 			// No operation
 			break;
 		case Context::BLENDOP_DEST:
-			oC.r = pixel.r;
-			oC.g = pixel.g;
-			oC.b = pixel.b;
+			oC.x = pixel.x;
+			oC.y = pixel.y;
+			oC.z = pixel.z;
 			break;
 		case Context::BLENDOP_NULL:
-			oC.r = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			oC.g = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			oC.b = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+			oC.x = Float4(0.0f);
+			oC.y = Float4(0.0f);
+			oC.z = Float4(0.0f);
 			break;
 		default:
 			ASSERT(false);
@@ -3264,54 +3296,54 @@
 
 		if(state.sourceBlendFactorAlpha != Context::BLEND_ONE && state.sourceBlendFactorAlpha != Context::BLEND_ZERO)
 		{
-			oC.a *= sourceFactor.a;
+			oC.w *= sourceFactor.w;
 		}
 	
 		if(state.destBlendFactorAlpha != Context::BLEND_ONE && state.destBlendFactorAlpha != Context::BLEND_ZERO)
 		{
-			pixel.a *= destFactor.a;
+			pixel.w *= destFactor.w;
 		}
 
 		switch(state.blendOperationAlpha)
 		{
 		case Context::BLENDOP_ADD:
-			oC.a += pixel.a;
+			oC.w += pixel.w;
 			break;
 		case Context::BLENDOP_SUB:
-			oC.a -= pixel.a;
+			oC.w -= pixel.w;
 			break;
 		case Context::BLENDOP_INVSUB:
-			pixel.a -= oC.a;
-			oC.a = pixel.a;
+			pixel.w -= oC.w;
+			oC.w = pixel.w;
 			break;
 		case Context::BLENDOP_MIN:	
-			oC.a = Min(oC.a, pixel.a);
+			oC.w = Min(oC.w, pixel.w);
 			break;
 		case Context::BLENDOP_MAX:	
-			oC.a = Max(oC.a, pixel.a);
+			oC.w = Max(oC.w, pixel.w);
 			break;
 		case Context::BLENDOP_SOURCE:
 			// No operation
 			break;
 		case Context::BLENDOP_DEST:
-			oC.a = pixel.a;
+			oC.w = pixel.w;
 			break;
 		case Context::BLENDOP_NULL:
-			oC.a = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+			oC.w = Float4(0.0f);
 			break;
 		default:
 			ASSERT(false);
 		}
 	}
 
-	void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Color4f &oC, Int &sMask, Int &zMask, Int &cMask)
+	void PixelRoutine::writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
 	{
 		if(!state.colorWriteActive(index))
 		{
 			return;
 		}
 
-		Color4i color;
+		Vector4i color;
 
 		switch(state.targetFormat[index])
 		{
@@ -3325,13 +3357,13 @@
 		case FORMAT_R32F:
 			break;
 		case FORMAT_G32R32F:
-			oC.b = oC.r;
-			oC.r = UnpackLow(oC.r, oC.g);
-			oC.b = UnpackHigh(oC.b, oC.g);
-			oC.g = oC.b;
+			oC.z = oC.x;
+			oC.x = UnpackLow(oC.x, oC.y);
+			oC.z = UnpackHigh(oC.z, oC.y);
+			oC.y = oC.z;
 			break;
 		case FORMAT_A32B32G32R32F:
-			transpose4x4(oC.r, oC.g, oC.b, oC.a);
+			transpose4x4(oC.x, oC.y, oC.z, oC.w);
 			break;
 		default:
 			ASSERT(false);
@@ -3375,19 +3407,19 @@
 				value.z = *Pointer<Float>(buffer + 0);
 				value.w = *Pointer<Float>(buffer + 4);
 
-				oC.r = As<Float4>(As<Int4>(oC.r) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
-				oC.r = As<Float4>(As<Int4>(oC.r) | As<Int4>(value));
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
 
 				// FIXME: movhps
-				*Pointer<Float>(buffer + 0) = oC.r.z;
-				*Pointer<Float>(buffer + 4) = oC.r.w;
+				*Pointer<Float>(buffer + 0) = oC.x.z;
+				*Pointer<Float>(buffer + 4) = oC.x.w;
 
 				buffer -= *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 
 				// FIXME: movlps
-				*Pointer<Float>(buffer + 0) = oC.r.x;
-				*Pointer<Float>(buffer + 4) = oC.r.y;
+				*Pointer<Float>(buffer + 0) = oC.x.x;
+				*Pointer<Float>(buffer + 4) = oC.x.y;
 			}
 			break;
 		case FORMAT_G32R32F:
@@ -3398,15 +3430,15 @@
 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
 			{
 				Float4 masked = value;
-				oC.r = As<Float4>(As<Int4>(oC.r) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
-				oC.r = As<Float4>(As<Int4>(oC.r) | As<Int4>(masked));
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
 			}
 
-			oC.r = As<Float4>(As<Int4>(oC.r) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
+			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
-			oC.r = As<Float4>(As<Int4>(oC.r) | As<Int4>(value));
-			*Pointer<Float4>(buffer) = oC.r;
+			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+			*Pointer<Float4>(buffer) = oC.x;
 
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
 
@@ -3417,15 +3449,15 @@
 				Float4 masked;
 
 				masked = value;
-				oC.g = As<Float4>(As<Int4>(oC.g) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
+				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
-				oC.g = As<Float4>(As<Int4>(oC.g) | As<Int4>(masked));
+				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
 			}
 
-			oC.g = As<Float4>(As<Int4>(oC.g) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
+			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
-			oC.g = As<Float4>(As<Int4>(oC.g) | As<Int4>(value));
-			*Pointer<Float4>(buffer) = oC.g;
+			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
+			*Pointer<Float4>(buffer) = oC.y;
 			break;
 		case FORMAT_A32B32G32R32F:
 			buffer = cBuffer + 16 * x;
@@ -3436,15 +3468,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{
 					Float4 masked = value;
-					oC.r = As<Float4>(As<Int4>(oC.r) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.r = As<Float4>(As<Int4>(oC.r) | As<Int4>(masked));
+					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
 				}
 				
-				oC.r = As<Float4>(As<Int4>(oC.r) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
+				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
-				oC.r = As<Float4>(As<Int4>(oC.r) | As<Int4>(value));
-				*Pointer<Float4>(buffer, 16) = oC.r;
+				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
+				*Pointer<Float4>(buffer, 16) = oC.x;
 			}
 
 			{
@@ -3453,15 +3485,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{	
 					Float4 masked = value;
-					oC.g = As<Float4>(As<Int4>(oC.g) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.g = As<Float4>(As<Int4>(oC.g) | As<Int4>(masked));
+					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
 				}
 
-				oC.g = As<Float4>(As<Int4>(oC.g) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
+				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
-				oC.g = As<Float4>(As<Int4>(oC.g) | As<Int4>(value));
-				*Pointer<Float4>(buffer + 16, 16) = oC.g;
+				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
+				*Pointer<Float4>(buffer + 16, 16) = oC.y;
 			}
 
 			buffer += *Pointer<Int>(r.data + OFFSET(DrawData,colorPitchB[index]));
@@ -3472,15 +3504,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{
 					Float4 masked = value;
-					oC.b = As<Float4>(As<Int4>(oC.b) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.b = As<Float4>(As<Int4>(oC.b) | As<Int4>(masked));
+					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
 				}
 
-				oC.b = As<Float4>(As<Int4>(oC.b) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
+				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
-				oC.b = As<Float4>(As<Int4>(oC.b) | As<Int4>(value));
-				*Pointer<Float4>(buffer, 16) = oC.b;
+				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
+				*Pointer<Float4>(buffer, 16) = oC.z;
 			}
 
 			{
@@ -3489,15 +3521,15 @@
 				if(rgbaWriteMask != 0x0000000F)
 				{
 					Float4 masked = value;
-					oC.a = As<Float4>(As<Int4>(oC.a) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
+					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
-					oC.a = As<Float4>(As<Int4>(oC.a) | As<Int4>(masked));
+					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
 				}
 
-				oC.a = As<Float4>(As<Int4>(oC.a) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
+				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(r.constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(r.constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
-				oC.a = As<Float4>(As<Int4>(oC.a) | As<Int4>(value));
-				*Pointer<Float4>(buffer + 16, 16) = oC.a;
+				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
+				*Pointer<Float4>(buffer + 16, 16) = oC.w;
 			}
 			break;
 		default:
@@ -3508,88 +3540,91 @@
 	void PixelRoutine::ps_1_x(Registers &r, Int cMask[4])
 	{
 		int pad = 0;        // Count number of texm3x3pad instructions
-		Color4i dPairing;   // Destination for first pairing instruction
+		Vector4i dPairing;   // Destination for first pairing instruction
 
-		for(int i = 0; i < pixelShader->getLength(); i++)
+		for(int i = 0; i < shader->getLength(); i++)
 		{
-			const ShaderInstruction *instruction = pixelShader->getInstruction(i);
-			Op::Opcode opcode = instruction->getOpcode();
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
 
 		//	#ifndef NDEBUG   // FIXME: Centralize debug output control
-		//		pixelShader->printInstruction(i, "debug.txt");
+		//		shader->printInstruction(i, "debug.txt");
 		//	#endif
 
-			if(opcode == Op::OPCODE_DCL || opcode == Op::OPCODE_DEF || opcode == Op::OPCODE_DEFI || opcode == Op::OPCODE_DEFB)
+			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
 			{
 				continue;
 			}
 
-			const Dst &dst = instruction->getDestinationParameter();
-			const Src &src0 = instruction->getSourceParameter(0);
-			const Src &src1 = instruction->getSourceParameter(1);
-			const Src &src2 = instruction->getSourceParameter(2);
-			const Src &src3 = instruction->getSourceParameter(3);
+			const Dst &dst = instruction->dst;
+			const Src &src0 = instruction->src[0];
+			const Src &src1 = instruction->src[1];
+			const Src &src2 = instruction->src[2];
 
-			bool pairing = i + 1 < pixelShader->getLength() && pixelShader->getInstruction(i + 1)->isCoissue();   // First instruction of pair
-			bool coissue = instruction->isCoissue();                                                                                // Second instruction of pair
+			unsigned short version = shader->getVersion();
+			bool pairing = i + 1 < shader->getLength() && shader->getInstruction(i + 1)->coissue;   // First instruction of pair
+			bool coissue = instruction->coissue;                                                              // Second instruction of pair
 
-			Color4i d;
-			Color4i s0;
-			Color4i s1;
-			Color4i s2;
-			Color4i s3;
+			Vector4i d;
+			Vector4i s0;
+			Vector4i s1;
+			Vector4i s2;
 
-			if(src0.type != Src::PARAMETER_VOID) s0 = regi(r, src0);
-			if(src1.type != Src::PARAMETER_VOID) s1 = regi(r, src1);
-			if(src2.type != Src::PARAMETER_VOID) s2 = regi(r, src2);
-			if(src3.type != Src::PARAMETER_VOID) s3 = regi(r, src3);
+			if(src0.type != Shader::PARAMETER_VOID) s0 = regi(r, src0);
+			if(src1.type != Shader::PARAMETER_VOID) s1 = regi(r, src1);
+			if(src2.type != Shader::PARAMETER_VOID) s2 = regi(r, src2);
+
+			Float4 u = version < 0x0104 ? r.vf[2 + dst.index].x : r.vf[2 + src0.index].x;
+			Float4 v = version < 0x0104 ? r.vf[2 + dst.index].y : r.vf[2 + src0.index].y;
+			Float4 s = version < 0x0104 ? r.vf[2 + dst.index].z : r.vf[2 + src0.index].z;
+			Float4 t = version < 0x0104 ? r.vf[2 + dst.index].w : r.vf[2 + src0.index].w;
 
 			switch(opcode)
 			{
-			case Op::OPCODE_PS_1_0:															break;
-			case Op::OPCODE_PS_1_1:															break;
-			case Op::OPCODE_PS_1_2:															break;
-			case Op::OPCODE_PS_1_3:															break;
-			case Op::OPCODE_PS_1_4:															break;
+			case Shader::OPCODE_PS_1_0:															break;
+			case Shader::OPCODE_PS_1_1:															break;
+			case Shader::OPCODE_PS_1_2:															break;
+			case Shader::OPCODE_PS_1_3:															break;
+			case Shader::OPCODE_PS_1_4:															break;
 
-			case Op::OPCODE_DEF:															break;
+			case Shader::OPCODE_DEF:															break;
 
-			case Op::OPCODE_NOP:															break;
-			case Op::OPCODE_MOV:			MOV(d, s0);										break;
-			case Op::OPCODE_ADD:			ADD(d, s0, s1);									break;
-			case Op::OPCODE_SUB:			SUB(d, s0, s1);									break;
-			case Op::OPCODE_MAD:			MAD(d, s0, s1, s2);								break;
-			case Op::OPCODE_MUL:			MUL(d, s0, s1);									break;
-			case Op::OPCODE_DP3:			DP3(d, s0, s1);									break;
-			case Op::OPCODE_DP4:			DP4(d, s0, s1);									break;
-			case Op::OPCODE_LRP:			LRP(d, s0, s1, s2);								break;
-			case Op::OPCODE_TEXCOORD:
-				if(pixelShader->getVersion() < 0x0104)
+			case Shader::OPCODE_NOP:															break;
+			case Shader::OPCODE_MOV:			MOV(d, s0);										break;
+			case Shader::OPCODE_ADD:			ADD(d, s0, s1);									break;
+			case Shader::OPCODE_SUB:			SUB(d, s0, s1);									break;
+			case Shader::OPCODE_MAD:			MAD(d, s0, s1, s2);								break;
+			case Shader::OPCODE_MUL:			MUL(d, s0, s1);									break;
+			case Shader::OPCODE_DP3:			DP3(d, s0, s1);									break;
+			case Shader::OPCODE_DP4:			DP4(d, s0, s1);									break;
+			case Shader::OPCODE_LRP:			LRP(d, s0, s1, s2);								break;
+			case Shader::OPCODE_TEXCOORD:
+				if(version < 0x0104)
 				{
-					TEXCOORD(d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index);
+					TEXCOORD(d, u, v, s, dst.index);
 				}
 				else
 				{
 					if((src0.swizzle & 0x30) == 0x20)   // .xyz
 					{
-						TEXCRD(d, Float4(r.vx[2 + src0.index]), Float4(r.vy[2 + src0.index]), Float4(r.vz[2 + src0.index]), src0.index, src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DZ || src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DW);
+						TEXCRD(d, u, v, s, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
 					}
 					else   // .xyw
 					{
-						TEXCRD(d, Float4(r.vx[2 + src0.index]), Float4(r.vy[2 + src0.index]), Float4(r.vw[2 + src0.index]), src0.index, src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DZ || src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DW);
+						TEXCRD(d, u, v, t, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
 					}
 				}
 				break;
-			case Op::OPCODE_TEXKILL:
-				if(pixelShader->getVersion() < 0x0104)
+			case Shader::OPCODE_TEXKILL:
+				if(version < 0x0104)
 				{
-					TEXKILL(cMask, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]));
+					TEXKILL(cMask, u, v, s);
 				}
-				else if(pixelShader->getVersion() == 0x0104)
+				else if(version == 0x0104)
 				{
-					if(dst.type == Dst::PARAMETER_TEXTURE)
+					if(dst.type == Shader::PARAMETER_TEXTURE)
 					{
-						TEXKILL(cMask, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]));
+						TEXKILL(cMask, u, v, s);
 					}
 					else
 					{
@@ -3598,92 +3633,92 @@
 				}
 				else ASSERT(false);
 				break;
-			case Op::OPCODE_TEX:
-				if(pixelShader->getVersion() < 0x0104)
+			case Shader::OPCODE_TEX:
+				if(version < 0x0104)
 				{
-					TEX(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index, false);
+					TEX(r, d, u, v, s, dst.index, false);
 				}
-				else if(pixelShader->getVersion() == 0x0104)
+				else if(version == 0x0104)
 				{
-					if(src0.type == Src::PARAMETER_TEXTURE)
+					if(src0.type == Shader::PARAMETER_TEXTURE)
 					{
 						if((src0.swizzle & 0x30) == 0x20)   // .xyz
 						{
-							TEX(r, d, Float4(r.vx[2 + src0.index]), Float4(r.vy[2 + src0.index]), Float4(r.vz[2 + src0.index]), dst.index, src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DZ || src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DW);
+							TEX(r, d, u, v, s, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
 						}
 						else   // .xyw
 						{
-							TEX(r, d, Float4(r.vx[2 + src0.index]), Float4(r.vy[2 + src0.index]), Float4(r.vw[2 + src0.index]), dst.index, src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DZ || src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DW);
+							TEX(r, d, u, v, t, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
 						}
 					}
 					else
 					{
-						TEXLD(r, d, s0, dst.index, src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DZ || src0.modifier == ShaderInstruction::SourceParameter::MODIFIER_DW);
+						TEXLD(r, d, s0, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
 					}
 				}
 				else ASSERT(false);
 				break;
-			case Op::OPCODE_TEXBEM:			TEXBEM(r, d, s0, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index);	break;
-			case Op::OPCODE_TEXBEML:		TEXBEML(r, d, s0, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index);	break;
-			case Op::OPCODE_TEXREG2AR:		TEXREG2AR(r, d, s0, dst.index);					break;
-			case Op::OPCODE_TEXREG2GB:		TEXREG2GB(r, d, s0, dst.index);					break;
-			case Op::OPCODE_TEXM3X2PAD:		TEXM3X2PAD(r, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), s0, 0, src0.modifier == Src::MODIFIER_SIGN);	break;
-			case Op::OPCODE_TEXM3X2TEX:		TEXM3X2TEX(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index, s0, src0.modifier == Src::MODIFIER_SIGN);	break;
-			case Op::OPCODE_TEXM3X3PAD:		TEXM3X3PAD(r, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), s0, pad++ % 2, src0.modifier == Src::MODIFIER_SIGN);	break;
-			case Op::OPCODE_TEXM3X3TEX:		TEXM3X3TEX(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index, s0, src0.modifier == Src::MODIFIER_SIGN);	break;
-			case Op::OPCODE_TEXM3X3SPEC:	TEXM3X3SPEC(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index, s0, s1);		break;
-			case Op::OPCODE_TEXM3X3VSPEC:	TEXM3X3VSPEC(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index, s0);		break;
-			case Op::OPCODE_CND:			CND(d, s0, s1, s2);								break;
-			case Op::OPCODE_TEXREG2RGB:		TEXREG2RGB(r, d, s0, dst.index);				break;
-			case Op::OPCODE_TEXDP3TEX:		TEXDP3TEX(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), dst.index, s0);	break;
-			case Op::OPCODE_TEXM3X2DEPTH:	TEXM3X2DEPTH(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), s0, src0.modifier == Src::MODIFIER_SIGN);	break;
-			case Op::OPCODE_TEXDP3:			TEXDP3(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), s0);				break;
-			case Op::OPCODE_TEXM3X3:		TEXM3X3(r, d, Float4(r.vx[2 + dst.index]), Float4(r.vy[2 + dst.index]), Float4(r.vz[2 + dst.index]), s0, src0.modifier == Src::MODIFIER_SIGN); 	break;
-			case Op::OPCODE_TEXDEPTH:		TEXDEPTH(r);									break;
-			case Op::OPCODE_CMP:			CMP(d, s0, s1, s2);								break;
-			case Op::OPCODE_BEM:			BEM(r, d, s0, s1, dst.index);					break;
-			case Op::OPCODE_PHASE:															break;
-			case Op::OPCODE_END:															break;
+			case Shader::OPCODE_TEXBEM:			TEXBEM(r, d, s0, u, v, s, dst.index);	break;
+			case Shader::OPCODE_TEXBEML:		TEXBEML(r, d, s0, u, v, s, dst.index);	break;
+			case Shader::OPCODE_TEXREG2AR:		TEXREG2AR(r, d, s0, dst.index);					break;
+			case Shader::OPCODE_TEXREG2GB:		TEXREG2GB(r, d, s0, dst.index);					break;
+			case Shader::OPCODE_TEXM3X2PAD:		TEXM3X2PAD(r, u, v, s, s0, 0, src0.modifier == Shader::MODIFIER_SIGN);	break;
+			case Shader::OPCODE_TEXM3X2TEX:		TEXM3X2TEX(r, d, u, v, s, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN);	break;
+			case Shader::OPCODE_TEXM3X3PAD:		TEXM3X3PAD(r, u, v, s, s0, pad++ % 2, src0.modifier == Shader::MODIFIER_SIGN);	break;
+			case Shader::OPCODE_TEXM3X3TEX:		TEXM3X3TEX(r, d, u, v, s, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN);	break;
+			case Shader::OPCODE_TEXM3X3SPEC:	TEXM3X3SPEC(r, d, u, v, s, dst.index, s0, s1);		break;
+			case Shader::OPCODE_TEXM3X3VSPEC:	TEXM3X3VSPEC(r, d, u, v, s, dst.index, s0);		break;
+			case Shader::OPCODE_CND:			CND(d, s0, s1, s2);								break;
+			case Shader::OPCODE_TEXREG2RGB:		TEXREG2RGB(r, d, s0, dst.index);				break;
+			case Shader::OPCODE_TEXDP3TEX:		TEXDP3TEX(r, d, u, v, s, dst.index, s0);	break;
+			case Shader::OPCODE_TEXM3X2DEPTH:	TEXM3X2DEPTH(r, d, u, v, s, s0, src0.modifier == Shader::MODIFIER_SIGN);	break;
+			case Shader::OPCODE_TEXDP3:			TEXDP3(r, d, u, v, s, s0);				break;
+			case Shader::OPCODE_TEXM3X3:		TEXM3X3(r, d, u, v, s, s0, src0.modifier == Shader::MODIFIER_SIGN); 	break;
+			case Shader::OPCODE_TEXDEPTH:		TEXDEPTH(r);									break;
+			case Shader::OPCODE_CMP0:			CMP(d, s0, s1, s2);								break;
+			case Shader::OPCODE_BEM:			BEM(r, d, s0, s1, dst.index);					break;
+			case Shader::OPCODE_PHASE:															break;
+			case Shader::OPCODE_END:															break;
 			default:
 				ASSERT(false);
 			}
 
-			if(dst.type != Dst::PARAMETER_VOID && opcode != Op::OPCODE_TEXKILL)
+			if(dst.type != Shader::PARAMETER_VOID && opcode != Shader::OPCODE_TEXKILL)
 			{
 				if(dst.shift > 0)
 				{
-					if(dst.mask & 0x1) {d.r = AddSat(d.r, d.r); if(dst.shift > 1) d.r = AddSat(d.r, d.r); if(dst.shift > 2) d.r = AddSat(d.r, d.r);}
-					if(dst.mask & 0x2) {d.g = AddSat(d.g, d.g); if(dst.shift > 1) d.g = AddSat(d.g, d.g); if(dst.shift > 2) d.g = AddSat(d.g, d.g);}
-					if(dst.mask & 0x4) {d.b = AddSat(d.b, d.b); if(dst.shift > 1) d.b = AddSat(d.b, d.b); if(dst.shift > 2) d.b = AddSat(d.b, d.b);}
-					if(dst.mask & 0x8) {d.a = AddSat(d.a, d.a); if(dst.shift > 1) d.a = AddSat(d.a, d.a); if(dst.shift > 2) d.a = AddSat(d.a, d.a);}
+					if(dst.mask & 0x1) {d.x = AddSat(d.x, d.x); if(dst.shift > 1) d.x = AddSat(d.x, d.x); if(dst.shift > 2) d.x = AddSat(d.x, d.x);}
+					if(dst.mask & 0x2) {d.y = AddSat(d.y, d.y); if(dst.shift > 1) d.y = AddSat(d.y, d.y); if(dst.shift > 2) d.y = AddSat(d.y, d.y);}
+					if(dst.mask & 0x4) {d.z = AddSat(d.z, d.z); if(dst.shift > 1) d.z = AddSat(d.z, d.z); if(dst.shift > 2) d.z = AddSat(d.z, d.z);}
+					if(dst.mask & 0x8) {d.w = AddSat(d.w, d.w); if(dst.shift > 1) d.w = AddSat(d.w, d.w); if(dst.shift > 2) d.w = AddSat(d.w, d.w);}
 				}
 				else if(dst.shift < 0)
 				{
-					if(dst.mask & 0x1) d.r = d.r >> -dst.shift;
-					if(dst.mask & 0x2) d.g = d.g >> -dst.shift;
-					if(dst.mask & 0x4) d.b = d.b >> -dst.shift;
-					if(dst.mask & 0x8) d.a = d.a >> -dst.shift;
+					if(dst.mask & 0x1) d.x = d.x >> -dst.shift;
+					if(dst.mask & 0x2) d.y = d.y >> -dst.shift;
+					if(dst.mask & 0x4) d.z = d.z >> -dst.shift;
+					if(dst.mask & 0x8) d.w = d.w >> -dst.shift;
 				}
 
 				if(dst.saturate)
 				{
-					if(dst.mask & 0x1) {d.r = Min(d.r, Short4(0x1000, 0x1000, 0x1000, 0x1000)); d.r = Max(d.r, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
-					if(dst.mask & 0x2) {d.g = Min(d.g, Short4(0x1000, 0x1000, 0x1000, 0x1000)); d.g = Max(d.g, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
-					if(dst.mask & 0x4) {d.b = Min(d.b, Short4(0x1000, 0x1000, 0x1000, 0x1000)); d.b = Max(d.b, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
-					if(dst.mask & 0x8) {d.a = Min(d.a, Short4(0x1000, 0x1000, 0x1000, 0x1000)); d.a = Max(d.a, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
+					if(dst.mask & 0x1) {d.x = Min(d.x, Short4(0x1000)); d.x = Max(d.x, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
+					if(dst.mask & 0x2) {d.y = Min(d.y, Short4(0x1000)); d.y = Max(d.y, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
+					if(dst.mask & 0x4) {d.z = Min(d.z, Short4(0x1000)); d.z = Max(d.z, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
+					if(dst.mask & 0x8) {d.w = Min(d.w, Short4(0x1000)); d.w = Max(d.w, Short4(0x0000, 0x0000, 0x0000, 0x0000));}
 				}
 
 				if(pairing)
 				{
-					if(dst.mask & 0x1) dPairing.r = d.r;
-					if(dst.mask & 0x2) dPairing.g = d.g;
-					if(dst.mask & 0x4) dPairing.b = d.b;
-					if(dst.mask & 0x8) dPairing.a = d.a;
+					if(dst.mask & 0x1) dPairing.x = d.x;
+					if(dst.mask & 0x2) dPairing.y = d.y;
+					if(dst.mask & 0x4) dPairing.z = d.z;
+					if(dst.mask & 0x8) dPairing.w = d.w;
 				}
 			
 				if(coissue)
 				{
-					const Dst &dst = pixelShader->getInstruction(i - 1)->getDestinationParameter();
+					const Dst &dst = shader->getInstruction(i - 1)->dst;
 
 					writeDestination(r, dPairing, dst);
 				}
@@ -3700,47 +3735,60 @@
 	{
 		r.enableIndex = 0;
 		r.stackIndex = 0;
-		
-		for(int i = 0; i < pixelShader->getLength(); i++)
+
+		bool out[4][4] = {false};
+
+		// Create all call site return blocks up front
+		for(int i = 0; i < shader->getLength(); i++)
 		{
-			const ShaderInstruction *instruction = pixelShader->getInstruction(i);
-			Op::Opcode opcode = instruction->getOpcode();
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
 
-		//	#ifndef NDEBUG   // FIXME: Centralize debug output control
-		//		pixelShader->printInstruction(i, "debug.txt");
-		//	#endif
+			if(opcode == Shader::OPCODE_CALL || opcode == Shader::OPCODE_CALLNZ)
+			{
+				const Dst &dst = instruction->dst;
 
-			if(opcode == Op::OPCODE_DCL || opcode == Op::OPCODE_DEF || opcode == Op::OPCODE_DEFI || opcode == Op::OPCODE_DEFB)
+				ASSERT(callRetBlock[dst.label].size() == dst.callSite);
+				callRetBlock[dst.label].push_back(Nucleus::createBasicBlock());
+			}
+		}
+		
+		for(int i = 0; i < shader->getLength(); i++)
+		{
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
+
+			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
 			{
 				continue;
 			}
 
-			const Dst &dst = instruction->getDestinationParameter();
-			const Src &src0 = instruction->getSourceParameter(0);
-			const Src &src1 = instruction->getSourceParameter(1);
-			const Src &src2 = instruction->getSourceParameter(2);
-			const Src &src3 = instruction->getSourceParameter(3);
+			const Dst &dst = instruction->dst;
+			const Src &src0 = instruction->src[0];
+			const Src &src1 = instruction->src[1];
+			const Src &src2 = instruction->src[2];
+			const Src &src3 = instruction->src[3];
 
-			bool predicate = instruction->isPredicate();
-			Control control = instruction->getControl();
+			bool predicate = instruction->predicate;
+			Control control = instruction->control;
 			bool pp = dst.partialPrecision;
-			bool project = instruction->isProject();
-			bool bias = instruction->isBias();
+			bool project = instruction->project;
+			bool bias = instruction->bias;
 
-			Color4f d;
-			Color4f s0;
-			Color4f s1;
-			Color4f s2;
-			Color4f s3;
+			Vector4f d;
+			Vector4f s0;
+			Vector4f s1;
+			Vector4f s2;
+			Vector4f s3;
 
-			if(opcode == Op::OPCODE_TEXKILL)
+			if(opcode == Shader::OPCODE_TEXKILL)   // Takes destination as input
 			{
-				if(dst.type == Dst::PARAMETER_TEXTURE)
+				if(dst.type == Shader::PARAMETER_TEXTURE)
 				{
-					d.x = r.vx[2 + dst.index];
-					d.y = r.vy[2 + dst.index];
-					d.z = r.vz[2 + dst.index];
-					d.w = r.vw[2 + dst.index];
+					d.x = r.vf[2 + dst.index].x;
+					d.y = r.vf[2 + dst.index].y;
+					d.z = r.vf[2 + dst.index].z;
+					d.w = r.vf[2 + dst.index].w;
 				}
 				else
 				{
@@ -3748,120 +3796,212 @@
 				}
 			}
 
-			if(src0.type != Src::PARAMETER_VOID) s0 = reg(r, src0);
-			if(src1.type != Src::PARAMETER_VOID) s1 = reg(r, src1);
-			if(src2.type != Src::PARAMETER_VOID) s2 = reg(r, src2);
-			if(src3.type != Src::PARAMETER_VOID) s3 = reg(r, src3);
+			if(src0.type != Shader::PARAMETER_VOID) s0 = reg(r, src0);
+			if(src1.type != Shader::PARAMETER_VOID) s1 = reg(r, src1);
+			if(src2.type != Shader::PARAMETER_VOID) s2 = reg(r, src2);
+			if(src3.type != Shader::PARAMETER_VOID) s3 = reg(r, src3);
 
 			switch(opcode)
 			{
-			case Op::OPCODE_PS_2_0:														break;
-			case Op::OPCODE_PS_2_x:														break;
-			case Op::OPCODE_PS_3_0:														break;
-			case Op::OPCODE_DEF:														break;
-			case Op::OPCODE_DCL:														break;
-			case Op::OPCODE_NOP:														break;
-			case Op::OPCODE_MOV:		mov(d, s0);										break;
-			case Op::OPCODE_ADD:		add(d, s0, s1);									break;
-			case Op::OPCODE_SUB:		sub(d, s0, s1);									break;
-			case Op::OPCODE_MUL:		mul(d, s0, s1);									break;
-			case Op::OPCODE_MAD:		mad(d, s0, s1, s2);								break;
-			case Op::OPCODE_DP2ADD:		dp2add(d, s0, s1, s2);							break;
-			case Op::OPCODE_DP3:		dp3(d, s0, s1);									break;
-			case Op::OPCODE_DP4:		dp4(d, s0, s1);									break;
-			case Op::OPCODE_CMP:		cmp(d, s0, s1, s2);								break;
-			case Op::OPCODE_FRC:		frc(d, s0);										break;
-			case Op::OPCODE_EXP:		exp(d, s0, pp);									break;
-			case Op::OPCODE_LOG:		log(d, s0, pp);									break;
-			case Op::OPCODE_RCP:		rcp(d, s0, pp);									break;
-			case Op::OPCODE_RSQ:		rsq(d, s0, pp);									break;
-			case Op::OPCODE_MIN:		min(d, s0, s1);									break;
-			case Op::OPCODE_MAX:		max(d, s0, s1);									break;
-			case Op::OPCODE_LRP:		lrp(d, s0, s1, s2);								break;
-			case Op::OPCODE_POW:		pow(d, s0, s1, pp);								break;
-			case Op::OPCODE_CRS:		crs(d, s0, s1);									break;
-			case Op::OPCODE_NRM:		nrm(d, s0, pp);									break;
-			case Op::OPCODE_ABS:		abs(d, s0);										break;
-			case Op::OPCODE_SINCOS:		sincos(d, s0, pp);								break;
-			case Op::OPCODE_M4X4:		M4X4(r, d, s0, src1);							break;
-			case Op::OPCODE_M4X3:		M4X3(r, d, s0, src1);							break;
-			case Op::OPCODE_M3X4:		M3X4(r, d, s0, src1);							break;
-			case Op::OPCODE_M3X3:		M3X3(r, d, s0, src1);							break;
-			case Op::OPCODE_M3X2:		M3X2(r, d, s0, src1);							break;
-			case Op::OPCODE_TEX:		TEXLD(r, d, s0, src1, project, bias);			break;
-			case Op::OPCODE_TEXLDD:		TEXLDD(r, d, s0, src1, s2, s3, project, bias);	break;
-			case Op::OPCODE_TEXLDL:		TEXLDL(r, d, s0, src1, project, bias);			break;
-			case Op::OPCODE_TEXKILL:	TEXKILL(cMask, d, dst.mask);					break;
-			case Op::OPCODE_DSX:		DSX(d, s0);										break;
-			case Op::OPCODE_DSY:		DSY(d, s0);										break;
-			case Op::OPCODE_BREAK:		BREAK(r);										break;
-			case Op::OPCODE_BREAKC:		BREAKC(r, s0, s1, control);						break;
-			case Op::OPCODE_BREAKP:		BREAKP(r, src0);								break;
-			case Op::OPCODE_CALL:		CALL(r, dst.index);								break;
-			case Op::OPCODE_CALLNZ:		CALLNZ(r, dst.index, src0);						break;
-			case Op::OPCODE_ELSE:		ELSE(r);										break;
-			case Op::OPCODE_ENDIF:		ENDIF(r);										break;
-			case Op::OPCODE_ENDLOOP:	ENDLOOP(r);										break;
-			case Op::OPCODE_ENDREP:		ENDREP(r);										break;
-			case Op::OPCODE_IF:			IF(r, src0);									break;
-			case Op::OPCODE_IFC:		IFC(r, s0, s1, control);						break;
-			case Op::OPCODE_LABEL:		LABEL(dst.index);								break;
-			case Op::OPCODE_LOOP:		LOOP(r, src1);									break;
-			case Op::OPCODE_REP:		REP(r, src0);									break;
-			case Op::OPCODE_RET:		RET(r);											break;
-			case Op::OPCODE_SETP:		setp(d, s0, s1, control);						break;
-			case Op::OPCODE_END:														break;
+			case Shader::OPCODE_PS_2_0:														break;
+			case Shader::OPCODE_PS_2_x:														break;
+			case Shader::OPCODE_PS_3_0:														break;
+			case Shader::OPCODE_DEF:														break;
+			case Shader::OPCODE_DCL:														break;
+			case Shader::OPCODE_NOP:														break;
+			case Shader::OPCODE_MOV:		mov(d, s0);										break;
+			case Shader::OPCODE_F2B:		f2b(d, s0);										break;
+			case Shader::OPCODE_B2F:		b2f(d, s0);										break;
+			case Shader::OPCODE_ADD:		add(d, s0, s1);									break;
+			case Shader::OPCODE_SUB:		sub(d, s0, s1);									break;
+			case Shader::OPCODE_MUL:		mul(d, s0, s1);									break;
+			case Shader::OPCODE_MAD:		mad(d, s0, s1, s2);								break;
+			case Shader::OPCODE_DP1:		dp1(d, s0, s1);									break;
+			case Shader::OPCODE_DP2:		dp2(d, s0, s1);									break;
+			case Shader::OPCODE_DP2ADD:		dp2add(d, s0, s1, s2);							break;
+			case Shader::OPCODE_DP3:		dp3(d, s0, s1);									break;
+			case Shader::OPCODE_DP4:		dp4(d, s0, s1);									break;
+			case Shader::OPCODE_CMP0:		cmp0(d, s0, s1, s2);							break;
+			case Shader::OPCODE_ICMP:		icmp(d, s0, s1, control);						break;
+			case Shader::OPCODE_SELECT:		select(d, s0, s1, s2);							break;
+			case Shader::OPCODE_EXTRACT:	extract(d.x, s0, s1.x);							break;
+			case Shader::OPCODE_INSERT:		insert(d, s0, s1.x, s2.x);						break;
+			case Shader::OPCODE_FRC:		frc(d, s0);										break;
+			case Shader::OPCODE_TRUNC:      trunc(d, s0);                                   break;
+			case Shader::OPCODE_FLOOR:      floor(d, s0);                                   break;
+			case Shader::OPCODE_CEIL:       ceil(d, s0);                                    break;
+			case Shader::OPCODE_EXP2X:		exp2x(d, s0, pp);								break;
+			case Shader::OPCODE_EXP2:		exp2(d, s0, pp);								break;
+			case Shader::OPCODE_LOG2X:		log2x(d, s0, pp);								break;
+			case Shader::OPCODE_LOG2:		log2(d, s0, pp);								break;
+			case Shader::OPCODE_EXP:		exp(d, s0, pp);									break;
+			case Shader::OPCODE_LOG:		log(d, s0, pp);									break;
+			case Shader::OPCODE_RCPX:		rcpx(d, s0, pp);								break;
+			case Shader::OPCODE_DIV:		div(d, s0, s1);									break;
+			case Shader::OPCODE_MOD:		mod(d, s0, s1);									break;
+			case Shader::OPCODE_RSQX:		rsqx(d, s0, pp);								break;
+			case Shader::OPCODE_SQRT:		sqrt(d, s0, pp);								break;
+			case Shader::OPCODE_RSQ:		rsq(d, s0, pp);									break;
+			case Shader::OPCODE_LEN2:		len2(d.x, s0, pp);								break;
+			case Shader::OPCODE_LEN3:		len3(d.x, s0, pp);								break;
+			case Shader::OPCODE_LEN4:		len4(d.x, s0, pp);								break;
+			case Shader::OPCODE_DIST1:		dist1(d.x, s0, s1, pp);							break;
+			case Shader::OPCODE_DIST2:		dist2(d.x, s0, s1, pp);							break;
+			case Shader::OPCODE_DIST3:		dist3(d.x, s0, s1, pp);							break;
+			case Shader::OPCODE_DIST4:		dist4(d.x, s0, s1, pp);							break;
+			case Shader::OPCODE_MIN:		min(d, s0, s1);									break;
+			case Shader::OPCODE_MAX:		max(d, s0, s1);									break;
+			case Shader::OPCODE_LRP:		lrp(d, s0, s1, s2);								break;
+			case Shader::OPCODE_STEP:		step(d, s0, s1);								break;
+			case Shader::OPCODE_SMOOTH:		smooth(d, s0, s1, s2);							break;
+			case Shader::OPCODE_POWX:		powx(d, s0, s1, pp);							break;
+			case Shader::OPCODE_POW:		pow(d, s0, s1, pp);								break;
+			case Shader::OPCODE_SGN:		sgn(d, s0);										break;
+			case Shader::OPCODE_CRS:		crs(d, s0, s1);									break;
+			case Shader::OPCODE_FORWARD1:	forward1(d, s0, s1, s2);						break;
+			case Shader::OPCODE_FORWARD2:	forward2(d, s0, s1, s2);						break;
+			case Shader::OPCODE_FORWARD3:	forward3(d, s0, s1, s2);						break;
+			case Shader::OPCODE_FORWARD4:	forward4(d, s0, s1, s2);						break;
+			case Shader::OPCODE_REFLECT1:	reflect1(d, s0, s1);							break;
+			case Shader::OPCODE_REFLECT2:	reflect2(d, s0, s1);							break;
+			case Shader::OPCODE_REFLECT3:	reflect3(d, s0, s1);							break;
+			case Shader::OPCODE_REFLECT4:	reflect4(d, s0, s1);							break;
+			case Shader::OPCODE_REFRACT1:	refract1(d, s0, s1, s2.x);						break;
+			case Shader::OPCODE_REFRACT2:	refract2(d, s0, s1, s2.x);						break;
+			case Shader::OPCODE_REFRACT3:	refract3(d, s0, s1, s2.x);						break;
+			case Shader::OPCODE_REFRACT4:	refract4(d, s0, s1, s2.x);						break;
+			case Shader::OPCODE_NRM2:		nrm2(d, s0, pp);								break;
+			case Shader::OPCODE_NRM3:		nrm3(d, s0, pp);								break;
+			case Shader::OPCODE_NRM4:		nrm4(d, s0, pp);								break;
+			case Shader::OPCODE_ABS:		abs(d, s0);										break;
+			case Shader::OPCODE_SINCOS:		sincos(d, s0, pp);								break;
+			case Shader::OPCODE_COS:		cos(d, s0, pp);									break;
+			case Shader::OPCODE_SIN:		sin(d, s0, pp);									break;
+			case Shader::OPCODE_TAN:		tan(d, s0, pp);									break;
+			case Shader::OPCODE_ACOS:		acos(d, s0, pp);								break;
+			case Shader::OPCODE_ASIN:		asin(d, s0, pp);								break;
+			case Shader::OPCODE_ATAN:		atan(d, s0, pp);								break;
+			case Shader::OPCODE_ATAN2:		atan2(d, s0, s1, pp);							break;
+			case Shader::OPCODE_M4X4:		M4X4(r, d, s0, src1);							break;
+			case Shader::OPCODE_M4X3:		M4X3(r, d, s0, src1);							break;
+			case Shader::OPCODE_M3X4:		M3X4(r, d, s0, src1);							break;
+			case Shader::OPCODE_M3X3:		M3X3(r, d, s0, src1);							break;
+			case Shader::OPCODE_M3X2:		M3X2(r, d, s0, src1);							break;
+			case Shader::OPCODE_TEX:		TEXLD(r, d, s0, src1, project, bias);			break;
+			case Shader::OPCODE_TEXLDD:		TEXLDD(r, d, s0, src1, s2, s3, project, bias);	break;
+			case Shader::OPCODE_TEXLDL:		TEXLDL(r, d, s0, src1, project, bias);			break;
+			case Shader::OPCODE_TEXKILL:	TEXKILL(cMask, d, dst.mask);					break;
+			case Shader::OPCODE_DISCARD:	DISCARD(r, cMask, instruction);					break;
+			case Shader::OPCODE_DFDX:		DFDX(d, s0);									break;
+			case Shader::OPCODE_DFDY:		DFDY(d, s0);									break;
+			case Shader::OPCODE_FWIDTH:		FWIDTH(d, s0);									break;
+			case Shader::OPCODE_BREAK:		BREAK(r);										break;
+			case Shader::OPCODE_BREAKC:		BREAKC(r, s0, s1, control);						break;
+			case Shader::OPCODE_BREAKP:		BREAKP(r, src0);								break;
+			case Shader::OPCODE_CONTINUE:	CONTINUE(r);									break;
+			case Shader::OPCODE_TEST:		TEST();											break;
+			case Shader::OPCODE_CALL:		CALL(r, dst.label, dst.callSite);               break;
+			case Shader::OPCODE_CALLNZ:		CALLNZ(r, dst.label, dst.callSite, src0);       break;
+			case Shader::OPCODE_ELSE:		ELSE(r);										break;
+			case Shader::OPCODE_ENDIF:		ENDIF(r);										break;
+			case Shader::OPCODE_ENDLOOP:	ENDLOOP(r);										break;
+			case Shader::OPCODE_ENDREP:		ENDREP(r);										break;
+			case Shader::OPCODE_ENDWHILE:	ENDWHILE(r);	     							break;
+			case Shader::OPCODE_IF:			IF(r, src0);									break;
+			case Shader::OPCODE_IFC:		IFC(r, s0, s1, control);						break;
+			case Shader::OPCODE_LABEL:		LABEL(dst.index);								break;
+			case Shader::OPCODE_LOOP:		LOOP(r, src1);									break;
+			case Shader::OPCODE_REP:		REP(r, src0);									break;
+			case Shader::OPCODE_WHILE:		WHILE(r, src0);									break;
+			case Shader::OPCODE_RET:		RET(r);											break;
+			case Shader::OPCODE_LEAVE:		LEAVE(r);										break;
+			case Shader::OPCODE_CMP:		cmp(d, s0, s1, control);						break;
+			case Shader::OPCODE_ALL:		all(d.x, s0);									break;
+			case Shader::OPCODE_ANY:		any(d.x, s0);									break;
+			case Shader::OPCODE_NOT:		not(d, s0);										break;
+			case Shader::OPCODE_OR:			or(d.x, s0.x, s1.x);							break;
+			case Shader::OPCODE_XOR:		xor(d.x, s0.x, s1.x);							break;
+			case Shader::OPCODE_AND:		and(d.x, s0.x, s1.x);							break;
+			case Shader::OPCODE_END:														break;
 			default:
 				ASSERT(false);
 			}
 
-			if(dst.type != Dst::PARAMETER_VOID && dst.type != Dst::PARAMETER_LABEL && opcode != Op::OPCODE_TEXKILL)
+			if(dst.type != Shader::PARAMETER_VOID && dst.type != Shader::PARAMETER_LABEL && opcode != Shader::OPCODE_TEXKILL && opcode != Shader::OPCODE_NOP)
 			{
-				if(dst.saturate)
+				if(dst.integer)
 				{
-					if(dst.x) d.r = Max(d.r, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					if(dst.y) d.g = Max(d.g, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					if(dst.z) d.b = Max(d.b, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					if(dst.w) d.a = Max(d.a, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-
-					if(dst.x) d.r = Min(d.r, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-					if(dst.y) d.g = Min(d.g, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-					if(dst.z) d.b = Min(d.b, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-					if(dst.w) d.a = Min(d.a, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+					switch(opcode)
+					{
+					case Shader::OPCODE_DIV:
+						if(dst.x) d.x = Trunc(d.x);
+						if(dst.y) d.y = Trunc(d.y);
+						if(dst.z) d.z = Trunc(d.z);
+						if(dst.w) d.w = Trunc(d.w);
+						break;
+					default:
+						break;   // No truncation to integer required when arguments are integer
+					}
 				}
 
-				if(pixelShader->containsDynamicBranching())
+				if(dst.saturate)
 				{
-					Color4f pDst;   // FIXME: Rename
+					if(dst.x) d.x = Max(d.x, Float4(0.0f));
+					if(dst.y) d.y = Max(d.y, Float4(0.0f));
+					if(dst.z) d.z = Max(d.z, Float4(0.0f));
+					if(dst.w) d.w = Max(d.w, Float4(0.0f));
+
+					if(dst.x) d.x = Min(d.x, Float4(1.0f));
+					if(dst.y) d.y = Min(d.y, Float4(1.0f));
+					if(dst.z) d.z = Min(d.z, Float4(1.0f));
+					if(dst.w) d.w = Min(d.w, Float4(1.0f));
+				}
+
+				if(shader->containsDynamicBranching())
+				{
+					Vector4f pDst;   // FIXME: Rename
 
 					switch(dst.type)
 					{
-					case Dst::PARAMETER_TEMP:
-						if(dst.x) pDst.x = r.rf[dst.index].x;
-						if(dst.y) pDst.y = r.rf[dst.index].y;
-						if(dst.z) pDst.z = r.rf[dst.index].z;
-						if(dst.w) pDst.w = r.rf[dst.index].w;
+					case Shader::PARAMETER_TEMP:
+						if(dst.rel.type == Shader::PARAMETER_VOID)
+						{
+							if(dst.x) pDst.x = r.rf[dst.index].x;
+							if(dst.y) pDst.y = r.rf[dst.index].y;
+							if(dst.z) pDst.z = r.rf[dst.index].z;
+							if(dst.w) pDst.w = r.rf[dst.index].w;
+						}
+						else
+						{
+							Int a = relativeAddress(r, dst);
+
+							if(dst.x) pDst.x = r.rf[dst.index + a].x;
+							if(dst.y) pDst.y = r.rf[dst.index + a].y;
+							if(dst.z) pDst.z = r.rf[dst.index + a].z;
+							if(dst.w) pDst.w = r.rf[dst.index + a].w;
+						}
 						break;
-					case Dst::PARAMETER_COLOROUT:
+					case Shader::PARAMETER_COLOROUT:
+						ASSERT(dst.rel.type == Shader::PARAMETER_VOID);
 						if(dst.x) pDst.x = r.oC[dst.index].x;
 						if(dst.y) pDst.y = r.oC[dst.index].y;
 						if(dst.z) pDst.z = r.oC[dst.index].z;
 						if(dst.w) pDst.w = r.oC[dst.index].w;
 						break;
-					case Dst::PARAMETER_PREDICATE:
+					case Shader::PARAMETER_PREDICATE:
 						if(dst.x) pDst.x = r.p0.x;
 						if(dst.y) pDst.y = r.p0.y;
 						if(dst.z) pDst.z = r.p0.z;
 						if(dst.w) pDst.w = r.p0.w;
 						break;
-					case Dst::PARAMETER_DEPTHOUT:
+					case Shader::PARAMETER_DEPTHOUT:
 						pDst.x = r.oDepth;
 						break;
 					default:
 						ASSERT(false);
 					}
 				
-					Int4 enable = r.enableStack[r.enableIndex] & r.enableBreak;
+					Int4 enable = enableMask(r, instruction);
 
 					Int4 xEnable = enable;
 					Int4 yEnable = enable;
@@ -3870,14 +4010,14 @@
 
 					if(predicate)
 					{
-						unsigned char pSwizzle = instruction->getPredicateSwizzle();
+						unsigned char pSwizzle = instruction->predicateSwizzle;
 
 						Float4 xPredicate = r.p0[(pSwizzle >> 0) & 0x03];
 						Float4 yPredicate = r.p0[(pSwizzle >> 2) & 0x03];
 						Float4 zPredicate = r.p0[(pSwizzle >> 4) & 0x03];
 						Float4 wPredicate = r.p0[(pSwizzle >> 6) & 0x03];
 
-						if(!instruction->isPredicateNot())
+						if(!instruction->predicateNot)
 						{
 							if(dst.x) xEnable = xEnable & As<Int4>(xPredicate);
 							if(dst.y) yEnable = yEnable & As<Int4>(yPredicate);
@@ -3906,25 +4046,38 @@
 
 				switch(dst.type)
 				{
-				case Dst::PARAMETER_TEMP:
-					if(dst.x) r.rf[dst.index].x = d.x;
-					if(dst.y) r.rf[dst.index].y = d.y;
-					if(dst.z) r.rf[dst.index].z = d.z;
-					if(dst.w) r.rf[dst.index].w = d.w;
+				case Shader::PARAMETER_TEMP:
+					if(dst.rel.type == Shader::PARAMETER_VOID)
+					{
+						if(dst.x) r.rf[dst.index].x = d.x;
+						if(dst.y) r.rf[dst.index].y = d.y;
+						if(dst.z) r.rf[dst.index].z = d.z;
+						if(dst.w) r.rf[dst.index].w = d.w;
+					}
+					else
+					{
+						Int a = relativeAddress(r, dst);
+
+						if(dst.x) r.rf[dst.index + a].x = d.x;
+						if(dst.y) r.rf[dst.index + a].y = d.y;
+						if(dst.z) r.rf[dst.index + a].z = d.z;
+						if(dst.w) r.rf[dst.index + a].w = d.w;
+					}
 					break;
-				case Dst::PARAMETER_COLOROUT:
-					if(dst.x) r.oC[dst.index].x = d.x;
-					if(dst.y) r.oC[dst.index].y = d.y;
-					if(dst.z) r.oC[dst.index].z = d.z;
-					if(dst.w) r.oC[dst.index].w = d.w;
+				case Shader::PARAMETER_COLOROUT:
+					ASSERT(dst.rel.type == Shader::PARAMETER_VOID);
+					if(dst.x) {r.oC[dst.index].x = d.x; out[dst.index][0] = true;}
+					if(dst.y) {r.oC[dst.index].y = d.y; out[dst.index][1] = true;}
+					if(dst.z) {r.oC[dst.index].z = d.z; out[dst.index][2] = true;}
+					if(dst.w) {r.oC[dst.index].w = d.w; out[dst.index][3] = true;}
 					break;
-				case Dst::PARAMETER_PREDICATE:
+				case Shader::PARAMETER_PREDICATE:
 					if(dst.x) r.p0.x = d.x;
 					if(dst.y) r.p0.y = d.y;
 					if(dst.z) r.p0.z = d.z;
 					if(dst.w) r.p0.w = d.w;
 					break;
-				case Dst::PARAMETER_DEPTHOUT:
+				case Shader::PARAMETER_DEPTHOUT:
 					r.oDepth = d.x;
 					break;
 				default:
@@ -3933,36 +4086,47 @@
 			}
 		}
 
-		if(returns)
+		if(currentLabel != -1)
 		{
 			Nucleus::setInsertBlock(returnBlock);
 		}
+
+		for(int i = 0; i < 4; i++)
+		{
+			if((Format)state.targetFormat[i] != FORMAT_NULL)
+			{
+				if(!out[i][0]) r.oC[i].x = Float4(0.0f);
+				if(!out[i][1]) r.oC[i].y = Float4(0.0f);
+				if(!out[i][2]) r.oC[i].z = Float4(0.0f);
+				if(!out[i][3]) r.oC[i].w = Float4(0.0f);
+			}
+		}
 	}
 
-	Short4 PixelRoutine::convertFixed12(Float4 &cf)
+	Short4 PixelRoutine::convertFixed12(RValue<Float4> cf)   // Float to fixed-point conversion using 0x1000 as the scale factor
 	{
-		return RoundShort4(cf * Float4(0x1000, 0x1000, 0x1000, 0x1000));
+		return RoundShort4(cf * Float4(0x1000));   // Multiply by 0x1000 (so 1.0f yields 0x1000), then convert to Short4 with RoundShort4
 	}
 
-	void PixelRoutine::convertFixed12(Color4i &ci, Color4f &cf)
+	void PixelRoutine::convertFixed12(Vector4i &ci, Vector4f &cf)   // Four-component overload: fills ci.x/y/z/w from cf.x/y/z/w
 	{
-		ci.r = convertFixed12(cf.r);
-		ci.g = convertFixed12(cf.g);
-		ci.b = convertFixed12(cf.b);
-		ci.a = convertFixed12(cf.a);
+		ci.x = convertFixed12(cf.x);   // Each component is converted independently by the one-argument convertFixed12 overload
+		ci.y = convertFixed12(cf.y);
+		ci.z = convertFixed12(cf.z);
+		ci.w = convertFixed12(cf.w);
 	}
 
 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
 	{
-		return UShort4(cf * Float4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF), saturate);
+		return UShort4(cf * Float4(0xFFFF), saturate);   // Multiply by 0xFFFF (so 1.0f yields 0xFFFF); 'saturate' is passed straight through to the UShort4 conversion
 	}
 
-	void PixelRoutine::convertFixed16(Color4i &ci, Color4f &cf, bool saturate)
+	void PixelRoutine::convertFixed16(Vector4i &ci, Vector4f &cf, bool saturate)   // Four-component overload: fills ci.x/y/z/w from cf.x/y/z/w
 	{
-		ci.r = convertFixed16(cf.r, saturate);
-		ci.g = convertFixed16(cf.g, saturate);
-		ci.b = convertFixed16(cf.b, saturate);
-		ci.a = convertFixed16(cf.a, saturate);
+		ci.x = convertFixed16(cf.x, saturate);   // Each component is converted by the two-argument convertFixed16 overload with the same 'saturate' flag
+		ci.y = convertFixed16(cf.y, saturate);
+		ci.z = convertFixed16(cf.z, saturate);
+		ci.w = convertFixed16(cf.w, saturate);
 	}
 
 	Float4 PixelRoutine::convertSigned12(Short4 &ci)
@@ -3970,75 +4134,75 @@
 		return Float4(ci) * Float4(1.0f / 0x0FFE);
 	}
 
-	void PixelRoutine::convertSigned12(Color4f &cf, Color4i &ci)
+	void PixelRoutine::convertSigned12(Vector4f &cf, Vector4i &ci)
 	{
-		cf.r = convertSigned12(ci.r);
-		cf.g = convertSigned12(ci.g);
-		cf.b = convertSigned12(ci.b);
-		cf.a = convertSigned12(ci.a);
+		cf.x = convertSigned12(ci.x);
+		cf.y = convertSigned12(ci.y);
+		cf.z = convertSigned12(ci.z);
+		cf.w = convertSigned12(ci.w);
 	}
 
 	Float4 PixelRoutine::convertUnsigned16(UShort4 ci)
 	{
-		return Float4(ci) * Float4(1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF);
+		return Float4(ci) * Float4(1.0f / 0xFFFF);
 	}
 
-	void PixelRoutine::sRGBtoLinear16_16(Registers &r, Color4i &c)
+	void PixelRoutine::sRGBtoLinear16_16(Registers &r, Vector4i &c)
 	{
-		c.r = As<UShort4>(c.r) >> 4;
-		c.g = As<UShort4>(c.g) >> 4;
-		c.b = As<UShort4>(c.b) >> 4;
+		c.x = As<UShort4>(c.x) >> 4;
+		c.y = As<UShort4>(c.y) >> 4;
+		c.z = As<UShort4>(c.z) >> 4;
 
 		sRGBtoLinear12_16(r, c);
 	}
 
-	void PixelRoutine::sRGBtoLinear12_16(Registers &r, Color4i &c)
+	void PixelRoutine::sRGBtoLinear12_16(Registers &r, Vector4i &c)
 	{
 		Pointer<Byte> LUT = r.constants + OFFSET(Constants,sRGBtoLin12_16);
 
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 0))), 0);
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 1))), 1);
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 2))), 2);
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 3))), 3);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
 
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 0))), 0);
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 1))), 1);
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 2))), 2);
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 3))), 3);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
 
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 0))), 0);
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 1))), 1);
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 2))), 2);
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 3))), 3);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
 	}
 
-	void PixelRoutine::linearToSRGB16_16(Registers &r, Color4i &c)
+	void PixelRoutine::linearToSRGB16_16(Registers &r, Vector4i &c)
 	{
-		c.r = As<UShort4>(c.r) >> 4;
-		c.g = As<UShort4>(c.g) >> 4;
-		c.b = As<UShort4>(c.b) >> 4;
+		c.x = As<UShort4>(c.x) >> 4;
+		c.y = As<UShort4>(c.y) >> 4;
+		c.z = As<UShort4>(c.z) >> 4;
 
 		linearToSRGB12_16(r, c);
 	}
 
-	void PixelRoutine::linearToSRGB12_16(Registers &r, Color4i &c)
+	void PixelRoutine::linearToSRGB12_16(Registers &r, Vector4i &c)
 	{
 		Pointer<Byte> LUT = r.constants + OFFSET(Constants,linToSRGB12_16);
 
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 0))), 0);
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 1))), 1);
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 2))), 2);
-		c.r = Insert(c.r, *Pointer<Short>(LUT + 2 * Int(Extract(c.r, 3))), 3);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
 
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 0))), 0);
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 1))), 1);
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 2))), 2);
-		c.g = Insert(c.g, *Pointer<Short>(LUT + 2 * Int(Extract(c.g, 3))), 3);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
 
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 0))), 0);
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 1))), 1);
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 2))), 2);
-		c.b = Insert(c.b, *Pointer<Short>(LUT + 2 * Int(Extract(c.b, 3))), 3);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
 	}
 
 	Float4 PixelRoutine::linearToSRGB(const Float4 &x)   // Approximates x^(1.0/2.2)
@@ -4057,31 +4221,31 @@
 		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
 	}
 
-	void PixelRoutine::MOV(Color4i &dst, Color4i &src0)
+	void PixelRoutine::MOV(Vector4i &dst, Vector4i &src0)
 	{
-		dst.r = src0.x;
-		dst.g = src0.y;
-		dst.b = src0.z;
-		dst.a = src0.w;
+		dst.x = src0.x;
+		dst.y = src0.y;
+		dst.z = src0.z;
+		dst.w = src0.w;
 	}
 
-	void PixelRoutine::ADD(Color4i &dst, Color4i &src0, Color4i &src1)
+	void PixelRoutine::ADD(Vector4i &dst, Vector4i &src0, Vector4i &src1)
 	{
-		dst.r = AddSat(src0.x, src1.x);
-		dst.g = AddSat(src0.y, src1.y);
-		dst.b = AddSat(src0.z, src1.z);
-		dst.a = AddSat(src0.w, src1.w);
+		dst.x = AddSat(src0.x, src1.x);
+		dst.y = AddSat(src0.y, src1.y);
+		dst.z = AddSat(src0.z, src1.z);
+		dst.w = AddSat(src0.w, src1.w);
 	}
 
-	void PixelRoutine::SUB(Color4i &dst, Color4i &src0, Color4i &src1)
+	void PixelRoutine::SUB(Vector4i &dst, Vector4i &src0, Vector4i &src1)
 	{
-		dst.r = SubSat(src0.x, src1.x);
-		dst.g = SubSat(src0.y, src1.y);
-		dst.b = SubSat(src0.z, src1.z);
-		dst.a = SubSat(src0.w, src1.w);
+		dst.x = SubSat(src0.x, src1.x);
+		dst.y = SubSat(src0.y, src1.y);
+		dst.z = SubSat(src0.z, src1.z);
+		dst.w = SubSat(src0.w, src1.w);
 	}
 
-	void PixelRoutine::MAD(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2)
+	void PixelRoutine::MAD(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
 	{
 		// FIXME: Long fixed-point multiply fixup
 		{dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x);}
@@ -4090,7 +4254,7 @@
 		{dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w);}
 	}
 
-	void PixelRoutine::MUL(Color4i &dst, Color4i &src0, Color4i &src1)
+	void PixelRoutine::MUL(Vector4i &dst, Vector4i &src0, Vector4i &src1)
 	{
 		// FIXME: Long fixed-point multiply fixup
 		{dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x);}
@@ -4099,7 +4263,7 @@
 		{dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w);}
 	}
 
-	void PixelRoutine::DP3(Color4i &dst, Color4i &src0, Color4i &src1)
+	void PixelRoutine::DP3(Vector4i &dst, Vector4i &src0, Vector4i &src1)
 	{
 		Short4 t0;
 		Short4 t1;
@@ -4111,13 +4275,13 @@
 		t1 = MulHigh(src0.z, src1.z); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); 
 		t0 = AddSat(t0, t1);
 
-		dst.r = t0;
-		dst.g = t0;
-		dst.b = t0;
-		dst.a = t0;
+		dst.x = t0;
+		dst.y = t0;
+		dst.z = t0;
+		dst.w = t0;
 	}
 
-	void PixelRoutine::DP4(Color4i &dst, Color4i &src0, Color4i &src1)
+	void PixelRoutine::DP4(Vector4i &dst, Vector4i &src0, Vector4i &src1)
 	{
 		Short4 t0;
 		Short4 t1;
@@ -4131,13 +4295,13 @@
 		t1 = MulHigh(src0.w, src1.w); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); 
 		t0 = AddSat(t0, t1);
 
-		dst.r = t0;
-		dst.g = t0;
-		dst.b = t0;
-		dst.a = t0;
+		dst.x = t0;
+		dst.y = t0;
+		dst.z = t0;
+		dst.w = t0;
 	}
 
-	void PixelRoutine::LRP(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2)
+	void PixelRoutine::LRP(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
 	{
 		// FIXME: Long fixed-point multiply fixup
 		{dst.x = SubSat(src1.x, src2.x); dst.x = MulHigh(dst.x, src0.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x);}
@@ -4146,7 +4310,7 @@
 		{dst.w = SubSat(src1.w, src2.w); dst.w = MulHigh(dst.w, src0.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w);}
 	}
 
-	void PixelRoutine::TEXCOORD(Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
+	void PixelRoutine::TEXCOORD(Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
 	{
 		Float4 uw;
 		Float4 vw;
@@ -4154,41 +4318,41 @@
 
 		if(state.interpolant[2 + coordinate].component & 0x01)
 		{
-			uw = Max(u, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-			uw = Min(uw, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-			dst.r = convertFixed12(uw);
+			uw = Max(u, Float4(0.0f));
+			uw = Min(uw, Float4(1.0f));
+			dst.x = convertFixed12(uw);
 		}
 		else
 		{
-			dst.r = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			dst.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 		}
 
 		if(state.interpolant[2 + coordinate].component & 0x02)
 		{
-			vw = Max(v, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-			vw = Min(vw, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-			dst.g = convertFixed12(vw);
+			vw = Max(v, Float4(0.0f));
+			vw = Min(vw, Float4(1.0f));
+			dst.y = convertFixed12(vw);
 		}
 		else
 		{
-			dst.g = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			dst.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 		}
 
 		if(state.interpolant[2 + coordinate].component & 0x04)
 		{
-			sw = Max(s, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-			sw = Min(sw, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-			dst.b = convertFixed12(sw);
+			sw = Max(s, Float4(0.0f));
+			sw = Min(sw, Float4(1.0f));
+			dst.z = convertFixed12(sw);
 		}
 		else
 		{
-			dst.b = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			dst.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 		}
 
-		dst.a = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+		dst.w = Short4(0x1000);
 	}
 
-	void PixelRoutine::TEXCRD(Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
+	void PixelRoutine::TEXCRD(Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
 	{
 		Float4 uw = u;
 		Float4 vw = v;
@@ -4202,68 +4366,68 @@
 
 		if(state.interpolant[2 + coordinate].component & 0x01)
 		{
-			uw *= Float4(0x1000, 0x1000, 0x1000, 0x1000);
-			uw = Max(uw, Float4(-0x8000, -0x8000, -0x8000, -0x8000));
-			uw = Min(uw, Float4(0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF));
-			dst.r = RoundShort4(uw);
+			uw *= Float4(0x1000);
+			uw = Max(uw, Float4(-0x8000));
+			uw = Min(uw, Float4(0x7FFF));
+			dst.x = RoundShort4(uw);
 		}
 		else
 		{
-			dst.r = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			dst.x = Short4(0x0000);
 		}
 
 		if(state.interpolant[2 + coordinate].component & 0x02)
 		{
-			vw *= Float4(0x1000, 0x1000, 0x1000, 0x1000);
-			vw = Max(vw, Float4(-0x8000, -0x8000, -0x8000, -0x8000));
-			vw = Min(vw, Float4(0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF));
-			dst.g = RoundShort4(vw);
+			vw *= Float4(0x1000);
+			vw = Max(vw, Float4(-0x8000));
+			vw = Min(vw, Float4(0x7FFF));
+			dst.y = RoundShort4(vw);
 		}
 		else
 		{
-			dst.g = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			dst.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 		}
 		
 		if(state.interpolant[2 + coordinate].component & 0x04)
 		{
-			sw *= Float4(0x1000, 0x1000, 0x1000, 0x1000);
-			sw = Max(sw, Float4(-0x8000, -0x8000, -0x8000, -0x8000));
-			sw = Min(sw, Float4(0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF));
-			dst.b = RoundShort4(sw);
+			sw *= Float4(0x1000);
+			sw = Max(sw, Float4(-0x8000));
+			sw = Min(sw, Float4(0x7FFF));
+			dst.z = RoundShort4(sw);
 		}
 		else
 		{
-			dst.b = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			dst.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 		}
 	}
 
-	void PixelRoutine::TEXDP3(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, Color4i &src)
+	void PixelRoutine::TEXDP3(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src)
 	{
 		TEXM3X3PAD(r, u, v, s, src, 0, false);
 
-		Short4 t0 = RoundShort4(r.u_ * Float4(0x1000, 0x1000, 0x1000, 0x1000));
+		Short4 t0 = RoundShort4(r.u_ * Float4(0x1000));
 
-		dst.r = t0;
-		dst.g = t0;
-		dst.b = t0;
-		dst.a = t0;
+		dst.x = t0;
+		dst.y = t0;
+		dst.z = t0;
+		dst.w = t0;
 	}
 
-	void PixelRoutine::TEXDP3TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0)
+	void PixelRoutine::TEXDP3TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0)
 	{
 		TEXM3X3PAD(r, u, v, s, src0, 0, false);
 
-		r.v_ = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-		r.w_ = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+		r.v_ = Float4(0.0f);
+		r.w_ = Float4(0.0f);
 
 		sampleTexture(r, dst, stage, r.u_, r.v_, r.w_, r.w_);
 	}
 
 	void PixelRoutine::TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s)
 	{
-		Int kill = SignMask(CmpNLT(u, Float4(0, 0, 0, 0))) &
-		           SignMask(CmpNLT(v, Float4(0, 0, 0, 0))) &
-		           SignMask(CmpNLT(s, Float4(0, 0, 0, 0)));
+		Int kill = SignMask(CmpNLT(u, Float4(0.0f))) &
+		           SignMask(CmpNLT(v, Float4(0.0f))) &
+		           SignMask(CmpNLT(s, Float4(0.0f)));
 
 		for(unsigned int q = 0; q < state.multiSample; q++)
 		{
@@ -4271,9 +4435,9 @@
 		}
 	}
 
-	void PixelRoutine::TEXKILL(Int cMask[4], Color4i &src)
+	void PixelRoutine::TEXKILL(Int cMask[4], Vector4i &src)
 	{
-		Short4 test = src.r | src.g | src.b;
+		Short4 test = src.x | src.y | src.z;
 		Int kill = SignMask(Pack(test, test)) ^ 0x0000000F;
 
 		for(unsigned int q = 0; q < state.multiSample; q++)
@@ -4282,24 +4446,24 @@
 		}
 	}
 
-	void PixelRoutine::TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
+	void PixelRoutine::TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
 	{
 		sampleTexture(r, dst, sampler, u, v, s, s, project);
 	}
 
-	void PixelRoutine::TEXLD(Registers &r, Color4i &dst, Color4i &src, int sampler, bool project)
+	void PixelRoutine::TEXLD(Registers &r, Vector4i &dst, Vector4i &src, int sampler, bool project)
 	{
-		Float4 u = Float4(src.r) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 v = Float4(src.g) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 s = Float4(src.b) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
+		Float4 u = Float4(src.x) * Float4(1.0f / 0x0FFE);
+		Float4 v = Float4(src.y) * Float4(1.0f / 0x0FFE);
+		Float4 s = Float4(src.z) * Float4(1.0f / 0x0FFE);
 
 		sampleTexture(r, dst, sampler, u, v, s, s, project);
 	}
 
-	void PixelRoutine::TEXBEM(Registers &r, Color4i &dst, Color4i &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+	void PixelRoutine::TEXBEM(Registers &r, Vector4i &dst, Vector4i &src, Float4 &u, Float4 &v, Float4 &s, int stage)
 	{
-		Float4 du = Float4(src.r) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 dv = Float4(src.g) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
+		Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
+		Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
 
 		Float4 du2 = du;
 		Float4 dv2 = dv;
@@ -4317,10 +4481,10 @@
 		sampleTexture(r, dst, stage, u_, v_, s, s);
 	}
 
-	void PixelRoutine::TEXBEML(Registers &r, Color4i &dst, Color4i &src, Float4 &u, Float4 &v, Float4 &s, int stage)
+	void PixelRoutine::TEXBEML(Registers &r, Vector4i &dst, Vector4i &src, Float4 &u, Float4 &v, Float4 &s, int stage)
 	{
-		Float4 du = Float4(src.r) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 dv = Float4(src.g) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
+		Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
+		Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
 
 		Float4 du2 = du;
 		Float4 dv2 = dv;
@@ -4339,46 +4503,46 @@
 
 		Short4 L;
 
-		L = src.b;
+		L = src.z;
 		L = MulHigh(L, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].luminanceScale4)));
 		L = L << 4;
 		L = AddSat(L, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].luminanceOffset4)));
 		L = Max(L, Short4(0x0000, 0x0000, 0x0000, 0x0000));
-		L = Min(L, Short4(0x1000, 0x1000, 0x1000, 0x1000));
+		L = Min(L, Short4(0x1000));
 
-		dst.r = MulHigh(dst.r, L); dst.r = dst.r << 4;
-		dst.g = MulHigh(dst.g, L); dst.g = dst.g << 4;
-		dst.b = MulHigh(dst.b, L); dst.b = dst.b << 4;
+		dst.x = MulHigh(dst.x, L); dst.x = dst.x << 4;
+		dst.y = MulHigh(dst.y, L); dst.y = dst.y << 4;
+		dst.z = MulHigh(dst.z, L); dst.z = dst.z << 4;
 	}
 
-	void PixelRoutine::TEXREG2AR(Registers &r, Color4i &dst, Color4i &src0, int stage)
+	void PixelRoutine::TEXREG2AR(Registers &r, Vector4i &dst, Vector4i &src0, int stage)
 	{
-		Float4 u = Float4(src0.a) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 v = Float4(src0.r) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 s = Float4(src0.b) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
+		Float4 u = Float4(src0.w) * Float4(1.0f / 0x0FFE);
+		Float4 v = Float4(src0.x) * Float4(1.0f / 0x0FFE);
+		Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
 
 		sampleTexture(r, dst, stage, u, v, s, s);
 	}
 
-	void PixelRoutine::TEXREG2GB(Registers &r, Color4i &dst, Color4i &src0, int stage)
+	void PixelRoutine::TEXREG2GB(Registers &r, Vector4i &dst, Vector4i &src0, int stage)
 	{
-		Float4 u = Float4(src0.g) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 v = Float4(src0.b) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
+		Float4 u = Float4(src0.y) * Float4(1.0f / 0x0FFE);
+		Float4 v = Float4(src0.z) * Float4(1.0f / 0x0FFE);
 		Float4 s = v;
 
 		sampleTexture(r, dst, stage, u, v, s, s);
 	}
 
-	void PixelRoutine::TEXREG2RGB(Registers &r, Color4i &dst, Color4i &src0, int stage)
+	void PixelRoutine::TEXREG2RGB(Registers &r, Vector4i &dst, Vector4i &src0, int stage)
 	{
-		Float4 u = Float4(src0.r) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 v = Float4(src0.g) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		Float4 s = Float4(src0.b) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
+		Float4 u = Float4(src0.x) * Float4(1.0f / 0x0FFE);
+		Float4 v = Float4(src0.y) * Float4(1.0f / 0x0FFE);
+		Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
 
 		sampleTexture(r, dst, stage, u, v, s, s);
 	}
 
-	void PixelRoutine::TEXM3X2DEPTH(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, Color4i &src, bool signedScaling)
+	void PixelRoutine::TEXM3X2DEPTH(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src, bool signedScaling)
 	{
 		TEXM3X2PAD(r, u, v, s, src, 1, signedScaling);
 
@@ -4388,44 +4552,44 @@
 		r.oDepth = r.u_;
 	}
 
-	void PixelRoutine::TEXM3X2PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Color4i &src0, int component, bool signedScaling)
+	void PixelRoutine::TEXM3X2PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, int component, bool signedScaling)
 	{
 		TEXM3X3PAD(r, u, v, s, src0, component, signedScaling);
 	}
 
-	void PixelRoutine::TEXM3X2TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0, bool signedScaling)
+	void PixelRoutine::TEXM3X2TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, bool signedScaling)
 	{
 		TEXM3X2PAD(r, u, v, s, src0, 1, signedScaling);
 
-		r.w_ = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+		r.w_ = Float4(0.0f);
 
 		sampleTexture(r, dst, stage, r.u_, r.v_, r.w_, r.w_);
 	}
 
-	void PixelRoutine::TEXM3X3(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, Color4i &src0, bool signedScaling)
+	void PixelRoutine::TEXM3X3(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, bool signedScaling)
 	{
 		TEXM3X3PAD(r, u, v, s, src0, 2, signedScaling);
 
-		dst.r = RoundShort4(r.u_ * Float4(0x1000, 0x1000, 0x1000, 0x1000));
-		dst.g = RoundShort4(r.v_ * Float4(0x1000, 0x1000, 0x1000, 0x1000));
-		dst.b = RoundShort4(r.w_ * Float4(0x1000, 0x1000, 0x1000, 0x1000));
-		dst.a = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+		dst.x = RoundShort4(r.u_ * Float4(0x1000));
+		dst.y = RoundShort4(r.v_ * Float4(0x1000));
+		dst.z = RoundShort4(r.w_ * Float4(0x1000));
+		dst.w = Short4(0x1000);
 	}
 
-	void PixelRoutine::TEXM3X3PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Color4i &src0, int component, bool signedScaling)
+	void PixelRoutine::TEXM3X3PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, int component, bool signedScaling)
 	{
 		if(component == 0 || previousScaling != signedScaling)   // FIXME: Other source modifiers?
 		{
-			r.U = Float4(src0.r);
-			r.V = Float4(src0.g);
-			r.W = Float4(src0.b);
+			r.U = Float4(src0.x);
+			r.V = Float4(src0.y);
+			r.W = Float4(src0.z);
 
 			previousScaling = signedScaling;
 		}
 
 		Float4 x = r.U * u + r.V * v + r.W * s;
 
-		x *= Float4(1.0f / 0x1000, 1.0f / 0x1000, 1.0f / 0x1000, 1.0f / 0x1000);
+		x *= Float4(1.0f / 0x1000);
 
 		switch(component)
 		{
@@ -4436,15 +4600,15 @@
 		}
 	}
 
-	void PixelRoutine::TEXM3X3SPEC(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0, Color4i &src1)
+	void PixelRoutine::TEXM3X3SPEC(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, Vector4i &src1)
 	{
 		TEXM3X3PAD(r, u, v, s, src0, 2, false);
 
 		Float4 E[3];   // Eye vector
 
-		E[0] = Float4(src1.r) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		E[1] = Float4(src1.g) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
-		E[2] = Float4(src1.b) * Float4(1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE, 1.0f / 0x0FFE);
+		E[0] = Float4(src1.x) * Float4(1.0f / 0x0FFE);
+		E[1] = Float4(src1.y) * Float4(1.0f / 0x0FFE);
+		E[2] = Float4(src1.z) * Float4(1.0f / 0x0FFE);
 
 		// Reflection
 		Float4 u__;
@@ -4473,22 +4637,22 @@
 		sampleTexture(r, dst, stage,  u__, v__, w__, w__);
 	}
 
-	void PixelRoutine::TEXM3X3TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0, bool signedScaling)
+	void PixelRoutine::TEXM3X3TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, bool signedScaling)
 	{
 		TEXM3X3PAD(r, u, v, s, src0, 2, signedScaling);
 
 		sampleTexture(r, dst, stage, r.u_, r.v_, r.w_, r.w_);
 	}
 
-	void PixelRoutine::TEXM3X3VSPEC(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0)
+	void PixelRoutine::TEXM3X3VSPEC(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0)
 	{
 		TEXM3X3PAD(r, u, v, s, src0, 2, false);
 
 		Float4 E[3];   // Eye vector
 
-		E[0] = r.vw[2 + stage - 2];
-		E[1] = r.vw[2 + stage - 1];
-		E[2] = r.vw[2 + stage - 0];
+		E[0] = r.vf[2 + stage - 2].w;
+		E[1] = r.vf[2 + stage - 1].w;
+		E[2] = r.vf[2 + stage - 0].w;
 
 		// Reflection
 		Float4 u__;
@@ -4519,8 +4683,8 @@
 
 	void PixelRoutine::TEXDEPTH(Registers &r)
 	{
-		r.u_ = Float4(r.ri[5].r);
-		r.v_ = Float4(r.ri[5].g);
+		r.u_ = Float4(r.ri[5].x);
+		r.v_ = Float4(r.ri[5].y);
 
 		// z / w
 		r.u_ *= Rcp_pp(r.v_);   // FIXME: Set result to 1.0 when division by zero
@@ -4528,68 +4692,68 @@
 		r.oDepth = r.u_;
 	}
 
-	void PixelRoutine::CND(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2)
+	void PixelRoutine::CND(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
 	{
-		{Short4 t0; t0 = src0.x; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.x; t1 = t1 & t0; t0 = ~t0 & src2.x; t0 = t0 | t1; dst.r = t0;};
-		{Short4 t0; t0 = src0.y; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.y; t1 = t1 & t0; t0 = ~t0 & src2.y; t0 = t0 | t1; dst.g = t0;};
-		{Short4 t0; t0 = src0.z; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.z; t1 = t1 & t0; t0 = ~t0 & src2.z; t0 = t0 | t1; dst.b = t0;};
-		{Short4 t0; t0 = src0.w; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.w; t1 = t1 & t0; t0 = ~t0 & src2.w; t0 = t0 | t1; dst.a = t0;};
+		{Short4 t0; t0 = src0.x; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.x; t1 = t1 & t0; t0 = ~t0 & src2.x; t0 = t0 | t1; dst.x = t0;};
+		{Short4 t0; t0 = src0.y; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.y; t1 = t1 & t0; t0 = ~t0 & src2.y; t0 = t0 | t1; dst.y = t0;};
+		{Short4 t0; t0 = src0.z; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.z; t1 = t1 & t0; t0 = ~t0 & src2.z; t0 = t0 | t1; dst.z = t0;};
+		{Short4 t0; t0 = src0.w; t0 = CmpGT(t0, Short4(0x0800, 0x0800, 0x0800, 0x0800)); Short4 t1; t1 = src1.w; t1 = t1 & t0; t0 = ~t0 & src2.w; t0 = t0 | t1; dst.w = t0;};
 	}
 
-	void PixelRoutine::CMP(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2)
+	void PixelRoutine::CMP(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2)
 	{
-		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.x); Short4 t1; t1 = src2.x; t1 &= t0; t0 = ~t0 & src1.x; t0 |= t1; dst.r = t0;};
-		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.y); Short4 t1; t1 = src2.y; t1 &= t0; t0 = ~t0 & src1.y; t0 |= t1; dst.g = t0;};
-		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.z); Short4 t1; t1 = src2.z; t1 &= t0; t0 = ~t0 & src1.z; t0 |= t1; dst.b = t0;};
-		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.w); Short4 t1; t1 = src2.w; t1 &= t0; t0 = ~t0 & src1.w; t0 |= t1; dst.a = t0;};
+		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.x); Short4 t1; t1 = src2.x; t1 &= t0; t0 = ~t0 & src1.x; t0 |= t1; dst.x = t0;};
+		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.y); Short4 t1; t1 = src2.y; t1 &= t0; t0 = ~t0 & src1.y; t0 |= t1; dst.y = t0;};
+		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.z); Short4 t1; t1 = src2.z; t1 &= t0; t0 = ~t0 & src1.z; t0 |= t1; dst.z = t0;};
+		{Short4 t0 = CmpGT(Short4(0x0000, 0x0000, 0x0000, 0x0000), src0.w); Short4 t1; t1 = src2.w; t1 &= t0; t0 = ~t0 & src1.w; t0 |= t1; dst.w = t0;};
 	}
 
-	void PixelRoutine::BEM(Registers &r, Color4i &dst, Color4i &src0, Color4i &src1, int stage)
+	void PixelRoutine::BEM(Registers &r, Vector4i &dst, Vector4i &src0, Vector4i &src1, int stage)
 	{
 		Short4 t0;
 		Short4 t1;
 
-		// dst.r = src0.r + BUMPENVMAT00(stage) * src1.r + BUMPENVMAT10(stage) * src1.g
+		// dst.x = src0.x + BUMPENVMAT00(stage) * src1.x + BUMPENVMAT10(stage) * src1.y
 		t0 = MulHigh(src1.x, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].bumpmapMatrix4W[0][0]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
 		t1 = MulHigh(src1.y, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].bumpmapMatrix4W[1][0]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
 		t0 = AddSat(t0, t1);
 		t0 = AddSat(t0, src0.x);
-		dst.r = t0;
+		dst.x = t0;
 
-		// dst.g = src0.g + BUMPENVMAT01(stage) * src1.r + BUMPENVMAT11(stage) * src1.g
+		// dst.y = src0.y + BUMPENVMAT01(stage) * src1.x + BUMPENVMAT11(stage) * src1.y
 		t0 = MulHigh(src1.x, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].bumpmapMatrix4W[0][1]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
 		t1 = MulHigh(src1.y, *Pointer<Short4>(r.data + OFFSET(DrawData,textureStage[stage].bumpmapMatrix4W[1][1]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
 		t0 = AddSat(t0, t1);
 		t0 = AddSat(t0, src0.y);
-		dst.g = t0;
+		dst.y = t0;
 	}
 
-	void PixelRoutine::M3X2(Registers &r, Color4f &dst, Color4f &src0, const Src &src1)
+	void PixelRoutine::M3X2(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
 
 		dst.x = dot3(src0, row0);
 		dst.y = dot3(src0, row1);
 	}
 
-	void PixelRoutine::M3X3(Registers &r, Color4f &dst, Color4f &src0, const Src &src1)
+	void PixelRoutine::M3X3(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
 
 		dst.x = dot3(src0, row0);
 		dst.y = dot3(src0, row1);
 		dst.z = dot3(src0, row2);
 	}
 
-	void PixelRoutine::M3X4(Registers &r, Color4f &dst, Color4f &src0, const Src &src1)
+	void PixelRoutine::M3X4(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
-		Color4f row3 = reg(r, src1, 3);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
+		Vector4f row3 = reg(r, src1, 3);
 
 		dst.x = dot3(src0, row0);
 		dst.y = dot3(src0, row1);
@@ -4597,23 +4761,23 @@
 		dst.w = dot3(src0, row3);
 	}
 
-	void PixelRoutine::M4X3(Registers &r, Color4f &dst, Color4f &src0, const Src &src1)
+	void PixelRoutine::M4X3(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
 
 		dst.x = dot4(src0, row0);
 		dst.y = dot4(src0, row1);
 		dst.z = dot4(src0, row2);
 	}
 
-	void PixelRoutine::M4X4(Registers &r, Color4f &dst, Color4f &src0, const Src &src1)
+	void PixelRoutine::M4X4(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
-		Color4f row3 = reg(r, src1, 3);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
+		Vector4f row3 = reg(r, src1, 3);
 
 		dst.x = dot4(src0, row0);
 		dst.y = dot4(src0, row1);
@@ -4621,11 +4785,10 @@
 		dst.w = dot4(src0, row3);
 	}
 
-	void PixelRoutine::TEXLD(Registers &r, Color4f &dst, Color4f &src0, const Src &src1, bool project, bool bias)
+	void PixelRoutine::TEXLD(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias)
 	{
-		Color4f tmp;
-
-		sampleTexture(r, tmp, src1.index, src0.u, src0.v, src0.s, src0.t, src0, src0, project, bias);
+		Vector4f tmp;
+		sampleTexture(r, tmp, src1, src0.x, src0.y, src0.z, src0.w, src0, src0, project, bias);	
 
 		dst.x = tmp[(src1.swizzle >> 0) & 0x3];
 		dst.y = tmp[(src1.swizzle >> 2) & 0x3];
@@ -4633,11 +4796,10 @@
 		dst.w = tmp[(src1.swizzle >> 6) & 0x3];
 	}
 	
-	void PixelRoutine::TEXLDD(Registers &r, Color4f &dst, Color4f &src0, const Src &src1, Color4f &src2,  Color4f &src3, bool project, bool bias)
+	void PixelRoutine::TEXLDD(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &src2,  Vector4f &src3, bool project, bool bias)
 	{
-		Color4f tmp;
-
-		sampleTexture(r, tmp, src1.index, src0.u, src0.v, src0.s, src0.t, src2, src3, project, bias, true);
+		Vector4f tmp;
+		sampleTexture(r, tmp, src1, src0.x, src0.y, src0.z, src0.w, src2, src3, project, bias, true);
 
 		dst.x = tmp[(src1.swizzle >> 0) & 0x3];
 		dst.y = tmp[(src1.swizzle >> 2) & 0x3];
@@ -4645,11 +4807,10 @@
 		dst.w = tmp[(src1.swizzle >> 6) & 0x3];
 	}
 	
-	void PixelRoutine::TEXLDL(Registers &r, Color4f &dst, Color4f &src0, const Src &src1, bool project, bool bias)
+	void PixelRoutine::TEXLDL(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias)
 	{
-		Color4f tmp;
-
-		sampleTexture(r, tmp, src1.index, src0.u, src0.v, src0.s, src0.t, src0, src0, project, bias, false, true);
+		Vector4f tmp;
+		sampleTexture(r, tmp, src1, src0.x, src0.y, src0.z, src0.w, src0, src0, project, bias, false, true);
 
 		dst.x = tmp[(src1.swizzle >> 0) & 0x3];
 		dst.y = tmp[(src1.swizzle >> 2) & 0x3];
@@ -4657,35 +4818,69 @@
 		dst.w = tmp[(src1.swizzle >> 6) & 0x3];
 	}
 
-	void PixelRoutine::TEXKILL(Int cMask[4], Color4f &src, unsigned char mask)
+	void PixelRoutine::TEXKILL(Int cMask[4], Vector4f &src, unsigned char mask)
 	{
 		Int kill = -1;
 		
-		if(mask & 0x1) kill &= SignMask(CmpNLT(src.x, Float4(0, 0, 0, 0)));
-		if(mask & 0x2) kill &= SignMask(CmpNLT(src.y, Float4(0, 0, 0, 0)));
-		if(mask & 0x4) kill &= SignMask(CmpNLT(src.z, Float4(0, 0, 0, 0)));
-		if(mask & 0x8) kill &= SignMask(CmpNLT(src.w, Float4(0, 0, 0, 0)));
+		if(mask & 0x1) kill &= SignMask(CmpNLT(src.x, Float4(0.0f)));
+		if(mask & 0x2) kill &= SignMask(CmpNLT(src.y, Float4(0.0f)));
+		if(mask & 0x4) kill &= SignMask(CmpNLT(src.z, Float4(0.0f)));
+		if(mask & 0x8) kill &= SignMask(CmpNLT(src.w, Float4(0.0f)));
+
+		// FIXME: Dynamic branching affects TEXKILL?
+	//	if(shader->containsDynamicBranching())
+	//	{
+	//		kill = ~SignMask(enableMask(r));
+	//	}
 
 		for(unsigned int q = 0; q < state.multiSample; q++)
 		{
 			cMask[q] &= kill;
 		}
+
+		// FIXME: Branch to end of shader if all killed?
 	}
 
-	void PixelRoutine::DSX(Color4f &dst, Color4f &src)
+	void PixelRoutine::DISCARD(Registers &r, Int cMask[4], const Shader::Instruction *instruction)
 	{
-		dst.x = src.x.yyyy - src.x.xxxx;
-		dst.y = src.y.yyyy - src.y.xxxx;
-		dst.z = src.z.yyyy - src.z.xxxx;
-		dst.w = src.w.yyyy - src.w.xxxx;
+		Int kill = 0;
+		
+		if(shader->containsDynamicBranching())
+		{
+			kill = ~SignMask(enableMask(r, instruction));
+		}
+		
+		for(unsigned int q = 0; q < state.multiSample; q++)
+		{
+			cMask[q] &= kill;
+		}
+
+		// FIXME: Branch to end of shader if all killed?
 	}
 
-	void PixelRoutine::DSY(Color4f &dst, Color4f &src)
+	void PixelRoutine::DFDX(Vector4f &dst, Vector4f &src)
 	{
-		dst.x = src.x.zzzz - src.x.xxxx;
-		dst.y = src.y.zzzz - src.y.xxxx;
-		dst.z = src.z.zzzz - src.z.xxxx;
-		dst.w = src.w.zzzz - src.w.xxxx;
+		dst.x = src.x.yyww - src.x.xxzz;
+		dst.y = src.y.yyww - src.y.xxzz;
+		dst.z = src.z.yyww - src.z.xxzz;
+		dst.w = src.w.yyww - src.w.xxzz;
+	}
+
+	void PixelRoutine::DFDY(Vector4f &dst, Vector4f &src)   // Per component: lanes (z,w,z,w) minus lanes (x,y,x,y)
+	{
+		dst.x = src.x.zwzw - src.x.xyxy;
+		dst.y = src.y.zwzw - src.y.xyxy;
+		dst.z = src.z.zwzw - src.z.xyxy;
+		dst.w = src.w.zwzw - src.w.xyxy;
+	}
+
+	void PixelRoutine::FWIDTH(Vector4f &dst, Vector4f &src)
+	{
+		// abs(dFdx(src)) + abs(dFdy(src)); every component differences its own lanes, exactly as DFDX and DFDY do
+		dst.x = Abs(src.x.yyww - src.x.xxzz) + Abs(src.x.zwzw - src.x.xyxy);
+		dst.y = Abs(src.y.yyww - src.y.xxzz) + Abs(src.y.zwzw - src.y.xyxy);
+		dst.z = Abs(src.z.yyww - src.z.xxzz) + Abs(src.z.zwzw - src.z.xyxy);
+		dst.w = Abs(src.w.yyww - src.w.xxzz) + Abs(src.w.zwzw - src.w.xyxy);
+	}
 
 	void PixelRoutine::BREAK(Registers &r)
@@ -4695,6 +4890,7 @@
 
 		if(breakDepth == 0)
 		{
+			r.enableIndex = r.enableIndex - breakDepth;
 			Nucleus::createBr(endBlock);
 		}
 		else
@@ -4702,49 +4898,47 @@
 			r.enableBreak = r.enableBreak & ~r.enableStack[r.enableIndex];
 			Bool allBreak = SignMask(r.enableBreak) == 0x0;
 
+			r.enableIndex = r.enableIndex - breakDepth;
 			branch(allBreak, endBlock, deadBlock);
 		}
 
 		Nucleus::setInsertBlock(deadBlock);
+		r.enableIndex = r.enableIndex + breakDepth;
 	}
 
-	void PixelRoutine::BREAKC(Registers &r, Color4f &src0, Color4f &src1, Control control)
+	void PixelRoutine::BREAKC(Registers &r, Vector4f &src0, Vector4f &src1, Control control)
 	{
 		Int4 condition;
 
 		switch(control)
 		{
-		case Op::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
-		case Op::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
-		case Op::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
-		case Op::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
-		case Op::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
-		case Op::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
+		case Shader::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
+		case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
+		case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
+		case Shader::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
+		case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
+		case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
 		default:
 			ASSERT(false);
 		}
 
-		condition &= r.enableStack[r.enableIndex];
-
-		llvm::BasicBlock *continueBlock = Nucleus::createBasicBlock();
-		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth - 1];
-
-		r.enableBreak = r.enableBreak & ~condition;
-		Bool allBreak = SignMask(r.enableBreak) == 0x0;
-
-		branch(allBreak, endBlock, continueBlock);
-		Nucleus::setInsertBlock(continueBlock);
+		BREAK(r, condition);
 	}
 
 	void PixelRoutine::BREAKP(Registers &r, const Src &predicateRegister)   // FIXME: Factor out parts common with BREAKC
 	{
 		Int4 condition = As<Int4>(r.p0[predicateRegister.swizzle & 0x3]);
 
-		if(predicateRegister.modifier == Src::MODIFIER_NOT)
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = ~condition;
 		}
 
+		BREAK(r, condition);
+	}
+
+	void PixelRoutine::BREAK(Registers &r, Int4 &condition)
+	{
 		condition &= r.enableStack[r.enableIndex];
 
 		llvm::BasicBlock *continueBlock = Nucleus::createBasicBlock();
@@ -4753,44 +4947,61 @@
 		r.enableBreak = r.enableBreak & ~condition;
 		Bool allBreak = SignMask(r.enableBreak) == 0x0;
 
+		r.enableIndex = r.enableIndex - breakDepth;
 		branch(allBreak, endBlock, continueBlock);
+
 		Nucleus::setInsertBlock(continueBlock);
+		r.enableIndex = r.enableIndex + breakDepth;
 	}
 
-	void PixelRoutine::CALL(Registers &r, int labelIndex)
+	void PixelRoutine::CONTINUE(Registers &r)
+	{
+		r.enableContinue = r.enableContinue & ~r.enableStack[r.enableIndex];   // Clear from enableContinue every lane set in the current enableStack entry
+	}
+
+	void PixelRoutine::TEST()
+	{
+		whileTest = true;
+	}
+
+	void PixelRoutine::CALL(Registers &r, int labelIndex, int callSiteIndex)
 	{
 		if(!labelBlock[labelIndex])
 		{
 			labelBlock[labelIndex] = Nucleus::createBasicBlock();
 		}
 
-		llvm::BasicBlock *retBlock = Nucleus::createBasicBlock();
-		callRetBlock.push_back(retBlock);
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			r.callStack[r.stackIndex++] = UInt(callSiteIndex);
+		}
 
-		r.callStack[r.stackIndex++] = UInt((int)callRetBlock.size() - 1);   // FIXME
+		Int4 restoreLeave = r.enableLeave;
 
 		Nucleus::createBr(labelBlock[labelIndex]);
-		Nucleus::setInsertBlock(retBlock);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		r.enableLeave = restoreLeave;
 	}
 
-	void PixelRoutine::CALLNZ(Registers &r, int labelIndex, const Src &src)
+	void PixelRoutine::CALLNZ(Registers &r, int labelIndex, int callSiteIndex, const Src &src)
 	{
-		if(src.type == Src::PARAMETER_CONSTBOOL)
+		if(src.type == Shader::PARAMETER_CONSTBOOL)
 		{
-			CALLNZb(r, labelIndex, src);
+			CALLNZb(r, labelIndex, callSiteIndex, src);
 		}
-		else if(src.type == Src::PARAMETER_PREDICATE)
+		else if(src.type == Shader::PARAMETER_PREDICATE)
 		{
-			CALLNZp(r, labelIndex, src);
+			CALLNZp(r, labelIndex, callSiteIndex, src);
 		}
 		else ASSERT(false);
 	}
 
-	void PixelRoutine::CALLNZb(Registers &r, int labelIndex, const Src &boolRegister)
+	void PixelRoutine::CALLNZb(Registers &r, int labelIndex, int callSiteIndex, const Src &boolRegister)
 	{
 		Bool condition = (*Pointer<Byte>(r.data + OFFSET(DrawData,ps.b[boolRegister.index])) != Byte(0));   // FIXME
 		
-		if(boolRegister.modifier == Src::MODIFIER_NOT)
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = !condition;	
 		}
@@ -4800,20 +5011,24 @@
 			labelBlock[labelIndex] = Nucleus::createBasicBlock();
 		}
 
-		llvm::BasicBlock *retBlock = Nucleus::createBasicBlock();
-		callRetBlock.push_back(retBlock);
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			r.callStack[r.stackIndex++] = UInt(callSiteIndex);
+		}
 
-		r.callStack[r.stackIndex++] = UInt((int)callRetBlock.size() - 1);   // FIXME
+		Int4 restoreLeave = r.enableLeave;
 
-		branch(condition, labelBlock[labelIndex], retBlock);
-		Nucleus::setInsertBlock(retBlock);
+		branch(condition, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		r.enableLeave = restoreLeave;
 	}
 
-	void PixelRoutine::CALLNZp(Registers &r, int labelIndex, const Src &predicateRegister)
+	void PixelRoutine::CALLNZp(Registers &r, int labelIndex, int callSiteIndex, const Src &predicateRegister)
 	{
 		Int4 condition = As<Int4>(r.p0[predicateRegister.swizzle & 0x3]);
 
-		if(predicateRegister.modifier == Src::MODIFIER_NOT)
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = ~condition;
 		}
@@ -4825,20 +5040,21 @@
 			labelBlock[labelIndex] = Nucleus::createBasicBlock();
 		}
 
-		llvm::BasicBlock *retBlock = Nucleus::createBasicBlock();
-		callRetBlock.push_back(retBlock);
-
-		r.callStack[r.stackIndex++] = UInt((int)callRetBlock.size() - 1);   // FIXME
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			r.callStack[r.stackIndex++] = UInt(callSiteIndex);
+		}
 
 		r.enableIndex++;
 		r.enableStack[r.enableIndex] = condition;
+		Int4 restoreLeave = r.enableLeave;
 
-		Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
-
-		branch(notAllFalse, labelBlock[labelIndex], retBlock);
-		Nucleus::setInsertBlock(retBlock);
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
 
 		r.enableIndex--;
+		r.enableLeave = restoreLeave;
 	}
 
 	void PixelRoutine::ELSE(Registers &r)
@@ -4851,7 +5067,7 @@
 		if(isConditionalIf[ifDepth])
 		{
 			Int4 condition = ~r.enableStack[r.enableIndex] & r.enableStack[r.enableIndex - 1];
-			Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
+			Bool notAllFalse = SignMask(condition) != 0;
 
 			branch(notAllFalse, falseBlock, endBlock);
 
@@ -4884,20 +5100,6 @@
 		}
 	}
 
-	void PixelRoutine::ENDREP(Registers &r)
-	{
-		loopRepDepth--;
-
-		llvm::BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
-		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
-
-		Nucleus::createBr(testBlock);
-		Nucleus::setInsertBlock(endBlock);
-
-		r.loopDepth--;
-		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-	}
-
 	void PixelRoutine::ENDLOOP(Registers &r)
 	{
 		loopRepDepth--;
@@ -4914,26 +5116,61 @@
 		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 	}
 
+	void PixelRoutine::ENDREP(Registers &r)
+	{
+		loopRepDepth--;
+
+		llvm::BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(endBlock);
+
+		r.loopDepth--;
+		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+	}
+
+	void PixelRoutine::ENDWHILE(Registers &r)
+	{
+		loopRepDepth--;   // Back to the slot that WHILE filled in before incrementing loopRepDepth
+
+		llvm::BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);   // End of body: jump back to the loop's test block
+		Nucleus::setInsertBlock(endBlock);   // Code emitted after this call goes into the loop's end block
+
+		r.enableIndex--;   // Undo the r.enableIndex++ performed by WHILE
+		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // Set every bit of the break mask
+		whileTest = false;   // Clear the flag that TEST() sets
+	}
+
 	void PixelRoutine::IF(Registers &r, const Src &src)
 	{
-		if(src.type == Src::PARAMETER_CONSTBOOL)
+		if(src.type == Shader::PARAMETER_CONSTBOOL)
 		{
 			IFb(r, src);
 		}
-		else if(src.type == Src::PARAMETER_PREDICATE)
+		else if(src.type == Shader::PARAMETER_PREDICATE)
 		{
 			IFp(r, src);
 		}
-		else ASSERT(false);
+		else
+		{
+			Int4 condition = As<Int4>(reg(r, src).x);
+			IF(r, condition);
+		}
 	}
 
 	void PixelRoutine::IFb(Registers &r, const Src &boolRegister)
 	{
+		ASSERT(ifDepth < 24 + 4);
+
 		Bool condition = (*Pointer<Byte>(r.data + OFFSET(DrawData,ps.b[boolRegister.index])) != Byte(0));   // FIXME
 
-		if(boolRegister.modifier == Src::MODIFIER_NOT)
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
 		{
-			condition = !condition;	
+			condition = !condition;
 		}
 
 		llvm::BasicBlock *trueBlock = Nucleus::createBasicBlock();
@@ -4947,50 +5184,39 @@
 		ifDepth++;
 	}
 
-	void PixelRoutine::IFp(Registers &r, const Src &predicateRegister)   // FIXME: Factor out parts common with IFC
+	void PixelRoutine::IFp(Registers &r, const Src &predicateRegister)
 	{
 		Int4 condition = As<Int4>(r.p0[predicateRegister.swizzle & 0x3]);
 
-		if(predicateRegister.modifier == Src::MODIFIER_NOT)
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = ~condition;
 		}
 
-		condition &= r.enableStack[r.enableIndex];
-
-		r.enableIndex++;
-		r.enableStack[r.enableIndex] = condition;
-
-		llvm::BasicBlock *trueBlock = Nucleus::createBasicBlock();
-		llvm::BasicBlock *falseBlock = Nucleus::createBasicBlock();
-
-		Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
-
-		branch(notAllFalse, trueBlock, falseBlock);
-
-		isConditionalIf[ifDepth] = true;
-		ifFalseBlock[ifDepth] = falseBlock;
-
-		ifDepth++;
-		breakDepth++;
+		IF(r, condition);
 	}
 
-	void PixelRoutine::IFC(Registers &r, Color4f &src0, Color4f &src1, Control control)
+	void PixelRoutine::IFC(Registers &r, Vector4f &src0, Vector4f &src1, Control control)
 	{
 		Int4 condition;
 
 		switch(control)
 		{
-		case Op::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
-		case Op::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
-		case Op::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
-		case Op::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
-		case Op::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
-		case Op::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
+		case Shader::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
+		case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
+		case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
+		case Shader::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
+		case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
+		case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
 		default:
 			ASSERT(false);
 		}
 
+		IF(r, condition);
+	}
+
+	void PixelRoutine::IF(Registers &r, Int4 &condition)
+	{
 		condition &= r.enableStack[r.enableIndex];
 
 		r.enableIndex++;
@@ -4999,7 +5225,7 @@
 		llvm::BasicBlock *trueBlock = Nucleus::createBasicBlock();
 		llvm::BasicBlock *falseBlock = Nucleus::createBasicBlock();
 
-		Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
+		Bool notAllFalse = SignMask(condition) != 0;
 
 		branch(notAllFalse, trueBlock, falseBlock);
 
@@ -5012,7 +5238,13 @@
 
 	void PixelRoutine::LABEL(int labelIndex)
 	{
+		if(!labelBlock[labelIndex])
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
 		Nucleus::setInsertBlock(labelBlock[labelIndex]);
+		currentLabel = labelIndex;
 	}
 
 	void PixelRoutine::LOOP(Registers &r, const Src &integerRegister)
@@ -5075,27 +5307,73 @@
 		breakDepth = 0;
 	}
 
+	void PixelRoutine::WHILE(Registers &r, const Src &temporaryRegister)
+	{
+		r.enableIndex++;
+
+		llvm::BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		llvm::BasicBlock *testBlock = Nucleus::createBasicBlock();
+		llvm::BasicBlock *endBlock = Nucleus::createBasicBlock();
+		
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+
+		Int4 restoreBreak = r.enableBreak;
+		Int4 restoreContinue = r.enableContinue;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+		r.enableContinue = restoreContinue;
+
+		Vector4f src = reg(r, temporaryRegister);   // reg() returns by value; keep a copy instead of binding a non-const reference to a temporary
+		Int4 condition = As<Int4>(src.x);
+		condition &= r.enableStack[r.enableIndex - 1];
+		r.enableStack[r.enableIndex] = condition;
+
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, loopBlock, endBlock);
+		
+		Nucleus::setInsertBlock(endBlock);
+		r.enableBreak = restoreBreak;
+		
+		Nucleus::setInsertBlock(loopBlock);
+
+		loopRepDepth++;
+		breakDepth = 0;
+	}
+
 	void PixelRoutine::RET(Registers &r)
 	{
-		if(!returns)
+		if(currentLabel == -1)
 		{
 			returnBlock = Nucleus::createBasicBlock();
 			Nucleus::createBr(returnBlock);
-
-			returns = true;
 		}
 		else
 		{
-			// FIXME: Encapsulate
-			UInt index = r.callStack[--r.stackIndex];
- 
 			llvm::BasicBlock *unreachableBlock = Nucleus::createBasicBlock();
-			llvm::Value *value = Nucleus::createLoad(index.address);
-			llvm::Value *switchInst = Nucleus::createSwitch(value, unreachableBlock, (int)callRetBlock.size());
 
-			for(unsigned int i = 0; i < callRetBlock.size(); i++)
+			if(callRetBlock[currentLabel].size() > 1)   // Pop the return destination from the call stack
 			{
-				Nucleus::addSwitchCase(switchInst, i, callRetBlock[i]);
+				// FIXME: Encapsulate
+				UInt index = r.callStack[--r.stackIndex];
+ 
+				llvm::Value *value = Nucleus::createLoad(index.address);
+				llvm::Value *switchInst = Nucleus::createSwitch(value, unreachableBlock, (int)callRetBlock[currentLabel].size());
+
+				for(unsigned int i = 0; i < callRetBlock[currentLabel].size(); i++)
+				{
+					Nucleus::addSwitchCase(switchInst, i, callRetBlock[currentLabel][i]);
+				}
+			}
+			else if(callRetBlock[currentLabel].size() == 1)   // Jump directly to the unique return destination
+			{
+				Nucleus::createBr(callRetBlock[currentLabel][0]);
+			}
+			else   // Function isn't called
+			{
+				Nucleus::createBr(unreachableBlock);
 			}
 
 			Nucleus::setInsertBlock(unreachableBlock);
@@ -5103,30 +5381,38 @@
 		}
 	}
 
-	void PixelRoutine::writeDestination(Registers &r, Color4i &d, const Dst &dst)
+	void PixelRoutine::LEAVE(Registers &r)
+	{
+		r.enableLeave = r.enableLeave & ~r.enableStack[r.enableIndex];
+
+		// FIXME: Return from function if all instances left
+		// FIXME: Use enableLeave in other control-flow constructs
+	}
+	
+	void PixelRoutine::writeDestination(Registers &r, Vector4i &d, const Dst &dst)
 	{
 		switch(dst.type)
 		{
-		case Dst::PARAMETER_TEMP:
+		case Shader::PARAMETER_TEMP:
 			if(dst.mask & 0x1) r.ri[dst.index].x = d.x;
 			if(dst.mask & 0x2) r.ri[dst.index].y = d.y;
 			if(dst.mask & 0x4) r.ri[dst.index].z = d.z;
 			if(dst.mask & 0x8) r.ri[dst.index].w = d.w;
 			break;
-		case Dst::PARAMETER_INPUT:
+		case Shader::PARAMETER_INPUT:
 			if(dst.mask & 0x1) r.vi[dst.index].x = d.x;
 			if(dst.mask & 0x2) r.vi[dst.index].y = d.y;
 			if(dst.mask & 0x4) r.vi[dst.index].z = d.z;
 			if(dst.mask & 0x8) r.vi[dst.index].w = d.w;
 			break;
-		case Dst::PARAMETER_CONST:			ASSERT(false);	break;
-		case Dst::PARAMETER_TEXTURE:
+		case Shader::PARAMETER_CONST:			ASSERT(false);	break;
+		case Shader::PARAMETER_TEXTURE:
 			if(dst.mask & 0x1) r.ti[dst.index].x = d.x;
 			if(dst.mask & 0x2) r.ti[dst.index].y = d.y;
 			if(dst.mask & 0x4) r.ti[dst.index].z = d.z;
 			if(dst.mask & 0x8) r.ti[dst.index].w = d.w;
 			break;
-		case Dst::PARAMETER_COLOROUT:
+		case Shader::PARAMETER_COLOROUT:
 			if(dst.mask & 0x1) r.vi[dst.index].x = d.x;
 			if(dst.mask & 0x2) r.vi[dst.index].y = d.y;
 			if(dst.mask & 0x4) r.vi[dst.index].z = d.z;
@@ -5137,29 +5423,29 @@
 		}
 	}
 
-	Color4i PixelRoutine::regi(Registers &r, const Src &src)
+	Vector4i PixelRoutine::regi(Registers &r, const Src &src)
 	{
-		Color4i *reg;
+		Vector4i *reg;
 		int i = src.index;
 
-		Color4i c;
+		Vector4i c;
 
-		if(src.type == ShaderParameter::PARAMETER_CONST)
+		if(src.type == Shader::PARAMETER_CONST)
 		{
-			c.r = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][0]));
-			c.g = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][1]));
-			c.b = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][2]));
-			c.a = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][3]));
+			c.x = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][0]));
+			c.y = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][1]));
+			c.z = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][2]));
+			c.w = *Pointer<Short4>(r.data + OFFSET(DrawData,ps.cW[i][3]));
 		}
 
 		switch(src.type)
 		{
-		case Src::PARAMETER_TEMP:			reg = &r.ri[i];	break;
-		case Src::PARAMETER_INPUT:			reg = &r.vi[i];	break;
-		case Src::PARAMETER_CONST:			reg = &c;		break;
-		case Src::PARAMETER_TEXTURE:		reg = &r.ti[i];	break;
-		case Src::PARAMETER_VOID:			return r.ri[0];   // Dummy
-		case Src::PARAMETER_FLOATLITERAL:	return r.ri[0];   // Dummy
+		case Shader::PARAMETER_TEMP:          reg = &r.ri[i]; break;
+		case Shader::PARAMETER_INPUT:         reg = &r.vi[i]; break;
+		case Shader::PARAMETER_CONST:         reg = &c;       break;
+		case Shader::PARAMETER_TEXTURE:       reg = &r.ti[i]; break;
+		case Shader::PARAMETER_VOID:          return r.ri[0]; // Dummy
+		case Shader::PARAMETER_FLOAT4LITERAL: return r.ri[0]; // Dummy
 		default:
 			ASSERT(false);
 		}
@@ -5169,180 +5455,177 @@
 		Short4 &z = (*reg)[(src.swizzle >> 4) & 0x3];
 		Short4 &w = (*reg)[(src.swizzle >> 6) & 0x3];
 
-		Color4i mod;
+		Vector4i mod;
 
 		switch(src.modifier)
 		{
-		case Src::MODIFIER_NONE:
-			mod.r = x;
-			mod.g = y;
-			mod.b = z;
-			mod.a = w;
+		case Shader::MODIFIER_NONE:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
 			break;
-		case Src::MODIFIER_BIAS:
-			mod.r = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-			mod.g = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-			mod.b = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-			mod.a = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+		case Shader::MODIFIER_BIAS:
+			mod.x = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+			mod.y = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+			mod.z = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+			mod.w = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
 			break;
-		case Src::MODIFIER_BIAS_NEGATE:
-			mod.r = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
-			mod.g = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
-			mod.b = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
-			mod.a = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
+		case Shader::MODIFIER_BIAS_NEGATE:
+			mod.x = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
+			mod.y = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
+			mod.z = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
+			mod.w = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
 			break;
-		case Src::MODIFIER_COMPLEMENT:
-			mod.r = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), x);
-			mod.g = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), y);
-			mod.b = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), z);
-			mod.a = SubSat(Short4(0x1000, 0x1000, 0x1000, 0x1000), w);
+		case Shader::MODIFIER_COMPLEMENT:
+			mod.x = SubSat(Short4(0x1000), x);
+			mod.y = SubSat(Short4(0x1000), y);
+			mod.z = SubSat(Short4(0x1000), z);
+			mod.w = SubSat(Short4(0x1000), w);
 			break;
-		case Src::MODIFIER_NEGATE:
-			mod.r = -x;
-			mod.g = -y;
-			mod.b = -z;
-			mod.a = -w;
+		case Shader::MODIFIER_NEGATE:
+			mod.x = -x;
+			mod.y = -y;
+			mod.z = -z;
+			mod.w = -w;
 			break;
-		case Src::MODIFIER_X2:
-			mod.r = AddSat(x, x);
-			mod.g = AddSat(y, y);
-			mod.b = AddSat(z, z);
-			mod.a = AddSat(w, w);
+		case Shader::MODIFIER_X2:
+			mod.x = AddSat(x, x);
+			mod.y = AddSat(y, y);
+			mod.z = AddSat(z, z);
+			mod.w = AddSat(w, w);
 			break;
-		case Src::MODIFIER_X2_NEGATE:
-			mod.r = -AddSat(x, x);
-			mod.g = -AddSat(y, y);
-			mod.b = -AddSat(z, z);
-			mod.a = -AddSat(w, w);
+		case Shader::MODIFIER_X2_NEGATE:
+			mod.x = -AddSat(x, x);
+			mod.y = -AddSat(y, y);
+			mod.z = -AddSat(z, z);
+			mod.w = -AddSat(w, w);
 			break;
-		case Src::MODIFIER_SIGN:
-			mod.r = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-			mod.g = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-			mod.b = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-			mod.a = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
-			mod.r = AddSat(mod.r, mod.r);
-			mod.g = AddSat(mod.g, mod.g);
-			mod.b = AddSat(mod.b, mod.b);
-			mod.a = AddSat(mod.a, mod.a);
+		case Shader::MODIFIER_SIGN:
+			mod.x = SubSat(x, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+			mod.y = SubSat(y, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+			mod.z = SubSat(z, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+			mod.w = SubSat(w, Short4(0x0800, 0x0800, 0x0800, 0x0800));
+			mod.x = AddSat(mod.x, mod.x);
+			mod.y = AddSat(mod.y, mod.y);
+			mod.z = AddSat(mod.z, mod.z);
+			mod.w = AddSat(mod.w, mod.w);
 			break;
-		case Src::MODIFIER_SIGN_NEGATE:
-			mod.r = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
-			mod.g = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
-			mod.b = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
-			mod.a = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
-			mod.r = AddSat(mod.r, mod.r);
-			mod.g = AddSat(mod.g, mod.g);
-			mod.b = AddSat(mod.b, mod.b);
-			mod.a = AddSat(mod.a, mod.a);
+		case Shader::MODIFIER_SIGN_NEGATE:
+			mod.x = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), x);
+			mod.y = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), y);
+			mod.z = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), z);
+			mod.w = SubSat(Short4(0x0800, 0x0800, 0x0800, 0x0800), w);
+			mod.x = AddSat(mod.x, mod.x);
+			mod.y = AddSat(mod.y, mod.y);
+			mod.z = AddSat(mod.z, mod.z);
+			mod.w = AddSat(mod.w, mod.w);
 			break;
-		case Src::MODIFIER_DZ:
-			mod.r = x;
-			mod.g = y;
-			mod.b = z;
-			mod.a = w;
+		case Shader::MODIFIER_DZ:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
 			// Projection performed by texture sampler
 			break;
-		case Src::MODIFIER_DW:
-			mod.r = x;
-			mod.g = y;
-			mod.b = z;
-			mod.a = w;
+		case Shader::MODIFIER_DW:
+			mod.x = x;
+			mod.y = y;
+			mod.z = z;
+			mod.w = w;
 			// Projection performed by texture sampler
 			break;
 		default:
 			ASSERT(false);
 		}
 
-		if(src.type == ShaderParameter::PARAMETER_CONST && (src.modifier == Src::MODIFIER_X2 || src.modifier == Src::MODIFIER_X2_NEGATE))
+		if(src.type == Shader::PARAMETER_CONST && (src.modifier == Shader::MODIFIER_X2 || src.modifier == Shader::MODIFIER_X2_NEGATE))
 		{
-			mod.r = Min(mod.r, Short4(0x1000, 0x1000, 0x1000, 0x1000)); mod.r = Max(mod.r, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
-			mod.g = Min(mod.g, Short4(0x1000, 0x1000, 0x1000, 0x1000)); mod.g = Max(mod.g, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
-			mod.b = Min(mod.b, Short4(0x1000, 0x1000, 0x1000, 0x1000)); mod.b = Max(mod.b, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
-			mod.a = Min(mod.a, Short4(0x1000, 0x1000, 0x1000, 0x1000)); mod.a = Max(mod.a, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
+			mod.x = Min(mod.x, Short4(0x1000)); mod.x = Max(mod.x, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
+			mod.y = Min(mod.y, Short4(0x1000)); mod.y = Max(mod.y, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
+			mod.z = Min(mod.z, Short4(0x1000)); mod.z = Max(mod.z, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
+			mod.w = Min(mod.w, Short4(0x1000)); mod.w = Max(mod.w, Short4(-0x1000, -0x1000, -0x1000, -0x1000));
 		}
 
 		return mod;
 	}
 
-	Color4f PixelRoutine::reg(Registers &r, const Src &src, int offset)
+	Vector4f PixelRoutine::reg(Registers &r, const Src &src, int offset)
 	{
-		Color4f reg;
+		Vector4f reg;
 		int i = src.index + offset;
 
 		switch(src.type)
 		{
-		case Src::PARAMETER_TEMP:			reg = r.rf[i];		break;
-		case Src::PARAMETER_INPUT:
+		case Shader::PARAMETER_TEMP:
+			if(src.rel.type == Shader::PARAMETER_VOID)
 			{
-				if(!src.relative)
+				reg = r.rf[i];
+			}
+			else
+			{
+				Int a = relativeAddress(r, src);
+
+				reg = r.rf[i + a];
+			}
+			break;
+		case Shader::PARAMETER_INPUT:
+			{
+				if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
 				{
-					reg.x = r.vx[i];
-					reg.y = r.vy[i];
-					reg.z = r.vz[i];
-					reg.w = r.vw[i];
+					reg = r.vf[i];
 				}
-				else if(src.relativeType == Src::PARAMETER_LOOP)
+				else if(src.rel.type == Shader::PARAMETER_LOOP)
 				{
 					Int aL = r.aL[r.loopDepth];
 
-					reg.x = r.vx[i + aL];
-					reg.y = r.vy[i + aL];
-					reg.z = r.vz[i + aL];
-					reg.w = r.vw[i + aL];
+					reg = r.vf[i + aL];
 				}
-				else ASSERT(false);
-			}
-			break;
-		case Src::PARAMETER_CONST:
-			{
-				reg.r = reg.g = reg.b = reg.a = *Pointer<Float4>(r.data + OFFSET(DrawData,ps.c[i]));
-
-				reg.r = reg.r.xxxx;
-				reg.g = reg.g.yyyy;
-				reg.b = reg.b.zzzz;
-				reg.a = reg.a.wwww;
-
-				if(localShaderConstants)   // Constant may be known at compile time
+				else
 				{
-					for(int j = 0; j < pixelShader->getLength(); j++)
-					{
-						const ShaderInstruction &instruction = *pixelShader->getInstruction(j);
-
-						if(instruction.getOpcode() == ShaderOperation::OPCODE_DEF)
-						{
-							if(instruction.getDestinationParameter().index == i)
-							{
-								reg.r = Float4(instruction.getSourceParameter(0).value);
-								reg.g = Float4(instruction.getSourceParameter(1).value);
-								reg.b = Float4(instruction.getSourceParameter(2).value);
-								reg.a = Float4(instruction.getSourceParameter(3).value);
-
-								break;
-							}
-						}
-					}
+					Int a = relativeAddress(r, src);
+					
+					reg = r.vf[i + a];
 				}
 			}
 			break;
-		case Src::PARAMETER_TEXTURE:
-			{
-				reg.x = r.vx[2 + i];
-				reg.y = r.vy[2 + i];
-				reg.z = r.vz[2 + i];
-				reg.w = r.vw[2 + i];
-			}
+		case Shader::PARAMETER_CONST:
+			reg = readConstant(r, src, offset);
 			break;
-		case Src::PARAMETER_MISCTYPE:
+		case Shader::PARAMETER_TEXTURE:
+			reg = r.vf[2 + i];
+			break;
+		case Shader::PARAMETER_MISCTYPE:
 			if(src.index == 0)				reg = r.vPos;
 			if(src.index == 1)				reg = r.vFace;
 			break;
-		case Src::PARAMETER_SAMPLER:		return r.rf[0];   // Dummy
-		case Src::PARAMETER_PREDICATE:		return r.rf[0];   // Dummy
-		case Src::PARAMETER_VOID:			return r.rf[0];   // Dummy
-		case Src::PARAMETER_FLOATLITERAL:	return r.rf[0];   // Dummy
-		case Src::PARAMETER_CONSTINT:		return r.rf[0];   // Dummy
-		case Src::PARAMETER_CONSTBOOL:		return r.rf[0];   // Dummy
-		case Src::PARAMETER_LOOP:			return r.rf[0];   // Dummy
+		case Shader::PARAMETER_SAMPLER:
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg.x = As<Float4>(Int4(i));
+			}
+			else if(src.rel.type == Shader::PARAMETER_TEMP)
+			{
+				reg.x = As<Float4>(Int4(i) + RoundInt(r.rf[src.rel.index].x));
+			}
+			return reg;
+		case Shader::PARAMETER_PREDICATE:	return reg;   // Dummy
+		case Shader::PARAMETER_VOID:		return reg;   // Dummy
+		case Shader::PARAMETER_FLOAT4LITERAL:
+			reg.x = Float4(src.value[0]);
+			reg.y = Float4(src.value[1]);
+			reg.z = Float4(src.value[2]);
+			reg.w = Float4(src.value[3]);
+			break;
+		case Shader::PARAMETER_CONSTINT:	return reg;   // Dummy
+		case Shader::PARAMETER_CONSTBOOL:	return reg;   // Dummy
+		case Shader::PARAMETER_LOOP:		return reg;   // Dummy
+		case Shader::PARAMETER_COLOROUT:
+			reg = r.oC[i];
+			break;
+		case Shader::PARAMETER_DEPTHOUT:
+			reg.x = r.oDepth;
+			break;
 		default:
 			ASSERT(false);
 		}
@@ -5352,29 +5635,29 @@
 		Float4 &z = reg[(src.swizzle >> 4) & 0x3];
 		Float4 &w = reg[(src.swizzle >> 6) & 0x3];
 
-		Color4f mod;
+		Vector4f mod;
 
 		switch(src.modifier)
 		{
-		case Src::MODIFIER_NONE:
+		case Shader::MODIFIER_NONE:
 			mod.x = x;
 			mod.y = y;
 			mod.z = z;
 			mod.w = w;
 			break;
-		case Src::MODIFIER_NEGATE:
+		case Shader::MODIFIER_NEGATE:
 			mod.x = -x;
 			mod.y = -y;
 			mod.z = -z;
 			mod.w = -w;
 			break;
-		case Src::MODIFIER_ABS:
+		case Shader::MODIFIER_ABS:
 			mod.x = Abs(x);
 			mod.y = Abs(y);
 			mod.z = Abs(z);
 			mod.w = Abs(w);
 			break;
-		case Src::MODIFIER_ABS_NEGATE:
+		case Shader::MODIFIER_ABS_NEGATE:
 			mod.x = -Abs(x);
 			mod.y = -Abs(y);
 			mod.z = -Abs(z);
@@ -5387,13 +5670,134 @@
 		return mod;
 	}
 
-	bool PixelRoutine::colorUsed()
+	Vector4f PixelRoutine::readConstant(Registers &r, const Src &src, int offset)   // Reads pixel shader constant ps.c[src.index + offset], honouring src.rel relative addressing
 	{
-		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsTexkill;
+		Vector4f c;
+
+		int i = src.index + offset;   // Base constant index before any relative offset
+
+		if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
+		{
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,ps.c[i]));   // Load the whole constant into every component...
+
+			c.x = c.x.xxxx;   // ...then replicate one element of it per component
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+
+			if(localShaderConstants)   // Constant may be known at compile time
+			{
+				for(int j = 0; j < shader->getLength(); j++)   // Scan the shader for a DEF instruction targeting constant i
+				{
+					const Shader::Instruction &instruction = *shader->getInstruction(j);
+
+					if(instruction.opcode == Shader::OPCODE_DEF)
+					{
+						if(instruction.dst.index == i)
+						{
+							c.x = Float4(instruction.src[0].value[0]);   // Replace the loaded value with the literal from the DEF
+							c.y = Float4(instruction.src[0].value[1]);
+							c.z = Float4(instruction.src[0].value[2]);
+							c.w = Float4(instruction.src[0].value[3]);
+
+							break;   // Only the first matching DEF is used
+						}
+					}
+				}
+			}
+		}
+		else if(src.rel.type == Shader::PARAMETER_LOOP)   // Relative to the loop counter
+		{
+			Int loopCounter = r.aL[r.loopDepth];   // aL entry for the current loop depth
+
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,ps.c[i]) + loopCounter * 16);   // 16 is presumably the byte size of one constant (four floats) - TODO confirm
+
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+		}
+		else   // Any other relative register type
+		{
+			Int a = relativeAddress(r, src);   // Run-time offset, in whole constants
+			
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,ps.c[i]) + a * 16);
+
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+		}
+
+		return c;
 
-	unsigned short PixelRoutine::pixelShaderVersion() const
+	Int PixelRoutine::relativeAddress(Registers &r, const Shader::Parameter &var)   // Offset for a relatively addressed register: element 0 of the x component of the register named by var.rel, rounded to integer, times var.rel.scale
 	{
-		return pixelShader ? pixelShader->getVersion() : 0x0000;
+		ASSERT(var.rel.deterministic);
+
+		if(var.rel.type == Shader::PARAMETER_TEMP)   // Index held in a temporary register
+		{
+			return RoundInt(Extract(r.rf[var.rel.index].x, 0)) * var.rel.scale;
+		}
+		else if(var.rel.type == Shader::PARAMETER_INPUT)   // Index held in an input register
+		{
+			return RoundInt(Extract(r.vf[var.rel.index].x, 0)) * var.rel.scale;
+		}
+		else if(var.rel.type == Shader::PARAMETER_OUTPUT)   // Index held in a color output register
+		{
+			return RoundInt(Extract(r.oC[var.rel.index].x, 0)) * var.rel.scale;
+		}
+		else if(var.rel.type == Shader::PARAMETER_CONST)   // Index held in a shader constant
+		{
+			RValue<Float4> c = *Pointer<Float4>(r.data + OFFSET(DrawData,ps.c[var.rel.index]));   // Pixel shader constants (ps.c), the same array readConstant() reads; not the vertex shader's vs.c
+
+			return RoundInt(Extract(c, 0)) * var.rel.scale;
+		}
+		else ASSERT(false);   // Unsupported relative register type
+
+		return 0;
+	}
+
+	Int4 PixelRoutine::enableMask(Registers &r, const Shader::Instruction *instruction)   // Builds the Int4 enable mask for an instruction from the branch, break, continue and leave masks
+	{
+		Int4 enable = instruction->analysisBranch ? Int4(r.enableStack[r.enableIndex]) : Int4(0xFFFFFFFF);   // Current enable stack entry if the instruction is flagged analysisBranch, otherwise all bits set
+					
+		if(shader->containsBreakInstruction() && !whileTest && instruction->analysisBreak)   // Break mask is skipped while whileTest is set
+		{
+			enable &= r.enableBreak;
+		}
+
+		if(shader->containsContinueInstruction() && !whileTest && instruction->analysisContinue)   // Continue mask is skipped while whileTest is set
+		{
+			enable &= r.enableContinue;
+		}
+
+		if(shader->containsLeaveInstruction() && instruction->analysisLeave)   // Leave mask applies regardless of whileTest
+		{
+			enable &= r.enableLeave;
+		}
+
+		return enable;
+	}
+
+	bool PixelRoutine::colorUsed()   // True when the color write mask is non-zero, alpha testing is active, or state.shaderContainsKill is set
+	{
+		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
+	}
+
+	unsigned short PixelRoutine::shaderVersion() const   // Version of the attached pixel shader, or 0x0000 when no shader is attached
+	{
+		return shader ? shader->getVersion() : 0x0000;
+	}
+
+	bool PixelRoutine::interpolateZ() const   // True when depth testing or pixel fog is active, or the shader declares vPos while fullPixelPositionRegister is enabled
+	{
+		return state.depthTestActive || state.pixelFogActive() || (shader && shader->vPosDeclared && fullPixelPositionRegister);
+	}
+
+	bool PixelRoutine::interpolateW() const   // True when state.perspective is set, or the shader declares vPos while fullPixelPositionRegister is enabled
+	{
+		return state.perspective || (shader && shader->vPosDeclared && fullPixelPositionRegister);
 	}
 }
diff --git a/src/Shader/PixelRoutine.hpp b/src/Shader/PixelRoutine.hpp
index c1070ff..29ea75a 100644
--- a/src/Shader/PixelRoutine.hpp
+++ b/src/Shader/PixelRoutine.hpp
@@ -1,304 +1,327 @@
-// SwiftShader Software Renderer
-//
-// Copyright(c) 2005-2011 TransGaming Inc.
-//
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
-//
-
-#ifndef sw_PixelRoutine_hpp
-#define sw_PixelRoutine_hpp
-
-#include "Rasterizer.hpp"
-#include "ShaderCore.hpp"
-
-#include "Types.hpp"
-
-namespace sw
-{
-	extern bool forceClearRegisters;
-
-	class PixelShader;
-	class SamplerCore;
-
-	class PixelRoutine : public Rasterizer, public ShaderCore
-	{
-		friend PixelProcessor;   // FIXME
-
-	public:
-		PixelRoutine(const PixelProcessor::State &state, const PixelShader *pixelShader);
-
-		~PixelRoutine();
-
-	protected:
-		struct Registers
-		{
-			Registers() : current(ri[0]), diffuse(vi[0]), specular(vi[1]), callStack(4), aL(4), increment(4), iteration(4), enableStack(1 + 24), vx(10), vy(10), vz(10), vw(10)
-			{
-				if(forceClearRegisters)
-				{
-					for(int i = 0; i < 10; i++)
-					{
-						vx[i] = Float4(0, 0, 0, 0);
-						vy[i] = Float4(0, 0, 0, 0);
-						vz[i] = Float4(0, 0, 0, 0);
-						vw[i] = Float4(0, 0, 0, 0);
-					}
-
-					for(int i = 0; i < 4; i++)
-					{
-						oC[i].r = Float4(0.0f);
-						oC[i].g = Float4(0.0f);
-						oC[i].b = Float4(0.0f);
-						oC[i].a = Float4(0.0f);
-					}
-				}
-
-				loopDepth = -1;
-				enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-				enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-
-				occlusion = 0;
-				
-				#if PERF_PROFILE
-					for(int i = 0; i < PERF_TIMERS; i++)
-					{
-						cycles[i] = 0;
-					}
-				#endif
-			}
-
-			Pointer<Byte> constants;
-
-			Pointer<Byte> primitive;
-			Int cluster;
-			Pointer<Byte> data;
-
-			Float4 z[4];
-			Float4 rhw;
-
-			Float4 Dz[4];
-			Float4 Dw;
-			Float4 Dv[10][4];
-			Float4 Df;
-
-			Color4i &current;
-			Color4i &diffuse;
-			Color4i &specular;
-
-			Color4i ri[6];
-			Color4i vi[2];
-			Color4i ti[6];
-
-			Color4f rf[32];
-			Array<Float4> vx;
-			Array<Float4> vy;
-			Array<Float4> vz;
-			Array<Float4> vw;
-
-			Color4f vPos;
-			Color4f vFace;
-
-			Color4f oC[4];
-			Float4 oDepth;
-
-			Color4f p0;
-			Array<Int> aL;
-
-			Array<Int> increment;
-			Array<Int> iteration;
-
-			Int loopDepth;
-			Int stackIndex;   // FIXME: Inc/decrement callStack
-			Array<UInt> callStack;
-
-			Int enableIndex;
-			Array<Int4> enableStack;
-			Int4 enableBreak;
-
-			// bem(l) offsets and luminance
-			Float4 du;
-			Float4 dv;
-			Short4 L;
-
-			// texm3x3 temporaries
-			Float4 u_;   // FIXME
-			Float4 v_;   // FIXME
-			Float4 w_;   // FIXME
-			Float4 U;   // FIXME
-			Float4 V;   // FIXME
-			Float4 W;   // FIXME
-
-			UInt occlusion;
-
-			#if PERF_PROFILE
-				Long cycles[PERF_TIMERS];
-			#endif
-		};
-
-		void quad(Registers &r, Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y);
-
-		Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
-		Float4 interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);
-		void stencilTest(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask);
-		void stencilTest(Registers &r, Byte8 &value, Context::StencilCompareMode stencilCompareMode, bool CCW);
-		void stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, Context::StencilOperation stencilPassOperation, Context::StencilOperation stencilZFailOperation, Context::StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask);
-		void stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, Context::StencilOperation operation, bool CCW);
-		Bool depthTest(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask);
-		void blendTexture(Registers &r, Color4i &current, Color4i &temp, Color4i &texture, int stage);
-		void alphaTest(Registers &r, Int &aMask, Short4 &alpha);
-		void alphaToCoverage(Registers &r, Int cMask[4], Float4 &alpha);
-		Bool alphaTest(Registers &r, Int cMask[4], Color4i &current);
-		Bool alphaTest(Registers &r, Int cMask[4], Color4f &c0);
-		void fogBlend(Registers &r, Color4i &current, Float4 &fog, Float4 &z, Float4 &rhw);
-		void fogBlend(Registers &r, Color4f &c0, Float4 &fog, Float4 &z, Float4 &rhw);
-		void pixelFog(Registers &r, Float4 &visibility, Float4 &z, Float4 &rhw);
-		void specularPixel(Color4i &current, Color4i &specular);
-
-		void sampleTexture(Registers &r, Color4i &c, int coordinates, int sampler, bool project = false);
-		void sampleTexture(Registers &r, Color4i &c, int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project = false, bool bias = false, bool fixed12 = true);
-		void sampleTexture(Registers &r, Color4i &c, int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool project = false, bool bias = false, bool fixed12 = true, bool gradients = false, bool lodProvided = false);
-		void sampleTexture(Registers &r, Color4f &c, int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool project = false, bool bias = false, bool gradients = false, bool lodProvided = false);
-	
-		// Raster operations
-		void clampColor(Color4f oC[4]);
-		void rasterOperation(Color4i &current, Registers &r, Float4 &fog, Pointer<Byte> &cBuffer, Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
-		void rasterOperation(Color4f oC[4], Registers &r, Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);
-		void blendFactor(Registers &r, const Color4i &blendFactor, const Color4i &current, const Color4i &pixel, Context::BlendFactor blendFactorActive);
-		void blendFactorAlpha(Registers &r, const Color4i &blendFactor, const Color4i &current, const Color4i &pixel, Context::BlendFactor blendFactorAlphaActive);
-		void alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Color4i &current, Int &x);
-		void writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &i, Color4i &current, Int &sMask, Int &zMask, Int &cMask);
-		void blendFactor(Registers &r, const Color4f &blendFactor, const Color4f &oC, const Color4f &pixel, Context::BlendFactor blendFactorActive);
-		void blendFactorAlpha(Registers &r, const Color4f &blendFactor, const Color4f &oC, const Color4f &pixel, Context::BlendFactor blendFactorAlphaActive);
-		void alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Color4f &oC, Int &x);
-		void writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &i, Color4f &oC, Int &sMask, Int &zMask, Int &cMask);
-		void writeStencil(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask);
-		void writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask);
-
-		void ps_1_x(Registers &r, Int cMask[4]);
-		void ps_2_x(Registers &r, Int cMask[4]);
-
-		Short4 convertFixed12(Float4 &cf);
-		void convertFixed12(Color4i &ci, Color4f &cf);
-		Float4 convertSigned12(Short4 &ci);
-		void convertSigned12(Color4f &cf, Color4i &ci);
-		Float4 convertUnsigned16(UShort4 ci);
-		UShort4 convertFixed16(Float4 &cf, bool saturate = true);
-		void convertFixed16(Color4i &ci, Color4f &cf, bool saturate = true);
-		void sRGBtoLinear16_16(Registers &r, Color4i &c);
-		void sRGBtoLinear12_16(Registers &r, Color4i &c);
-		void linearToSRGB16_16(Registers &r, Color4i &c);
-		void linearToSRGB12_16(Registers &r, Color4i &c);
-		Float4 sRGBtoLinear(const Float4 &x);
-		Float4 linearToSRGB(const Float4 &x);
-
-		typedef Shader::Instruction::DestinationParameter Dst;
-		typedef Shader::Instruction::SourceParameter Src;
-		typedef Shader::Instruction::Operation Op;
-		typedef Shader::Instruction::Operation::Control Control;
-
-		// ps_1_x instructions
-		void MOV(Color4i &dst, Color4i &src0);
-		void ADD(Color4i &dst, Color4i &src0, Color4i &src1);
-		void SUB(Color4i &dst, Color4i &src0, Color4i &src1);
-		void MAD(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2);
-		void MUL(Color4i &dst, Color4i &src0, Color4i &src1);
-		void DP3(Color4i &dst, Color4i &src0, Color4i &src1);
-		void DP4(Color4i &dst, Color4i &src0, Color4i &src1);
-		void LRP(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2);
-		void TEXCOORD(Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate);
-		void TEXCRD(Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project);
-		void TEXDP3(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, Color4i &src);
-		void TEXDP3TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0);
-		void TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s);
-		void TEXKILL(Int cMask[4], Color4i &dst);
-		void TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, bool project);
-		void TEXLD(Registers &r, Color4i &dst, Color4i &src, int stage, bool project);
-		void TEXBEM(Registers &r, Color4i &dst, Color4i &src, Float4 &u, Float4 &v, Float4 &s, int stage);
-		void TEXBEML(Registers &r, Color4i &dst, Color4i &src, Float4 &u, Float4 &v, Float4 &s, int stage);
-		void TEXREG2AR(Registers &r, Color4i &dst, Color4i &src0, int stage);
-		void TEXREG2GB(Registers &r, Color4i &dst, Color4i &src0, int stage);
-		void TEXREG2RGB(Registers &r, Color4i &dst, Color4i &src0, int stage);
-		void TEXM3X2DEPTH(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, Color4i &src, bool signedScaling);
-		void TEXM3X2PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Color4i &src0, int component, bool signedScaling);
-		void TEXM3X2TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0, bool signedScaling);
-		void TEXM3X3(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, Color4i &src0, bool signedScaling);
-		void TEXM3X3PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Color4i &src0, int component, bool signedScaling);
-		void TEXM3X3SPEC(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0, Color4i &src1);
-		void TEXM3X3TEX(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0, bool singedScaling);
-		void TEXM3X3VSPEC(Registers &r, Color4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Color4i &src0);
-		void TEXDEPTH(Registers &r);
-		void CND(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2);
-		void CMP(Color4i &dst, Color4i &src0, Color4i &src1, Color4i &src2);
-		void BEM(Registers &r, Color4i &dst, Color4i &src0, Color4i &src1, int stage);
-
-		// ps_2_x instructions
-		void M3X2(Registers &r, Color4f &dst, Color4f &src0, const Src &src1);
-		void M3X3(Registers &r, Color4f &dst, Color4f &src0, const Src &src1);
-		void M3X4(Registers &r, Color4f &dst, Color4f &src0, const Src &src1);
-		void M4X3(Registers &r, Color4f &dst, Color4f &src0, const Src &src1);
-		void M4X4(Registers &r, Color4f &dst, Color4f &src0, const Src &src1);
-		void TEXLD(Registers &r, Color4f &dst, Color4f &src0, const Src &src1, bool project, bool bias);
-		void TEXLDD(Registers &r, Color4f &dst, Color4f &src0, const Src &src1, Color4f &src2,  Color4f &src3, bool project, bool bias);
-		void TEXLDL(Registers &r, Color4f &dst, Color4f &src0, const Src &src1, bool project, bool bias);
-		void TEXKILL(Int cMask[4], Color4f &src, unsigned char mask);
-		void DSX(Color4f &dst, Color4f &src);
-		void DSY(Color4f &dst, Color4f &src);
-		void BREAK(Registers &r);
-		void BREAKC(Registers &r, Color4f &src0, Color4f &src1, Control);
-		void BREAKP(Registers &r, const Src &predicateRegister);
-		void CALL(Registers &r, int labelIndex);
-		void CALLNZ(Registers &r, int labelIndex, const Src &src);
-		void CALLNZb(Registers &r, int labelIndex, const Src &boolRegister);
-		void CALLNZp(Registers &r, int labelIndex, const Src &predicateRegister);
-		void ELSE(Registers &r);
-		void ENDIF(Registers &r);
-		void ENDLOOP(Registers &r);
-		void ENDREP(Registers &r);
-		void IF(Registers &r, const Src &src);
-		void IFb(Registers &r, const Src &boolRegister);
-		void IFp(Registers &r, const Src &predicateRegister);
-		void IFC(Registers &r, Color4f &src0, Color4f &src1, Control);
-		void LABEL(int labelIndex);
-		void LOOP(Registers &r, const Src &integerRegister);
-		void REP(Registers &r, const Src &integerRegister);
-		void RET(Registers &r);
-
-		void readConstant(Registers &r, int index);
-
-		void writeDestination(Registers &r, Color4i &d, const Dst &dst);
-		Color4i regi(Registers &r, const Src &src);
-		Color4f reg(Registers &r, const Src &src, int offset = 0);
-
-		bool colorUsed();
-		unsigned short pixelShaderVersion() const;
-
-	private:
-		SamplerCore *sampler[16];
-
-		bool perturbate;
-		bool luminance;
-		bool previousScaling;
-
-		bool returns;
-		int ifDepth;
-		int loopRepDepth;
-		int breakDepth;
-
-		// FIXME: Get rid of llvm::
-		llvm::BasicBlock *ifFalseBlock[24 + 24];
-		llvm::BasicBlock *loopRepTestBlock[4];
-		llvm::BasicBlock *loopRepEndBlock[4];
-		llvm::BasicBlock *labelBlock[2048];
-		std::vector<llvm::BasicBlock*> callRetBlock;
-		llvm::BasicBlock *returnBlock;
-		bool isConditionalIf[24 + 24];
-
-		const PixelShader *const pixelShader;
-	};
-}
-
-#endif   // sw_PixelRoutine_hpp
+// SwiftShader Software Renderer

+//

+// Copyright(c) 2005-2012 TransGaming Inc.

+//

+// All rights reserved. No part of this software may be copied, distributed, transmitted,

+// transcribed, stored in a retrieval system, translated into any human or computer

+// language by any means, or disclosed to third parties without the explicit written

+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express

+// or implied, including but not limited to any patent rights, are granted to you.

+//

+

+#ifndef sw_PixelRoutine_hpp

+#define sw_PixelRoutine_hpp

+

+#include "Rasterizer.hpp"

+#include "ShaderCore.hpp"

+#include "PixelShader.hpp"

+

+#include "Types.hpp"

+

+namespace sw

+{

+	extern bool forceClearRegisters;

+

+	class PixelShader;

+	class SamplerCore;

+

+	class PixelRoutine : public Rasterizer, public ShaderCore

+	{

+		friend PixelProcessor;   // FIXME

+

+	public:

+		PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader);

+

+		~PixelRoutine();

+

+	protected:

+		struct Registers

+		{

+			Registers(const PixelShader *shader) :

+				current(ri[0]), diffuse(vi[0]), specular(vi[1]),

+				rf(shader && shader->dynamicallyIndexedTemporaries),

+				vf(shader && shader->dynamicallyIndexedInput)

+			{

+				if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)

+				{

+					for(int i = 0; i < 10; i++)

+					{

+						vf[i].x = Float4(0.0f);

+						vf[i].y = Float4(0.0f);

+						vf[i].z = Float4(0.0f);

+						vf[i].w = Float4(0.0f);

+					}

+				}

+

+				loopDepth = -1;

+				enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

+				

+				if(shader && shader->containsBreakInstruction())

+				{

+					enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

+				}

+

+				if(shader && shader->containsContinueInstruction())

+				{

+					enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

+				}

+

+				if(shader && shader->containsLeaveInstruction())

+				{

+					enableLeave = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

+				}

+

+				occlusion = 0;

+				

+				#if PERF_PROFILE

+					for(int i = 0; i < PERF_TIMERS; i++)

+					{

+						cycles[i] = 0;

+					}

+				#endif

+			}

+

+			Pointer<Byte> constants;

+

+			Pointer<Byte> primitive;

+			Int cluster;

+			Pointer<Byte> data;

+

+			Float4 z[4];

+			Float4 w;

+			Float4 rhw;

+

+			Float4 Dz[4];

+			Float4 Dw;

+			Float4 Dv[10][4];

+			Float4 Df;

+

+			Vector4i &current;

+			Vector4i &diffuse;

+			Vector4i &specular;

+

+			Vector4i ri[6];

+			Vector4i vi[2];

+			Vector4i ti[6];

+

+			RegisterArray<4096> rf;

+			RegisterArray<10> vf;

+

+			Vector4f vPos;

+			Vector4f vFace;

+

+			Vector4f oC[4];

+			Float4 oDepth;

+

+			Vector4f p0;

+			Array<Int, 4> aL;

+

+			Array<Int, 4> increment;

+			Array<Int, 4> iteration;

+

+			Int loopDepth;

+			Int stackIndex;   // FIXME: Inc/decrement callStack

+			Array<UInt, 4> callStack;

+

+			Int enableIndex;

+			Array<Int4, 1 + 24> enableStack;

+			Int4 enableBreak;

+			Int4 enableContinue;

+			Int4 enableLeave;

+

+			// bem(l) offsets and luminance

+			Float4 du;

+			Float4 dv;

+			Short4 L;

+

+			// texm3x3 temporaries

+			Float4 u_;   // FIXME

+			Float4 v_;   // FIXME

+			Float4 w_;   // FIXME

+			Float4 U;   // FIXME

+			Float4 V;   // FIXME

+			Float4 W;   // FIXME

+

+			UInt occlusion;

+

+			#if PERF_PROFILE

+				Long cycles[PERF_TIMERS];

+			#endif

+		};

+

+		typedef Shader::DestinationParameter Dst;

+		typedef Shader::SourceParameter Src;

+		typedef Shader::Control Control;

+

+		void quad(Registers &r, Pointer<Byte> cBuffer[4], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y);

+

+		Float4 interpolate(Float4 &x, Float4 &D, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);

+		Float4 interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective);

+		void stencilTest(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask);

+		void stencilTest(Registers &r, Byte8 &value, Context::StencilCompareMode stencilCompareMode, bool CCW);

+		void stencilOperation(Registers &r, Byte8 &newValue, Byte8 &bufferValue, Context::StencilOperation stencilPassOperation, Context::StencilOperation stencilZFailOperation, Context::StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask);

+		void stencilOperation(Registers &r, Byte8 &output, Byte8 &bufferValue, Context::StencilOperation operation, bool CCW);

+		Bool depthTest(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask);

+		void blendTexture(Registers &r, Vector4i &current, Vector4i &temp, Vector4i &texture, int stage);

+		void alphaTest(Registers &r, Int &aMask, Short4 &alpha);

+		void alphaToCoverage(Registers &r, Int cMask[4], Float4 &alpha);

+		Bool alphaTest(Registers &r, Int cMask[4], Vector4i &current);

+		Bool alphaTest(Registers &r, Int cMask[4], Vector4f &c0);

+		void fogBlend(Registers &r, Vector4i &current, Float4 &fog, Float4 &z, Float4 &rhw);

+		void fogBlend(Registers &r, Vector4f &c0, Float4 &fog, Float4 &z, Float4 &rhw);

+		void pixelFog(Registers &r, Float4 &visibility, Float4 &z, Float4 &rhw);

+		void specularPixel(Vector4i &current, Vector4i &specular);

+

+		void sampleTexture(Registers &r, Vector4i &c, int coordinates, int sampler, bool project = false);

+		void sampleTexture(Registers &r, Vector4i &c, int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project = false, bool bias = false, bool fixed12 = true);

+		void sampleTexture(Registers &r, Vector4i &c, int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project = false, bool bias = false, bool fixed12 = true, bool gradients = false, bool lodProvided = false);

+		void sampleTexture(Registers &r, Vector4f &c, const Src &sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project = false, bool bias = false, bool gradients = false, bool lodProvided = false);

+		void sampleTexture(Registers &r, Vector4f &c, int sampler, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool project = false, bool bias = false, bool gradients = false, bool lodProvided = false);

+	

+		// Raster operations

+		void clampColor(Vector4f oC[4]);

+		void rasterOperation(Vector4i &current, Registers &r, Float4 &fog, Pointer<Byte> &cBuffer, Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);

+		void rasterOperation(Vector4f oC[4], Registers &r, Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4]);

+		void blendFactor(Registers &r, const Vector4i &blendFactor, const Vector4i &current, const Vector4i &pixel, Context::BlendFactor blendFactorActive);

+		void blendFactorAlpha(Registers &r, const Vector4i &blendFactor, const Vector4i &current, const Vector4i &pixel, Context::BlendFactor blendFactorAlphaActive);

+		void alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4i &current, Int &x);

+		void writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &i, Vector4i &current, Int &sMask, Int &zMask, Int &cMask);

+		void blendFactor(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, Context::BlendFactor blendFactorActive);

+		void blendFactorAlpha(Registers &r, const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, Context::BlendFactor blendFactorAlphaActive);

+		void alphaBlend(Registers &r, int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x);

+		void writeColor(Registers &r, int index, Pointer<Byte> &cBuffer, Int &i, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask);

+		void writeStencil(Registers &r, Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask);

+		void writeDepth(Registers &r, Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask);

+

+		void ps_1_x(Registers &r, Int cMask[4]);

+		void ps_2_x(Registers &r, Int cMask[4]);

+

+		Short4 convertFixed12(RValue<Float4> cf);

+		void convertFixed12(Vector4i &ci, Vector4f &cf);

+		Float4 convertSigned12(Short4 &ci);

+		void convertSigned12(Vector4f &cf, Vector4i &ci);

+		Float4 convertUnsigned16(UShort4 ci);

+		UShort4 convertFixed16(Float4 &cf, bool saturate = true);

+		void convertFixed16(Vector4i &ci, Vector4f &cf, bool saturate = true);

+		void sRGBtoLinear16_16(Registers &r, Vector4i &c);

+		void sRGBtoLinear12_16(Registers &r, Vector4i &c);

+		void linearToSRGB16_16(Registers &r, Vector4i &c);

+		void linearToSRGB12_16(Registers &r, Vector4i &c);

+		Float4 sRGBtoLinear(const Float4 &x);

+		Float4 linearToSRGB(const Float4 &x);

+

+		// ps_1_x instructions

+		void MOV(Vector4i &dst, Vector4i &src0);

+		void ADD(Vector4i &dst, Vector4i &src0, Vector4i &src1);

+		void SUB(Vector4i &dst, Vector4i &src0, Vector4i &src1);

+		void MAD(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2);

+		void MUL(Vector4i &dst, Vector4i &src0, Vector4i &src1);

+		void DP3(Vector4i &dst, Vector4i &src0, Vector4i &src1);

+		void DP4(Vector4i &dst, Vector4i &src0, Vector4i &src1);

+		void LRP(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2);

+		void TEXCOORD(Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate);

+		void TEXCRD(Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project);

+		void TEXDP3(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src);

+		void TEXDP3TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0);

+		void TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s);

+		void TEXKILL(Int cMask[4], Vector4i &dst);

+		void TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, bool project);

+		void TEXLD(Registers &r, Vector4i &dst, Vector4i &src, int stage, bool project);

+		void TEXBEM(Registers &r, Vector4i &dst, Vector4i &src, Float4 &u, Float4 &v, Float4 &s, int stage);

+		void TEXBEML(Registers &r, Vector4i &dst, Vector4i &src, Float4 &u, Float4 &v, Float4 &s, int stage);

+		void TEXREG2AR(Registers &r, Vector4i &dst, Vector4i &src0, int stage);

+		void TEXREG2GB(Registers &r, Vector4i &dst, Vector4i &src0, int stage);

+		void TEXREG2RGB(Registers &r, Vector4i &dst, Vector4i &src0, int stage);

+		void TEXM3X2DEPTH(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src, bool signedScaling);

+		void TEXM3X2PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, int component, bool signedScaling);

+		void TEXM3X2TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, bool signedScaling);

+		void TEXM3X3(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, bool signedScaling);

+		void TEXM3X3PAD(Registers &r, Float4 &u, Float4 &v, Float4 &s, Vector4i &src0, int component, bool signedScaling);

+		void TEXM3X3SPEC(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, Vector4i &src1);

+		void TEXM3X3TEX(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0, bool singedScaling);

+		void TEXM3X3VSPEC(Registers &r, Vector4i &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4i &src0);

+		void TEXDEPTH(Registers &r);

+		void CND(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2);

+		void CMP(Vector4i &dst, Vector4i &src0, Vector4i &src1, Vector4i &src2);

+		void BEM(Registers &r, Vector4i &dst, Vector4i &src0, Vector4i &src1, int stage);

+

+		// ps_2_x instructions

+		void M3X2(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1);

+		void M3X3(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1);

+		void M3X4(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1);

+		void M4X3(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1);

+		void M4X4(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1);

+		void TEXLD(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias);

+		void TEXLDD(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1, Vector4f &src2,  Vector4f &src3, bool project, bool bias);

+		void TEXLDL(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1, bool project, bool bias);

+		void TEXKILL(Int cMask[4], Vector4f &src, unsigned char mask);

+		void DISCARD(Registers &r, Int cMask[4], const Shader::Instruction *instruction);

+		void DFDX(Vector4f &dst, Vector4f &src);

+		void DFDY(Vector4f &dst, Vector4f &src);

+		void FWIDTH(Vector4f &dst, Vector4f &src);

+		void BREAK(Registers &r);

+		void BREAKC(Registers &r, Vector4f &src0, Vector4f &src1, Control);

+		void BREAKP(Registers &r, const Src &predicateRegister);

+		void BREAK(Registers &r, Int4 &condition);

+		void CONTINUE(Registers &r);

+		void TEST();

+		void CALL(Registers &r, int labelIndex, int callSiteIndex);

+		void CALLNZ(Registers &r, int labelIndex, int callSiteIndex, const Src &src);

+		void CALLNZb(Registers &r, int labelIndex, int callSiteIndex, const Src &boolRegister);

+		void CALLNZp(Registers &r, int labelIndex, int callSiteIndex, const Src &predicateRegister);

+		void ELSE(Registers &r);

+		void ENDIF(Registers &r);

+		void ENDLOOP(Registers &r);

+		void ENDREP(Registers &r);

+		void ENDWHILE(Registers &r);

+		void IF(Registers &r, const Src &src);

+		void IFb(Registers &r, const Src &boolRegister);

+		void IFp(Registers &r, const Src &predicateRegister);

+		void IFC(Registers &r, Vector4f &src0, Vector4f &src1, Control);

+		void IF(Registers &r, Int4 &condition);

+		void LABEL(int labelIndex);

+		void LOOP(Registers &r, const Src &integerRegister);

+		void REP(Registers &r, const Src &integerRegister);

+		void WHILE(Registers &r, const Src &temporaryRegister);

+		void RET(Registers &r);

+		void LEAVE(Registers &r);

+

+		void writeDestination(Registers &r, Vector4i &d, const Dst &dst);

+		Vector4i regi(Registers &r, const Src &src);

+		Vector4f reg(Registers &r, const Src &src, int offset = 0);

+		Vector4f readConstant(Registers &r, const Src &src, int offset = 0);

+		Int relativeAddress(Registers &r, const Shader::Parameter &var);

+		Int4 enableMask(Registers &r, const Shader::Instruction *instruction);

+

+		bool colorUsed();

+		unsigned short shaderVersion() const;

+		bool interpolateZ() const;

+		bool interpolateW() const;

+

+		const PixelShader *const shader;

+

+	private:

+		SamplerCore *sampler[16];

+

+		bool perturbate;

+		bool luminance;

+		bool previousScaling;

+

+		int ifDepth;

+		int loopRepDepth;

+		int breakDepth;

+		int currentLabel;

+		bool whileTest;

+

+		// FIXME: Get rid of llvm::

+		llvm::BasicBlock *ifFalseBlock[24 + 24];

+		llvm::BasicBlock *loopRepTestBlock[4];

+		llvm::BasicBlock *loopRepEndBlock[4];

+		llvm::BasicBlock *labelBlock[2048];

+		std::vector<llvm::BasicBlock*> callRetBlock[2048];

+		llvm::BasicBlock *returnBlock;

+		bool isConditionalIf[24 + 24];

+	};

+}

+

+#endif   // sw_PixelRoutine_hpp

diff --git a/src/Shader/PixelShader.cpp b/src/Shader/PixelShader.cpp
index 7bf072a..3a43892 100644
--- a/src/Shader/PixelShader.cpp
+++ b/src/Shader/PixelShader.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -15,50 +15,44 @@
 
 namespace sw
 {
-	PixelShader::PixelShader(const unsigned long *token) : Shader(token)
+	PixelShader::PixelShader(const PixelShader *ps) : Shader()   // Creates an empty shader, or copies 'ps' instruction by instruction when it is non-null
+	{
+		version = 0x0300;   // Always 0x0300, also when copying: ps->version is not consulted
+		vPosDeclared = false;
+		vFaceDeclared = false;
+		centroid = false;   // Not copied from 'ps' below
+
+		if(ps)   // Make a copy
+		{
+			for(int i = 0; i < ps->getLength(); i++)
+			{
+				append(new sw::Shader::Instruction(*ps->getInstruction(i)));   // Copy-constructs each instruction; presumably Shader owns it after append() (confirm in Shader.cpp)
+			}
+
+			memcpy(semantic, ps->semantic, sizeof(semantic));   // Bytewise copy of the whole fixed-size semantic table
+			vPosDeclared = ps->vPosDeclared;
+			vFaceDeclared = ps->vFaceDeclared;
+			usedSamplers = ps->usedSamplers;
+
+			analyze();   // Recomputes the analysis results (zOverride, kill, ...) from the copied instructions
+		}
+	}
+
+	PixelShader::PixelShader(const unsigned long *token) : Shader()   // Builds the shader by parsing a token stream, then analyzes it
 	{
 		parse(token);
+
+		vPosDeclared = false;   // Reset after parse() and before analyze(); these three resets used to sit at the top of analyzeInterpolants()
+		vFaceDeclared = false;
+		centroid = false;
+
+		analyze();   // Runs all analysis passes over the parsed instructions
 	}
 
 	PixelShader::~PixelShader()
 	{
 	}
 
-	void PixelShader::parse(const unsigned long *token)
-	{
-		minorVersion = (unsigned char)(token[0] & 0x000000FF);
-		majorVersion = (unsigned char)((token[0] & 0x0000FF00) >> 8);
-		shaderType = (ShaderType)((token[0] & 0xFFFF0000) >> 16);
-
-		length = validate(token);
-		ASSERT(length != 0);
-
-		instruction = new Shader::Instruction*[length];
-
-		for(int i = 0; i < length; i++)
-		{
-			while((*token & 0x0000FFFF) == 0x0000FFFE)   // Comment token
-			{
-				int length = (*token & 0x7FFF0000) >> 16;
-
-				token += length + 1;
-			}
-
-			int length = size(*token);
-
-			instruction[i] = new Instruction(token, length, majorVersion);
-
-			token += length + 1;
-		}
-
-		analyzeZOverride();
-		analyzeTexkill();
-		analyzeInterpolants();
-		analyzeDirtyConstants();
-		analyzeDynamicBranching();
-		analyzeSamplers();
-	}
-
 	int PixelShader::validate(const unsigned long *const token)
 	{
 		if(!token)
@@ -88,12 +82,12 @@
 			}
 			else
 			{
-				ShaderOpcode opcode = (ShaderOpcode)(token[i] & 0x0000FFFF);
+				Shader::Opcode opcode = (Shader::Opcode)(token[i] & 0x0000FFFF);
 
 				switch(opcode)
 				{
-				case ShaderOperation::OPCODE_RESERVED0:
-				case ShaderOperation::OPCODE_MOVA:
+				case Shader::OPCODE_RESERVED0:
+				case Shader::OPCODE_MOVA:
 					return 0;   // Unsupported operation
 				default:
 					instructionCount++;
@@ -112,9 +106,9 @@
 		return zOverride;
 	}
 
-	bool PixelShader::containsTexkill() const
+	bool PixelShader::containsKill() const   // Renamed from containsTexkill(); the flag now also covers DISCARD
 	{
-		return texkill;
+		return kill;   // Set by analyzeKill() when a TEXKILL or DISCARD instruction is present
 	}
 
 	bool PixelShader::containsCentroid() const
@@ -137,15 +131,27 @@
 		return semantic[2 + coordinate][component].active();
 	}
 
+	void PixelShader::analyze()   // Runs every analysis pass in a fixed order; invoked from the constructors
+	{
+		analyzeZOverride();   // Sets zOverride
+		analyzeKill();   // Sets kill (TEXKILL or DISCARD present)
+		analyzeInterpolants();   // Fills in semantic[][]
+		analyzeDirtyConstants();   // This and the passes below are not declared in PixelShader.hpp; presumably inherited from Shader (confirm)
+		analyzeDynamicBranching();
+		analyzeSamplers();
+		analyzeCallSites();   // Not part of the pass list in the removed parse()
+		analyzeDynamicIndexing();   // Not part of the pass list in the removed parse()
+	}
+
 	void PixelShader::analyzeZOverride()
 	{
 		zOverride = false;
 
-		for(int i = 0; i < length; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
-			if(instruction[i]->getOpcode() == Instruction::Operation::OPCODE_TEXM3X2DEPTH ||
-			   instruction[i]->getOpcode() == Instruction::Operation::OPCODE_TEXDEPTH ||
-			   instruction[i]->getDestinationParameter().type == Instruction::DestinationParameter::PARAMETER_DEPTHOUT)
+			if(instruction[i]->opcode == Shader::OPCODE_TEXM3X2DEPTH ||
+			   instruction[i]->opcode == Shader::OPCODE_TEXDEPTH ||
+			   instruction[i]->dst.type == Shader::PARAMETER_DEPTHOUT)
 			{
 				zOverride = true;
 
@@ -154,15 +160,16 @@
 		}
 	}
 
-	void PixelShader::analyzeTexkill()
+	void PixelShader::analyzeKill()
 	{
-		texkill = false;
+		kill = false;
 
-		for(int i = 0; i < length; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
-			if(instruction[i]->getOpcode() == Instruction::Operation::OPCODE_TEXKILL)
+			if(instruction[i]->opcode == Shader::OPCODE_TEXKILL ||
+			   instruction[i]->opcode == Shader::OPCODE_DISCARD)
 			{
-				texkill = true;
+				kill = true;
 
 				break;
 			}
@@ -171,76 +178,72 @@
 
 	void PixelShader::analyzeInterpolants()
 	{
-		vPosDeclared = false;
-		vFaceDeclared = false;
-		centroid = false;
-
 		if(version < 0x0300)
 		{
 			// Set default mapping; disable unused interpolants below
-			semantic[0][0] = Semantic(ShaderOperation::USAGE_COLOR, 0);
-			semantic[0][1] = Semantic(ShaderOperation::USAGE_COLOR, 0);
-			semantic[0][2] = Semantic(ShaderOperation::USAGE_COLOR, 0);
-			semantic[0][3] = Semantic(ShaderOperation::USAGE_COLOR, 0);
+			semantic[0][0] = Semantic(Shader::USAGE_COLOR, 0);
+			semantic[0][1] = Semantic(Shader::USAGE_COLOR, 0);
+			semantic[0][2] = Semantic(Shader::USAGE_COLOR, 0);
+			semantic[0][3] = Semantic(Shader::USAGE_COLOR, 0);
 
-			semantic[1][0] = Semantic(ShaderOperation::USAGE_COLOR, 1);
-			semantic[1][1] = Semantic(ShaderOperation::USAGE_COLOR, 1);
-			semantic[1][2] = Semantic(ShaderOperation::USAGE_COLOR, 1);
-			semantic[1][3] = Semantic(ShaderOperation::USAGE_COLOR, 1);
+			semantic[1][0] = Semantic(Shader::USAGE_COLOR, 1);
+			semantic[1][1] = Semantic(Shader::USAGE_COLOR, 1);
+			semantic[1][2] = Semantic(Shader::USAGE_COLOR, 1);
+			semantic[1][3] = Semantic(Shader::USAGE_COLOR, 1);
 
 			for(int i = 0; i < 8; i++)
 			{
-				semantic[2 + i][0] = Semantic(ShaderOperation::USAGE_TEXCOORD, i);
-				semantic[2 + i][1] = Semantic(ShaderOperation::USAGE_TEXCOORD, i);
-				semantic[2 + i][2] = Semantic(ShaderOperation::USAGE_TEXCOORD, i);
-				semantic[2 + i][3] = Semantic(ShaderOperation::USAGE_TEXCOORD, i);
+				semantic[2 + i][0] = Semantic(Shader::USAGE_TEXCOORD, i);
+				semantic[2 + i][1] = Semantic(Shader::USAGE_TEXCOORD, i);
+				semantic[2 + i][2] = Semantic(Shader::USAGE_TEXCOORD, i);
+				semantic[2 + i][3] = Semantic(Shader::USAGE_TEXCOORD, i);
 			}
 
-			Instruction::Operation::SamplerType samplerType[16];
+			Shader::SamplerType samplerType[16];
 
 			for(int i = 0; i < 16; i++)
 			{
-				samplerType[i] = Instruction::Operation::SAMPLER_UNKNOWN;
+				samplerType[i] = Shader::SAMPLER_UNKNOWN;
 			}
 
-			for(int i = 0; i < length; i++)
+			for(unsigned int i = 0; i < instruction.size(); i++)
 			{
-				if(instruction[i]->getDestinationParameter().type == Instruction::SourceParameter::PARAMETER_SAMPLER)
+				if(instruction[i]->dst.type == Shader::PARAMETER_SAMPLER)
 				{
-					int sampler = instruction[i]->getDestinationParameter().index;
+					int sampler = instruction[i]->dst.index;
 
-					samplerType[sampler] = instruction[i]->getSamplerType();
+					samplerType[sampler] = instruction[i]->samplerType;
 				}
 			}
 
 			bool interpolant[10][4] = {false};   // Interpolants in use
 
-			for(int i = 0; i < length; i++)
+			for(unsigned int i = 0; i < instruction.size(); i++)
 			{
-				if(instruction[i]->getDestinationParameter().type == Instruction::SourceParameter::PARAMETER_TEXTURE)
+				if(instruction[i]->dst.type == Shader::PARAMETER_TEXTURE)
 				{	
-					int index = instruction[i]->getDestinationParameter().index + 2;
-					int mask = instruction[i]->getDestinationParameter().mask;
+					int index = instruction[i]->dst.index + 2;
+					int mask = instruction[i]->dst.mask;
 
-					switch(instruction[i]->getOpcode())
+					switch(instruction[i]->opcode)
 					{
-					case Instruction::Operation::OPCODE_TEX:
-					case Instruction::Operation::OPCODE_TEXBEM:
-					case Instruction::Operation::OPCODE_TEXBEML:
-					case Instruction::Operation::OPCODE_TEXCOORD:
-					case Instruction::Operation::OPCODE_TEXDP3:
-					case Instruction::Operation::OPCODE_TEXDP3TEX:
-					case Instruction::Operation::OPCODE_TEXM3X2DEPTH:
-					case Instruction::Operation::OPCODE_TEXM3X2PAD:
-					case Instruction::Operation::OPCODE_TEXM3X2TEX:
-					case Instruction::Operation::OPCODE_TEXM3X3:
-					case Instruction::Operation::OPCODE_TEXM3X3PAD:
-					case Instruction::Operation::OPCODE_TEXM3X3TEX:
+					case Shader::OPCODE_TEX:
+					case Shader::OPCODE_TEXBEM:
+					case Shader::OPCODE_TEXBEML:
+					case Shader::OPCODE_TEXCOORD:
+					case Shader::OPCODE_TEXDP3:
+					case Shader::OPCODE_TEXDP3TEX:
+					case Shader::OPCODE_TEXM3X2DEPTH:
+					case Shader::OPCODE_TEXM3X2PAD:
+					case Shader::OPCODE_TEXM3X2TEX:
+					case Shader::OPCODE_TEXM3X3:
+					case Shader::OPCODE_TEXM3X3PAD:
+					case Shader::OPCODE_TEXM3X3TEX:
 						interpolant[index][0] = true;
 						interpolant[index][1] = true;
 						interpolant[index][2] = true;
 						break;
-					case Instruction::Operation::OPCODE_TEXKILL:
+					case Shader::OPCODE_TEXKILL:
 						if(majorVersion < 2)
 						{
 							interpolant[index][0] = true;
@@ -255,7 +258,7 @@
 							interpolant[index][3] = true;
 						}
 						break;
-					case Instruction::Operation::OPCODE_TEXM3X3VSPEC:
+					case Shader::OPCODE_TEXM3X3VSPEC:
 						interpolant[index][0] = true;
 						interpolant[index][1] = true;
 						interpolant[index][2] = true;
@@ -263,7 +266,7 @@
 						interpolant[index - 1][3] = true;
 						interpolant[index - 0][3] = true;
 						break;
-					case Instruction::Operation::OPCODE_DCL:
+					case Shader::OPCODE_DCL:
 						break;   // Ignore
 					default:   // Arithmetic instruction
 						if(version >= 0x0104)
@@ -275,32 +278,32 @@
 
 				for(int argument = 0; argument < 4; argument++)
 				{
-					if(instruction[i]->getSourceParameter(argument).type == Instruction::SourceParameter::PARAMETER_INPUT ||
-					   instruction[i]->getSourceParameter(argument).type == Instruction::SourceParameter::PARAMETER_TEXTURE)
+					if(instruction[i]->src[argument].type == Shader::PARAMETER_INPUT ||
+					   instruction[i]->src[argument].type == Shader::PARAMETER_TEXTURE)
 					{
-						int index = instruction[i]->getSourceParameter(argument).index;
-						int swizzle = instruction[i]->getSourceParameter(argument).swizzle;
-						int mask = instruction[i]->getDestinationParameter().mask;
+						int index = instruction[i]->src[argument].index;
+						int swizzle = instruction[i]->src[argument].swizzle;
+						int mask = instruction[i]->dst.mask;
 						
-						if(instruction[i]->getSourceParameter(argument).type == Instruction::SourceParameter::PARAMETER_TEXTURE)
+						if(instruction[i]->src[argument].type == Shader::PARAMETER_TEXTURE)
 						{
 							index += 2;
 						}
 
-						switch(instruction[i]->getOpcode())
+						switch(instruction[i]->opcode)
 						{
-						case Instruction::Operation::OPCODE_TEX:
-						case Instruction::Operation::OPCODE_TEXLDD:
-						case Instruction::Operation::OPCODE_TEXLDL:
+						case Shader::OPCODE_TEX:
+						case Shader::OPCODE_TEXLDD:
+						case Shader::OPCODE_TEXLDL:
 							{
-								int sampler = instruction[i]->getSourceParameter(1).index;
+								int sampler = instruction[i]->src[1].index;
 
 								switch(samplerType[sampler])
 								{
-								case Instruction::Operation::SAMPLER_UNKNOWN:
+								case Shader::SAMPLER_UNKNOWN:
 									if(version == 0x0104)
 									{
-										if((instruction[i]->getSourceParameter(0).swizzle & 0x30) == 0x20)   // .xyz
+										if((instruction[i]->src[0].swizzle & 0x30) == 0x20)   // .xyz
 										{
 											interpolant[index][0] = true;
 											interpolant[index][1] = true;
@@ -318,19 +321,19 @@
 										ASSERT(false);
 									}
 									break;
-								case Instruction::Operation::SAMPLER_1D:
+								case Shader::SAMPLER_1D:
 									interpolant[index][0] = true;
 									break;
-								case Instruction::Operation::SAMPLER_2D:
+								case Shader::SAMPLER_2D:
 									interpolant[index][0] = true;
 									interpolant[index][1] = true;
 									break;
-								case Instruction::Operation::SAMPLER_CUBE:
+								case Shader::SAMPLER_CUBE:
 									interpolant[index][0] = true;
 									interpolant[index][1] = true;
 									interpolant[index][2] = true;
 									break;
-								case Instruction::Operation::SAMPLER_VOLUME:
+								case Shader::SAMPLER_VOLUME:
 									interpolant[index][0] = true;
 									interpolant[index][1] = true;
 									interpolant[index][2] = true;
@@ -339,31 +342,31 @@
 									ASSERT(false);
 								}
 
-								if(instruction[i]->isBias())
+								if(instruction[i]->bias)
 								{
 									interpolant[index][3] = true;
 								}
 
-								if(instruction[i]->isProject())
+								if(instruction[i]->project)
 								{
 									interpolant[index][3] = true;
 								}
 
-								if(version == 0x0104 && instruction[i]->getOpcode() == Instruction::Operation::OPCODE_TEX)
+								if(version == 0x0104 && instruction[i]->opcode == Shader::OPCODE_TEX)
 								{
-									if(instruction[i]->getSourceParameter(0).modifier == Instruction::SourceParameter::MODIFIER_DZ)
+									if(instruction[i]->src[0].modifier == Shader::MODIFIER_DZ)
 									{
 										interpolant[index][2] = true;
 									}
 
-									if(instruction[i]->getSourceParameter(0).modifier == Instruction::SourceParameter::MODIFIER_DW)
+									if(instruction[i]->src[0].modifier == Shader::MODIFIER_DW)
 									{
 										interpolant[index][3] = true;
 									}
 								}
 							}
 							break;
-						case Instruction::Operation::OPCODE_M3X2:
+						case Shader::OPCODE_M3X2:
 							if(mask & 0x1)
 							{
 								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
@@ -383,7 +386,7 @@
 								}
 							}
 							break;
-						case Instruction::Operation::OPCODE_M3X3:
+						case Shader::OPCODE_M3X3:
 							if(mask & 0x1)
 							{
 								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
@@ -411,7 +414,7 @@
 								}
 							}
 							break;
-						case Instruction::Operation::OPCODE_M3X4:
+						case Shader::OPCODE_M3X4:
 							if(mask & 0x1)
 							{
 								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
@@ -447,7 +450,7 @@
 								}
 							}
 							break;
-						case Instruction::Operation::OPCODE_M4X3:
+						case Shader::OPCODE_M4X3:
 							if(mask & 0x1)
 							{
 								interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
@@ -475,7 +478,7 @@
 								}
 							}
 							break;
-						case Instruction::Operation::OPCODE_M4X4:
+						case Shader::OPCODE_M4X4:
 							if(mask & 0x1)
 							{
 								interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
@@ -511,7 +514,7 @@
 								}
 							}
 							break;
-						case Instruction::Operation::OPCODE_CRS:
+						case Shader::OPCODE_CRS:
 							if(mask & 0x1)
 							{
 								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x6);
@@ -536,7 +539,7 @@
 								interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x3);
 							}
 							break;
-						case Instruction::Operation::OPCODE_DP2ADD:
+						case Shader::OPCODE_DP2ADD:
 							if(argument == 0 || argument == 1)
 							{
 								interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x3);
@@ -552,81 +555,81 @@
 								interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
 							}
 							break;
-						case Instruction::Operation::OPCODE_DP3:
+						case Shader::OPCODE_DP3:
 							interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7);
 							interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7);
 							interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7);
 							interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7);
 							break;
-						case Instruction::Operation::OPCODE_DP4:
+						case Shader::OPCODE_DP4:
 							interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
 							interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
 							interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
 							interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
 							break;
-						case Instruction::Operation::OPCODE_SINCOS:
-						case Instruction::Operation::OPCODE_EXP:
-						case Instruction::Operation::OPCODE_LOG:
-						case Instruction::Operation::OPCODE_POW:
-						case Instruction::Operation::OPCODE_RCP:
-						case Instruction::Operation::OPCODE_RSQ:
+						case Shader::OPCODE_SINCOS:
+						case Shader::OPCODE_EXP2X:
+						case Shader::OPCODE_LOG2X:
+						case Shader::OPCODE_POWX:
+						case Shader::OPCODE_RCPX:
+						case Shader::OPCODE_RSQX:
 							interpolant[index][0] |= swizzleContainsComponent(swizzle, 0);
 							interpolant[index][1] |= swizzleContainsComponent(swizzle, 1);
 							interpolant[index][2] |= swizzleContainsComponent(swizzle, 2);
 							interpolant[index][3] |= swizzleContainsComponent(swizzle, 3);
 							break;
-						case Instruction::Operation::OPCODE_NRM:
+						case Shader::OPCODE_NRM3:
 							interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, 0x7 | mask);
 							interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, 0x7 | mask);
 							interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, 0x7 | mask);
 							interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, 0x7 | mask);
 							break;
-						case Instruction::Operation::OPCODE_MOV:
-						case Instruction::Operation::OPCODE_ADD:
-						case Instruction::Operation::OPCODE_SUB:
-						case Instruction::Operation::OPCODE_MUL:
-						case Instruction::Operation::OPCODE_MAD:
-						case Instruction::Operation::OPCODE_ABS:
-						case Instruction::Operation::OPCODE_CMP:
-						case Instruction::Operation::OPCODE_CND:
-						case Instruction::Operation::OPCODE_FRC:
-						case Instruction::Operation::OPCODE_LRP:
-						case Instruction::Operation::OPCODE_MAX:
-						case Instruction::Operation::OPCODE_MIN:
-						case Instruction::Operation::OPCODE_SETP:
-						case Instruction::Operation::OPCODE_BREAKC:
-						case Instruction::Operation::OPCODE_DSX:
-						case Instruction::Operation::OPCODE_DSY:
+						case Shader::OPCODE_MOV:
+						case Shader::OPCODE_ADD:
+						case Shader::OPCODE_SUB:
+						case Shader::OPCODE_MUL:
+						case Shader::OPCODE_MAD:
+						case Shader::OPCODE_ABS:
+						case Shader::OPCODE_CMP0:
+						case Shader::OPCODE_CND:
+						case Shader::OPCODE_FRC:
+						case Shader::OPCODE_LRP:
+						case Shader::OPCODE_MAX:
+						case Shader::OPCODE_MIN:
+						case Shader::OPCODE_CMP:
+						case Shader::OPCODE_BREAKC:
+						case Shader::OPCODE_DFDX:
+						case Shader::OPCODE_DFDY:
 							interpolant[index][0] |= swizzleContainsComponentMasked(swizzle, 0, mask);
 							interpolant[index][1] |= swizzleContainsComponentMasked(swizzle, 1, mask);
 							interpolant[index][2] |= swizzleContainsComponentMasked(swizzle, 2, mask);
 							interpolant[index][3] |= swizzleContainsComponentMasked(swizzle, 3, mask);
 							break;
-						case Instruction::Operation::OPCODE_TEXCOORD:
+						case Shader::OPCODE_TEXCOORD:
 							interpolant[index][0] = true;
 							interpolant[index][1] = true;
 							interpolant[index][2] = true;
 							interpolant[index][3] = true;
 							break;
-						case Instruction::Operation::OPCODE_TEXDP3:
-						case Instruction::Operation::OPCODE_TEXDP3TEX:
-						case Instruction::Operation::OPCODE_TEXM3X2PAD:
-						case Instruction::Operation::OPCODE_TEXM3X3PAD:
-						case Instruction::Operation::OPCODE_TEXM3X2TEX:
-						case Instruction::Operation::OPCODE_TEXM3X3SPEC:
-						case Instruction::Operation::OPCODE_TEXM3X3VSPEC:
-						case Instruction::Operation::OPCODE_TEXBEM:
-						case Instruction::Operation::OPCODE_TEXBEML:
-						case Instruction::Operation::OPCODE_TEXM3X2DEPTH:
-						case Instruction::Operation::OPCODE_TEXM3X3:
-						case Instruction::Operation::OPCODE_TEXM3X3TEX:
+						case Shader::OPCODE_TEXDP3:
+						case Shader::OPCODE_TEXDP3TEX:
+						case Shader::OPCODE_TEXM3X2PAD:
+						case Shader::OPCODE_TEXM3X3PAD:
+						case Shader::OPCODE_TEXM3X2TEX:
+						case Shader::OPCODE_TEXM3X3SPEC:
+						case Shader::OPCODE_TEXM3X3VSPEC:
+						case Shader::OPCODE_TEXBEM:
+						case Shader::OPCODE_TEXBEML:
+						case Shader::OPCODE_TEXM3X2DEPTH:
+						case Shader::OPCODE_TEXM3X3:
+						case Shader::OPCODE_TEXM3X3TEX:
 							interpolant[index][0] = true;
 							interpolant[index][1] = true;
 							interpolant[index][2] = true;
 							break;
-						case Instruction::Operation::OPCODE_TEXREG2AR:
-						case Instruction::Operation::OPCODE_TEXREG2GB:
-						case Instruction::Operation::OPCODE_TEXREG2RGB:
+						case Shader::OPCODE_TEXREG2AR:
+						case Shader::OPCODE_TEXREG2GB:
+						case Shader::OPCODE_TEXREG2RGB:
 							break;
 						default:
 						//	ASSERT(false);   // Refine component usage
@@ -652,40 +655,25 @@
 		}
 		else   // Shader Model 3.0 input declaration; v# indexable
 		{
-			for(int i = 0; i < length; i++)
+			for(unsigned int i = 0; i < instruction.size(); i++)
 			{
-				if(instruction[i]->getOpcode() == ShaderOperation::OPCODE_DCL)
+				if(instruction[i]->opcode == Shader::OPCODE_DCL)
 				{
-					if(instruction[i]->getDestinationParameter().type == ShaderParameter::PARAMETER_INPUT)
+					if(instruction[i]->dst.type == Shader::PARAMETER_INPUT)
 					{
-						unsigned char usage = instruction[i]->getUsage();
-						unsigned char index = instruction[i]->getUsageIndex();
-						unsigned char mask = instruction[i]->getDestinationParameter().mask;
-						unsigned char reg = instruction[i]->getDestinationParameter().index;
+						unsigned char usage = instruction[i]->usage;
+						unsigned char index = instruction[i]->usageIndex;
+						unsigned char mask = instruction[i]->dst.mask;
+						unsigned char reg = instruction[i]->dst.index;
 
-						if(mask & 0x01)
-						{
-							semantic[reg][0] = Semantic(usage, index);
-						}
-
-						if(mask & 0x02)
-						{
-							semantic[reg][1] = Semantic(usage, index);
-						}
-
-						if(mask & 0x04)
-						{
-							semantic[reg][2] = Semantic(usage, index);
-						}
-
-						if(mask & 0x08)
-						{
-							semantic[reg][3] = Semantic(usage, index);
-						}
+						if(mask & 0x01)	semantic[reg][0] = Semantic(usage, index);
+						if(mask & 0x02) semantic[reg][1] = Semantic(usage, index);
+						if(mask & 0x04) semantic[reg][2] = Semantic(usage, index);
+						if(mask & 0x08)	semantic[reg][3] = Semantic(usage, index);
 					}
-					else if(instruction[i]->getDestinationParameter().type == ShaderParameter::PARAMETER_MISCTYPE)
+					else if(instruction[i]->dst.type == Shader::PARAMETER_MISCTYPE)
 					{
-						unsigned char index = instruction[i]->getDestinationParameter().index;
+						unsigned char index = instruction[i]->dst.index;
 
 						if(index == 0)
 						{
@@ -703,19 +691,19 @@
 
 		if(version >= 0x0200)
 		{
-			for(int i = 0; i < length; i++)
+			for(unsigned int i = 0; i < instruction.size(); i++)
 			{
-				if(instruction[i]->getOpcode() == ShaderOperation::OPCODE_DCL)
+				if(instruction[i]->opcode == Shader::OPCODE_DCL)
 				{
-					bool centroid = instruction[i]->getDestinationParameter().centroid;
-					unsigned char reg = instruction[i]->getDestinationParameter().index;
+					bool centroid = instruction[i]->dst.centroid;
+					unsigned char reg = instruction[i]->dst.index;
 
-					switch(instruction[i]->getDestinationParameter().type)
+					switch(instruction[i]->dst.type)
 					{
-					case ShaderParameter::PARAMETER_INPUT:
+					case Shader::PARAMETER_INPUT:
 						semantic[reg][0].centroid = centroid;
 						break;
-					case ShaderParameter::PARAMETER_TEXTURE:
+					case Shader::PARAMETER_TEXTURE:
 						semantic[2 + reg][0].centroid = centroid;
 						break;
 					}
diff --git a/src/Shader/PixelShader.hpp b/src/Shader/PixelShader.hpp
index a0f9b99..83ca253 100644
--- a/src/Shader/PixelShader.hpp
+++ b/src/Shader/PixelShader.hpp
@@ -1,54 +1,53 @@
-// SwiftShader Software Renderer
-//
-// Copyright(c) 2005-2011 TransGaming Inc.
-//
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
-//
-
-#ifndef sw_PixelShader_hpp
-#define sw_PixelShader_hpp
-
-#include "Shader.hpp"
-
-namespace sw
-{
-	class PixelShader : public Shader
-	{
-	public:
-		PixelShader(const unsigned long *token);
-
-		virtual ~PixelShader();
-
-		static int validate(const unsigned long *const token);   // Returns number of instructions if valid
-		bool depthOverride() const;
-		bool containsTexkill() const;
-		bool containsCentroid() const;
-		bool usesDiffuse(int component) const;
-		bool usesSpecular(int component) const;
-		bool usesTexture(int coordinate, int component) const;
-
-		Semantic semantic[10][4];   // FIXME: Private
-
-		bool vPosDeclared;
-		bool vFaceDeclared;
-
-	private:
-		void parse(const unsigned long *token);
-
-		void analyzeZOverride();
-		void analyzeTexkill();
-		void analyzeInterpolants();
-
-		bool zOverride;
-		bool texkill;
-		bool centroid;
-	};
-
-	typedef PixelShader::Instruction PixelShaderInstruction;
-}
-
-#endif   // sw_PixelShader_hpp
+// SwiftShader Software Renderer

+//

+// Copyright(c) 2005-2012 TransGaming Inc.

+//

+// All rights reserved. No part of this software may be copied, distributed, transmitted,

+// transcribed, stored in a retrieval system, translated into any human or computer

+// language by any means, or disclosed to third parties without the explicit written

+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express

+// or implied, including but not limited to any patent rights, are granted to you.

+//

+

+#ifndef sw_PixelShader_hpp

+#define sw_PixelShader_hpp

+

+#include "Shader.hpp"

+

+namespace sw

+{

+	class PixelShader : public Shader   // Pixel shader: instruction list plus the analysis results derived from it

+	{

+	public:

+		explicit PixelShader(const PixelShader *ps = 0);   // Empty shader, or a copy of 'ps' when non-null. NOTE: a bare 0/NULL argument is ambiguous with the token overload below

+		explicit PixelShader(const unsigned long *token);   // Parses a token stream and analyzes it

+

+		virtual ~PixelShader();

+

+		static int validate(const unsigned long *const token);   // Returns number of instructions if valid

+		bool depthOverride() const;

+		bool containsKill() const;   // Returns the 'kill' flag (TEXKILL or DISCARD present)

+		bool containsCentroid() const;

+		bool usesDiffuse(int component) const;

+		bool usesSpecular(int component) const;

+		bool usesTexture(int coordinate, int component) const;

+

+		virtual void analyze();   // Runs all analysis passes; the constructors call it, where the call resolves to PixelShader::analyze()

+

+		Semantic semantic[10][4];   // FIXME: Private. Indexed [register][component]; below version 0x0300, rows 0-1 default to COLOR 0/1 and rows 2-9 to TEXCOORD 0-7

+

+		bool vPosDeclared;

+		bool vFaceDeclared;

+

+	private:

+		void analyzeZOverride();   // Sets zOverride

+		void analyzeKill();   // Sets kill

+		void analyzeInterpolants();   // Fills in semantic[][]

+

+		bool zOverride;   // True when TEXM3X2DEPTH or TEXDEPTH is used, or an instruction writes a DEPTHOUT destination

+		bool kill;   // True when a TEXKILL or DISCARD instruction is present

+		bool centroid;   // Initialized to false by both constructors

+	};

+}

+

+#endif   // sw_PixelShader_hpp

diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index b7055c4..dbd98b8 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -20,7 +20,7 @@
 	{
 	}
 
-	void SamplerCore::sampleTexture(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool bias, bool fixed12, bool gradients, bool lodProvided)
+	void SamplerCore::sampleTexture(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool bias, bool fixed12, bool gradients, bool lodProvided)
 	{
 		#if PERF_PROFILE
 			AddAtomic(Pointer<Long>(&profiler.texOperations), Long(4));
@@ -40,17 +40,17 @@
 
 		if(state.textureType == TEXTURE_NULL)
 		{
-			c.r = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-			c.g = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-			c.b = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			c.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			c.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+			c.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 
 			if(fixed12)   // FIXME: Convert to fixed12 at higher level, when required
 			{
-				c.a = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+				c.w = Short4(0x1000, 0x1000, 0x1000, 0x1000);
 			}
 			else
 			{
-				c.a = Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
+				c.w = Short4((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);   // FIXME
 			}
 		}
 		else
@@ -87,8 +87,8 @@
 
 			if(cubeTexture)
 			{
-				uuuu += Float4(0.5f, 0.5f, 0.5f, 0.5f);
-				vvvv += Float4(0.5f, 0.5f, 0.5f, 0.5f);
+				uuuu += Float4(0.5f);
+				vvvv += Float4(0.5f);
 			}
 
 			if(!hasFloatTexture())
@@ -97,7 +97,7 @@
 			}
 			else
 			{
-				Color4f cf;
+				Vector4f cf;
 
 				sampleFloatFilter(texture, cf, uuuu, vvvv, wwww, lod, anisotropy, uDelta, vDelta, face, lodProvided);
 
@@ -144,40 +144,40 @@
 				case FORMAT_G8R8:
 				case FORMAT_G16R16:
 				case FORMAT_A16B16G16R16:
-					if(componentCount < 2) c.g = Short4(0x1000, 0x1000, 0x1000, 0x1000);
-					if(componentCount < 3) c.b = Short4(0x1000, 0x1000, 0x1000, 0x1000);
-					if(componentCount < 4) c.a = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+					if(componentCount < 2) c.y = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+					if(componentCount < 3) c.z = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+					if(componentCount < 4) c.w = Short4(0x1000, 0x1000, 0x1000, 0x1000);
 					break;
 				case FORMAT_A8:
-					c.a = c.r;
-					c.r = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-					c.g = Short4(0x0000, 0x0000, 0x0000, 0x0000);
-					c.b = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+					c.w = c.x;
+					c.x = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+					c.y = Short4(0x0000, 0x0000, 0x0000, 0x0000);
+					c.z = Short4(0x0000, 0x0000, 0x0000, 0x0000);
 					break;
 				case FORMAT_L8:
 				case FORMAT_L16:
-					c.g = c.r;
-					c.b = c.r;
-					c.a = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+					c.y = c.x;
+					c.z = c.x;
+					c.w = Short4(0x1000, 0x1000, 0x1000, 0x1000);
 					break;
 				case FORMAT_A8L8:
-					c.a = c.g;
-					c.g = c.r;
-					c.b = c.r;
+					c.w = c.y;
+					c.y = c.x;
+					c.z = c.x;
 					break;
 				case FORMAT_R32F:
-					c.g = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+					c.y = Short4(0x1000, 0x1000, 0x1000, 0x1000);
 				case FORMAT_G32R32F:
-					c.b = Short4(0x1000, 0x1000, 0x1000, 0x1000);
-					c.a = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+					c.z = Short4(0x1000, 0x1000, 0x1000, 0x1000);
+					c.w = Short4(0x1000, 0x1000, 0x1000, 0x1000);
 				case FORMAT_A32B32G32R32F:
 					break;
 				case FORMAT_D32F_LOCKABLE:
 				case FORMAT_D32F_TEXTURE:
 				case FORMAT_D32F_SHADOW:
-					c.g = c.r;
-					c.b = c.r;
-					c.a = c.r;
+					c.y = c.x;
+					c.z = c.x;
+					c.w = c.x;
 					break;
 				default:
 					ASSERT(false);
@@ -186,7 +186,7 @@
 		}
 	}
 
-	void SamplerCore::sampleTexture(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool bias, bool gradients, bool lodProvided)
+	void SamplerCore::sampleTexture(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool bias, bool gradients, bool lodProvided)
 	{
 		#if PERF_PROFILE
 			AddAtomic(Pointer<Long>(&profiler.texOperations), Long(4));
@@ -202,10 +202,10 @@
 
 		if(state.textureType == TEXTURE_NULL)
 		{
-			c.r = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			c.g = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			c.b = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			c.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+			c.x = Float4(0.0f);
+			c.y = Float4(0.0f);
+			c.z = Float4(0.0f);
+			c.w = Float4(1.0f);
 		}
 		else
 		{
@@ -247,15 +247,15 @@
 
 				if(cubeTexture)
 				{
-					uuuu += Float4(0.5f, 0.5f, 0.5f, 0.5f);
-					vvvv += Float4(0.5f, 0.5f, 0.5f, 0.5f);
+					uuuu += Float4(0.5f);
+					vvvv += Float4(0.5f);
 				}
 
 				sampleFloatFilter(texture, c, uuuu, vvvv, wwww, lod, anisotropy, uDelta, vDelta, face, lodProvided);
 			}
 			else
 			{
-				Color4i ci;
+				Vector4i ci;
 
 				sampleTexture(texture, ci, u, v, w, q, dsx, dsy, bias, false, gradients, lodProvided);
 
@@ -298,40 +298,40 @@
 				case FORMAT_G8R8:
 				case FORMAT_G16R16:
 				case FORMAT_A16B16G16R16:
-					if(componentCount < 2) c.g = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-					if(componentCount < 3) c.b = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-					if(componentCount < 4) c.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					if(componentCount < 2) c.y = Float4(1.0f);
+					if(componentCount < 3) c.z = Float4(1.0f);
+					if(componentCount < 4) c.w = Float4(1.0f);
 					break;
 				case FORMAT_A8:
-					c.a = c.r;
-					c.r = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-					c.g = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-					c.b = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+					c.w = c.x;
+					c.x = Float4(0.0f);
+					c.y = Float4(0.0f);
+					c.z = Float4(0.0f);
 					break;
 				case FORMAT_L8:
 				case FORMAT_L16:
-					c.g = c.r;
-					c.b = c.r;
-					c.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					c.y = c.x;
+					c.z = c.x;
+					c.w = Float4(1.0f);
 					break;
 				case FORMAT_A8L8:
-					c.a = c.g;
-					c.g = c.r;
-					c.b = c.r;
+					c.w = c.y;
+					c.y = c.x;
+					c.z = c.x;
 					break;
 				case FORMAT_R32F:
-					c.g = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					c.y = Float4(1.0f);
 				case FORMAT_G32R32F:
-					c.b = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-					c.a = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					c.z = Float4(1.0f);
+					c.w = Float4(1.0f);
 				case FORMAT_A32B32G32R32F:
 					break;
 				case FORMAT_D32F_LOCKABLE:
 				case FORMAT_D32F_TEXTURE:
 				case FORMAT_D32F_SHADOW:
-					c.g = c.r;
-					c.b = c.r;
-					c.a = c.r;
+					c.y = c.x;
+					c.z = c.x;
+					c.w = c.x;
 					break;
 				default:
 					ASSERT(false);
@@ -377,7 +377,7 @@
 		return uvw;
 	}
 
-	void SamplerCore::sampleFilter(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided)
+	void SamplerCore::sampleFilter(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided)
 	{
 		bool volumeTexture = state.textureType == TEXTURE_3D;
 
@@ -385,7 +385,7 @@
 
 		if(state.mipmapFilter > MIPMAP_POINT)
 		{
-			Color4i cc;
+			Vector4i cc;
 
 			sampleAniso(texture, cc, u, v, w, lod, anisotropy, uDelta, vDelta, face, true, lodProvided);
 
@@ -394,28 +394,28 @@
 			UShort4 utri = UShort4(Float4(lod));   // FIXME: Optimize
 			Short4 stri = utri >> 1;   // FIXME: Optimize
 
-			if(hasUnsignedTextureComponent(0)) cc.r = MulHigh(As<UShort4>(cc.r), utri); else cc.r = MulHigh(cc.r, stri);
-			if(hasUnsignedTextureComponent(1)) cc.g = MulHigh(As<UShort4>(cc.g), utri); else cc.g = MulHigh(cc.g, stri);
-			if(hasUnsignedTextureComponent(2)) cc.b = MulHigh(As<UShort4>(cc.b), utri); else cc.b = MulHigh(cc.b, stri);
-			if(hasUnsignedTextureComponent(3)) cc.a = MulHigh(As<UShort4>(cc.a), utri); else cc.a = MulHigh(cc.a, stri);
+			if(hasUnsignedTextureComponent(0)) cc.x = MulHigh(As<UShort4>(cc.x), utri); else cc.x = MulHigh(cc.x, stri);
+			if(hasUnsignedTextureComponent(1)) cc.y = MulHigh(As<UShort4>(cc.y), utri); else cc.y = MulHigh(cc.y, stri);
+			if(hasUnsignedTextureComponent(2)) cc.z = MulHigh(As<UShort4>(cc.z), utri); else cc.z = MulHigh(cc.z, stri);
+			if(hasUnsignedTextureComponent(3)) cc.w = MulHigh(As<UShort4>(cc.w), utri); else cc.w = MulHigh(cc.w, stri);
 
 			utri = ~utri;
 			stri = Short4(0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF) - stri;
 
-			if(hasUnsignedTextureComponent(0)) c.r = MulHigh(As<UShort4>(c.r), utri); else c.r = MulHigh(c.r, stri);
-			if(hasUnsignedTextureComponent(1)) c.g = MulHigh(As<UShort4>(c.g), utri); else c.g = MulHigh(c.g, stri);
-			if(hasUnsignedTextureComponent(2)) c.b = MulHigh(As<UShort4>(c.b), utri); else c.b = MulHigh(c.b, stri);
-			if(hasUnsignedTextureComponent(3)) c.a = MulHigh(As<UShort4>(c.a), utri); else c.a = MulHigh(c.a, stri);
+			if(hasUnsignedTextureComponent(0)) c.x = MulHigh(As<UShort4>(c.x), utri); else c.x = MulHigh(c.x, stri);
+			if(hasUnsignedTextureComponent(1)) c.y = MulHigh(As<UShort4>(c.y), utri); else c.y = MulHigh(c.y, stri);
+			if(hasUnsignedTextureComponent(2)) c.z = MulHigh(As<UShort4>(c.z), utri); else c.z = MulHigh(c.z, stri);
+			if(hasUnsignedTextureComponent(3)) c.w = MulHigh(As<UShort4>(c.w), utri); else c.w = MulHigh(c.w, stri);
 			
-			c.r += cc.r;
-			c.g += cc.g;
-			c.b += cc.b;
-			c.a += cc.a;
+			c.x += cc.x;
+			c.y += cc.y;
+			c.z += cc.z;
+			c.w += cc.w;
 			
-			if(!hasUnsignedTextureComponent(0)) c.r += c.r;
-			if(!hasUnsignedTextureComponent(1)) c.g += c.g;
-			if(!hasUnsignedTextureComponent(2)) c.b += c.b;
-			if(!hasUnsignedTextureComponent(3)) c.a += c.a;
+			if(!hasUnsignedTextureComponent(0)) c.x += c.x;
+			if(!hasUnsignedTextureComponent(1)) c.y += c.y;
+			if(!hasUnsignedTextureComponent(2)) c.z += c.z;
+			if(!hasUnsignedTextureComponent(3)) c.w += c.w;
 		}
 
 		Short4 borderMask;
@@ -468,14 +468,14 @@
 		{
 			Short4 b;
 
-			c.r = borderMask & c.r | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[0])) >> (hasUnsignedTextureComponent(0) ? 0 : 1));
-			c.g = borderMask & c.g | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[1])) >> (hasUnsignedTextureComponent(1) ? 0 : 1));
-			c.b = borderMask & c.b | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[2])) >> (hasUnsignedTextureComponent(2) ? 0 : 1));
-			c.a = borderMask & c.a | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[3])) >> (hasUnsignedTextureComponent(3) ? 0 : 1));
+			c.x = borderMask & c.x | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[0])) >> (hasUnsignedTextureComponent(0) ? 0 : 1));
+			c.y = borderMask & c.y | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[1])) >> (hasUnsignedTextureComponent(1) ? 0 : 1));
+			c.z = borderMask & c.z | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[2])) >> (hasUnsignedTextureComponent(2) ? 0 : 1));
+			c.w = borderMask & c.w | ~borderMask & (*Pointer<Short4>(texture + OFFSET(Texture,borderColor4[3])) >> (hasUnsignedTextureComponent(3) ? 0 : 1));
 		}
 	}
 
-	void SamplerCore::sampleAniso(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided)
+	void SamplerCore::sampleAniso(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided)
 	{
 		if(state.textureFilter != FILTER_ANISOTROPIC || lodProvided)
 		{
@@ -485,12 +485,12 @@
 		{
 			Int a = RoundInt(anisotropy);
 
-			Color4i cSum;
+			Vector4i cSum;
 
-			cSum.r = Short4(0, 0, 0, 0);
-			cSum.g = Short4(0, 0, 0, 0);
-			cSum.b = Short4(0, 0, 0, 0);
-			cSum.a = Short4(0, 0, 0, 0);
+			cSum.x = Short4(0, 0, 0, 0);
+			cSum.y = Short4(0, 0, 0, 0);
+			cSum.z = Short4(0, 0, 0, 0);
+			cSum.w = Short4(0, 0, 0, 0);
 
 			Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a);
 			Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a);
@@ -515,23 +515,23 @@
 				u0 += du;
 				v0 += dv;
 
-				if(hasUnsignedTextureComponent(0)) cSum.r += As<Short4>(MulHigh(As<UShort4>(c.r), cw)); else cSum.r += MulHigh(c.r, sw);
-				if(hasUnsignedTextureComponent(1)) cSum.g += As<Short4>(MulHigh(As<UShort4>(c.g), cw)); else cSum.g += MulHigh(c.g, sw);
-				if(hasUnsignedTextureComponent(2)) cSum.b += As<Short4>(MulHigh(As<UShort4>(c.b), cw)); else cSum.b += MulHigh(c.b, sw);
-				if(hasUnsignedTextureComponent(3)) cSum.a += As<Short4>(MulHigh(As<UShort4>(c.a), cw)); else cSum.a += MulHigh(c.a, sw);
+				if(hasUnsignedTextureComponent(0)) cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw)); else cSum.x += MulHigh(c.x, sw);
+				if(hasUnsignedTextureComponent(1)) cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw)); else cSum.y += MulHigh(c.y, sw);
+				if(hasUnsignedTextureComponent(2)) cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw)); else cSum.z += MulHigh(c.z, sw);
+				if(hasUnsignedTextureComponent(3)) cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw)); else cSum.w += MulHigh(c.w, sw);
 
 				i++;
 			}
 			Until(i >= a)
 
-			if(hasUnsignedTextureComponent(0)) c.r = cSum.r; else c.r = AddSat(cSum.r, cSum.r);
-			if(hasUnsignedTextureComponent(1)) c.g = cSum.g; else c.g = AddSat(cSum.g, cSum.g);
-			if(hasUnsignedTextureComponent(2)) c.b = cSum.b; else c.b = AddSat(cSum.b, cSum.b);
-			if(hasUnsignedTextureComponent(3)) c.a = cSum.a; else c.a = AddSat(cSum.a, cSum.a);
+			if(hasUnsignedTextureComponent(0)) c.x = cSum.x; else c.x = AddSat(cSum.x, cSum.x);
+			if(hasUnsignedTextureComponent(1)) c.y = cSum.y; else c.y = AddSat(cSum.y, cSum.y);
+			if(hasUnsignedTextureComponent(2)) c.z = cSum.z; else c.z = AddSat(cSum.z, cSum.z);
+			if(hasUnsignedTextureComponent(3)) c.w = cSum.w; else c.w = AddSat(cSum.w, cSum.w);
 		}
 	}
 
-	void SamplerCore::sampleQuad(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD)
+	void SamplerCore::sampleQuad(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD)
 	{
 		if(state.textureType != TEXTURE_3D)
 		{
@@ -543,7 +543,7 @@
 		}
 	}
 
-	void SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float &lod, Int face[4], bool secondLOD)
+	void SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float &lod, Int face[4], bool secondLOD)
 	{
 		int componentCount = textureComponentCount();
 		bool gather = state.textureFilter == FILTER_GATHER;
@@ -565,10 +565,10 @@
 		}
 		else
 		{
-			Color4i c0;
-			Color4i c1;
-			Color4i c2;
-			Color4i c3;
+			Vector4i c0;
+			Vector4i c1;
+			Vector4i c2;
+			Vector4i c3;
 
 			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), (AddressingMode)state.addressingModeU == ADDRESSING_WRAP, gather ? 0 : -1);
 			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), (AddressingMode)state.addressingModeV == ADDRESSING_WRAP, gather ? 0 : -1);
@@ -624,29 +624,29 @@
 				{
 					if(has16bitTexture() && hasUnsignedTextureComponent(0))
 					{
-						c0.r = As<UShort4>(c0.r) - MulHigh(As<UShort4>(c0.r), f0u) + MulHigh(As<UShort4>(c1.r), f0u);
-						c2.r = As<UShort4>(c2.r) - MulHigh(As<UShort4>(c2.r), f0u) + MulHigh(As<UShort4>(c3.r), f0u);
-						c.r  = As<UShort4>(c0.r) - MulHigh(As<UShort4>(c0.r), f0v) + MulHigh(As<UShort4>(c2.r), f0v);
+						c0.x = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0u) + MulHigh(As<UShort4>(c1.x), f0u);
+						c2.x = As<UShort4>(c2.x) - MulHigh(As<UShort4>(c2.x), f0u) + MulHigh(As<UShort4>(c3.x), f0u);
+						c.x  = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0v) + MulHigh(As<UShort4>(c2.x), f0v);
 					}
 					else
 					{
 						if(hasUnsignedTextureComponent(0))
 						{
-							c0.r = MulHigh(As<UShort4>(c0.r), As<UShort4>(f1u1v));
-							c1.r = MulHigh(As<UShort4>(c1.r), As<UShort4>(f0u1v));
-							c2.r = MulHigh(As<UShort4>(c2.r), As<UShort4>(f1u0v));
-							c3.r = MulHigh(As<UShort4>(c3.r), As<UShort4>(f0u0v));
+							c0.x = MulHigh(As<UShort4>(c0.x), As<UShort4>(f1u1v));
+							c1.x = MulHigh(As<UShort4>(c1.x), As<UShort4>(f0u1v));
+							c2.x = MulHigh(As<UShort4>(c2.x), As<UShort4>(f1u0v));
+							c3.x = MulHigh(As<UShort4>(c3.x), As<UShort4>(f0u0v));
 						}
 						else
 						{
-							c0.r = MulHigh(c0.r, f1u1vs);
-							c1.r = MulHigh(c1.r, f0u1vs);
-							c2.r = MulHigh(c2.r, f1u0vs);
-							c3.r = MulHigh(c3.r, f0u0vs);
+							c0.x = MulHigh(c0.x, f1u1vs);
+							c1.x = MulHigh(c1.x, f0u1vs);
+							c2.x = MulHigh(c2.x, f1u0vs);
+							c3.x = MulHigh(c3.x, f0u0vs);
 						}
 
-						c.r = (c0.r + c1.r) + (c2.r + c3.r);
-						if(!hasUnsignedTextureComponent(0)) c.r = AddSat(c.r, c.r);   // Correct for signed fractions
+						c.x = (c0.x + c1.x) + (c2.x + c3.x);
+						if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x);   // Correct for signed fractions
 					}
 				}
 
@@ -654,29 +654,29 @@
 				{
 					if(has16bitTexture() && hasUnsignedTextureComponent(1))
 					{
-						c0.g = As<UShort4>(c0.g) - MulHigh(As<UShort4>(c0.g), f0u) + MulHigh(As<UShort4>(c1.g), f0u);
-						c2.g = As<UShort4>(c2.g) - MulHigh(As<UShort4>(c2.g), f0u) + MulHigh(As<UShort4>(c3.g), f0u);
-						c.g  = As<UShort4>(c0.g) - MulHigh(As<UShort4>(c0.g), f0v) + MulHigh(As<UShort4>(c2.g), f0v);
+						c0.y = As<UShort4>(c0.y) - MulHigh(As<UShort4>(c0.y), f0u) + MulHigh(As<UShort4>(c1.y), f0u);
+						c2.y = As<UShort4>(c2.y) - MulHigh(As<UShort4>(c2.y), f0u) + MulHigh(As<UShort4>(c3.y), f0u);
+						c.y  = As<UShort4>(c0.y) - MulHigh(As<UShort4>(c0.y), f0v) + MulHigh(As<UShort4>(c2.y), f0v);
 					}
 					else
 					{
 						if(hasUnsignedTextureComponent(1))
 						{
-							c0.g = MulHigh(As<UShort4>(c0.g), As<UShort4>(f1u1v));
-							c1.g = MulHigh(As<UShort4>(c1.g), As<UShort4>(f0u1v));
-							c2.g = MulHigh(As<UShort4>(c2.g), As<UShort4>(f1u0v));
-							c3.g = MulHigh(As<UShort4>(c3.g), As<UShort4>(f0u0v));
+							c0.y = MulHigh(As<UShort4>(c0.y), As<UShort4>(f1u1v));
+							c1.y = MulHigh(As<UShort4>(c1.y), As<UShort4>(f0u1v));
+							c2.y = MulHigh(As<UShort4>(c2.y), As<UShort4>(f1u0v));
+							c3.y = MulHigh(As<UShort4>(c3.y), As<UShort4>(f0u0v));
 						}
 						else
 						{
-							c0.g = MulHigh(c0.g, f1u1vs);
-							c1.g = MulHigh(c1.g, f0u1vs);
-							c2.g = MulHigh(c2.g, f1u0vs);
-							c3.g = MulHigh(c3.g, f0u0vs);
+							c0.y = MulHigh(c0.y, f1u1vs);
+							c1.y = MulHigh(c1.y, f0u1vs);
+							c2.y = MulHigh(c2.y, f1u0vs);
+							c3.y = MulHigh(c3.y, f0u0vs);
 						}
 
-						c.g = (c0.g + c1.g) + (c2.g + c3.g);
-						if(!hasUnsignedTextureComponent(1)) c.g = AddSat(c.g, c.g);   // Correct for signed fractions
+						c.y = (c0.y + c1.y) + (c2.y + c3.y);
+						if(!hasUnsignedTextureComponent(1)) c.y = AddSat(c.y, c.y);   // Correct for signed fractions
 					}
 				}
 
@@ -684,29 +684,29 @@
 				{
 					if(has16bitTexture() && hasUnsignedTextureComponent(2))
 					{
-						c0.b = As<UShort4>(c0.b) - MulHigh(As<UShort4>(c0.b), f0u) + MulHigh(As<UShort4>(c1.b), f0u);
-						c2.b = As<UShort4>(c2.b) - MulHigh(As<UShort4>(c2.b), f0u) + MulHigh(As<UShort4>(c3.b), f0u);
-						c.b  = As<UShort4>(c0.b) - MulHigh(As<UShort4>(c0.b), f0v) + MulHigh(As<UShort4>(c2.b), f0v);
+						c0.z = As<UShort4>(c0.z) - MulHigh(As<UShort4>(c0.z), f0u) + MulHigh(As<UShort4>(c1.z), f0u);
+						c2.z = As<UShort4>(c2.z) - MulHigh(As<UShort4>(c2.z), f0u) + MulHigh(As<UShort4>(c3.z), f0u);
+						c.z  = As<UShort4>(c0.z) - MulHigh(As<UShort4>(c0.z), f0v) + MulHigh(As<UShort4>(c2.z), f0v);
 					}
 					else
 					{
 						if(hasUnsignedTextureComponent(2))
 						{
-							c0.b = MulHigh(As<UShort4>(c0.b), As<UShort4>(f1u1v));
-							c1.b = MulHigh(As<UShort4>(c1.b), As<UShort4>(f0u1v));
-							c2.b = MulHigh(As<UShort4>(c2.b), As<UShort4>(f1u0v));
-							c3.b = MulHigh(As<UShort4>(c3.b), As<UShort4>(f0u0v));
+							c0.z = MulHigh(As<UShort4>(c0.z), As<UShort4>(f1u1v));
+							c1.z = MulHigh(As<UShort4>(c1.z), As<UShort4>(f0u1v));
+							c2.z = MulHigh(As<UShort4>(c2.z), As<UShort4>(f1u0v));
+							c3.z = MulHigh(As<UShort4>(c3.z), As<UShort4>(f0u0v));
 						}
 						else
 						{
-							c0.b = MulHigh(c0.b, f1u1vs);
-							c1.b = MulHigh(c1.b, f0u1vs);
-							c2.b = MulHigh(c2.b, f1u0vs);
-							c3.b = MulHigh(c3.b, f0u0vs);
+							c0.z = MulHigh(c0.z, f1u1vs);
+							c1.z = MulHigh(c1.z, f0u1vs);
+							c2.z = MulHigh(c2.z, f1u0vs);
+							c3.z = MulHigh(c3.z, f0u0vs);
 						}
 
-						c.b = (c0.b + c1.b) + (c2.b + c3.b);
-						if(!hasUnsignedTextureComponent(2)) c.b = AddSat(c.b, c.b);   // Correct for signed fractions
+						c.z = (c0.z + c1.z) + (c2.z + c3.z);
+						if(!hasUnsignedTextureComponent(2)) c.z = AddSat(c.z, c.z);   // Correct for signed fractions
 					}
 				}
 
@@ -714,43 +714,43 @@
 				{
 					if(has16bitTexture() && hasUnsignedTextureComponent(3))
 					{
-						c0.a = As<UShort4>(c0.a) - MulHigh(As<UShort4>(c0.a), f0u) + MulHigh(As<UShort4>(c1.a), f0u);
-						c2.a = As<UShort4>(c2.a) - MulHigh(As<UShort4>(c2.a), f0u) + MulHigh(As<UShort4>(c3.a), f0u);
-						c.a  = As<UShort4>(c0.a) - MulHigh(As<UShort4>(c0.a), f0v) + MulHigh(As<UShort4>(c2.a), f0v);
+						c0.w = As<UShort4>(c0.w) - MulHigh(As<UShort4>(c0.w), f0u) + MulHigh(As<UShort4>(c1.w), f0u);
+						c2.w = As<UShort4>(c2.w) - MulHigh(As<UShort4>(c2.w), f0u) + MulHigh(As<UShort4>(c3.w), f0u);
+						c.w  = As<UShort4>(c0.w) - MulHigh(As<UShort4>(c0.w), f0v) + MulHigh(As<UShort4>(c2.w), f0v);
 					}
 					else
 					{
 						if(hasUnsignedTextureComponent(3))
 						{
-							c0.a = MulHigh(As<UShort4>(c0.a), As<UShort4>(f1u1v));
-							c1.a = MulHigh(As<UShort4>(c1.a), As<UShort4>(f0u1v));
-							c2.a = MulHigh(As<UShort4>(c2.a), As<UShort4>(f1u0v));
-							c3.a = MulHigh(As<UShort4>(c3.a), As<UShort4>(f0u0v));
+							c0.w = MulHigh(As<UShort4>(c0.w), As<UShort4>(f1u1v));
+							c1.w = MulHigh(As<UShort4>(c1.w), As<UShort4>(f0u1v));
+							c2.w = MulHigh(As<UShort4>(c2.w), As<UShort4>(f1u0v));
+							c3.w = MulHigh(As<UShort4>(c3.w), As<UShort4>(f0u0v));
 						}
 						else
 						{
-							c0.a = MulHigh(c0.a, f1u1vs);
-							c1.a = MulHigh(c1.a, f0u1vs);
-							c2.a = MulHigh(c2.a, f1u0vs);
-							c3.a = MulHigh(c3.a, f0u0vs);
+							c0.w = MulHigh(c0.w, f1u1vs);
+							c1.w = MulHigh(c1.w, f0u1vs);
+							c2.w = MulHigh(c2.w, f1u0vs);
+							c3.w = MulHigh(c3.w, f0u0vs);
 						}
 
-						c.a = (c0.a + c1.a) + (c2.a + c3.a);
-						if(!hasUnsignedTextureComponent(3)) c.a = AddSat(c.a, c.a);   // Correct for signed fractions
+						c.w = (c0.w + c1.w) + (c2.w + c3.w);
+						if(!hasUnsignedTextureComponent(3)) c.w = AddSat(c.w, c.w);   // Correct for signed fractions
 					}
 				}
 			}
 			else
 			{
-				c.r = c1.r;
-				c.g = c2.r;
-				c.b = c3.r;
-				c.a = c0.r;
+				c.x = c1.x;
+				c.y = c2.x;
+				c.z = c3.x;
+				c.w = c0.x;
 			}
 		}
 	}
 
-	void SamplerCore::sample3D(Pointer<Byte> &texture, Color4i &c_, Float4 &u_, Float4 &v_, Float4 &w_, Float &lod, bool secondLOD)
+	void SamplerCore::sample3D(Pointer<Byte> &texture, Vector4i &c_, Float4 &u_, Float4 &v_, Float4 &w_, Float &lod, bool secondLOD)
 	{
 		int componentCount = textureComponentCount();
 
@@ -774,7 +774,7 @@
 		}
 		else
 		{
-			Color4i c[2][2][2];
+			Vector4i c[2][2][2];
 
 			Short4 u[2][2][2];
 			Short4 v[2][2][2];
@@ -863,36 +863,36 @@
 					{
 						sampleTexel(c[i][j][k], u[i][j][k], v[i][j][k], s[i][j][k], mipmap, buffer);
 
-						if(componentCount >= 1) if(hasUnsignedTextureComponent(0)) c[i][j][k].r = MulHigh(As<UShort4>(c[i][j][k].r), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].r = MulHigh(c[i][j][k].r, fs[1 - i][1 - j][1 - k]);
-						if(componentCount >= 2) if(hasUnsignedTextureComponent(1)) c[i][j][k].g = MulHigh(As<UShort4>(c[i][j][k].g), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].g = MulHigh(c[i][j][k].g, fs[1 - i][1 - j][1 - k]);
-						if(componentCount >= 3) if(hasUnsignedTextureComponent(2)) c[i][j][k].b = MulHigh(As<UShort4>(c[i][j][k].b), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].b = MulHigh(c[i][j][k].b, fs[1 - i][1 - j][1 - k]);
-						if(componentCount >= 4) if(hasUnsignedTextureComponent(3)) c[i][j][k].a = MulHigh(As<UShort4>(c[i][j][k].a), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].a = MulHigh(c[i][j][k].a, fs[1 - i][1 - j][1 - k]);
+						if(componentCount >= 1) if(hasUnsignedTextureComponent(0)) c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]);
+						if(componentCount >= 2) if(hasUnsignedTextureComponent(1)) c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]);
+						if(componentCount >= 3) if(hasUnsignedTextureComponent(2)) c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]);
+						if(componentCount >= 4) if(hasUnsignedTextureComponent(3)) c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), As<UShort4>(f[1 - i][1 - j][1 - k])); else c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]);
 
 						if(i != 0 || j != 0 || k != 0)
 						{
-							if(componentCount >= 1) c[0][0][0].r += c[i][j][k].r;
-							if(componentCount >= 2) c[0][0][0].g += c[i][j][k].g;
-							if(componentCount >= 3) c[0][0][0].b += c[i][j][k].b;
-							if(componentCount >= 4) c[0][0][0].a += c[i][j][k].a;
+							if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
+							if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
+							if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
+							if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
 						}
 					}
 				}
 			}
 
-			if(componentCount >= 1) c_.r = c[0][0][0].r;
-			if(componentCount >= 2) c_.g = c[0][0][0].g;
-			if(componentCount >= 3) c_.b = c[0][0][0].b;
-			if(componentCount >= 4) c_.a = c[0][0][0].a;
+			if(componentCount >= 1) c_.x = c[0][0][0].x;
+			if(componentCount >= 2) c_.y = c[0][0][0].y;
+			if(componentCount >= 3) c_.z = c[0][0][0].z;
+			if(componentCount >= 4) c_.w = c[0][0][0].w;
 
 			// Correct for signed fractions
-			if(componentCount >= 1) if(!hasUnsignedTextureComponent(0)) c_.r = AddSat(c_.r, c_.r);
-			if(componentCount >= 2) if(!hasUnsignedTextureComponent(1)) c_.g = AddSat(c_.g, c_.g);
-			if(componentCount >= 3) if(!hasUnsignedTextureComponent(2)) c_.b = AddSat(c_.b, c_.b);
-			if(componentCount >= 4) if(!hasUnsignedTextureComponent(3)) c_.a = AddSat(c_.a, c_.a);
+			if(componentCount >= 1) if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
+			if(componentCount >= 2) if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
+			if(componentCount >= 3) if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
+			if(componentCount >= 4) if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
 		}
 	}
 
-	void SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided)
+	void SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided)
 	{
 		bool volumeTexture = state.textureType == TEXTURE_3D;
 
@@ -900,16 +900,16 @@
 
 		if(state.mipmapFilter > MIPMAP_POINT)
 		{
-			Color4f cc;
+			Vector4f cc;
 
 			sampleFloatAniso(texture, cc, u, v, w, lod, anisotropy, uDelta, vDelta, face, true, lodProvided);
 
-			Float4 lod4 = Float4(Fraction(lod));
+			Float4 lod4 = Float4(Frac(lod));
 
-			c.r = (cc.r - c.r) * lod4 + c.r;
-			c.g = (cc.g - c.g) * lod4 + c.g;
-			c.b = (cc.b - c.b) * lod4 + c.b;
-			c.a = (cc.a - c.a) * lod4 + c.a;
+			c.x = (cc.x - c.x) * lod4 + c.x;
+			c.y = (cc.y - c.y) * lod4 + c.y;
+			c.z = (cc.z - c.z) * lod4 + c.z;
+			c.w = (cc.w - c.w) * lod4 + c.w;
 		}
 
 		Int4 borderMask;
@@ -962,14 +962,14 @@
 		{
 			Int4 b;
 
-			c.r = As<Float4>(borderMask & As<Int4>(c.r) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[0])));
-			c.g = As<Float4>(borderMask & As<Int4>(c.g) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[1])));
-			c.b = As<Float4>(borderMask & As<Int4>(c.b) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[2])));
-			c.a = As<Float4>(borderMask & As<Int4>(c.a) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[3])));
+			c.x = As<Float4>(borderMask & As<Int4>(c.x) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[0])));
+			c.y = As<Float4>(borderMask & As<Int4>(c.y) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[1])));
+			c.z = As<Float4>(borderMask & As<Int4>(c.z) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[2])));
+			c.w = As<Float4>(borderMask & As<Int4>(c.w) | ~borderMask & *Pointer<Int4>(texture + OFFSET(Texture,borderColorF[3])));
 		}
 	}
 
-	void SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided)
+	void SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided)
 	{
 		if(state.textureFilter != FILTER_ANISOTROPIC || lodProvided)
 		{
@@ -979,12 +979,12 @@
 		{
 			Int a = RoundInt(anisotropy);
 
-			Color4f cSum;
+			Vector4f cSum;
 
-			cSum.r = Float4(0, 0, 0, 0);
-			cSum.g = Float4(0, 0, 0, 0);
-			cSum.b = Float4(0, 0, 0, 0);
-			cSum.a = Float4(0, 0, 0, 0);
+			cSum.x = Float4(0.0f);
+			cSum.y = Float4(0.0f);
+			cSum.z = Float4(0.0f);
+			cSum.w = Float4(0.0f);
 
 			Float4 A = *Pointer<Float4>(constants + OFFSET(Constants,uvWeight) + 16 * a);
 			Float4 B = *Pointer<Float4>(constants + OFFSET(Constants,uvStart) + 16 * a);
@@ -1007,23 +1007,23 @@
 				u0 += du;
 				v0 += dv;
 
-				cSum.r += c.r * A;
-				cSum.g += c.g * A;
-				cSum.b += c.b * A;
-				cSum.a += c.a * A;
+				cSum.x += c.x * A;
+				cSum.y += c.y * A;
+				cSum.z += c.z * A;
+				cSum.w += c.w * A;
 
 				i++;
 			}
 			Until(i >= a)
 
-			c.r = cSum.r;
-			c.g = cSum.g;
-			c.b = cSum.b;
-			c.a = cSum.a;
+			c.x = cSum.x;
+			c.y = cSum.y;
+			c.z = cSum.z;
+			c.w = cSum.w;
 		}
 	}
 
-	void SamplerCore::sampleFloat(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD)
+	void SamplerCore::sampleFloat(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD)
 	{
 		if(state.textureType != TEXTURE_3D)
 		{
@@ -1035,7 +1035,7 @@
 		}
 	}
 	
-	void SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &z, Float &lod, Int face[4], bool secondLOD)
+	void SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &z, Float &lod, Int face[4], bool secondLOD)
 	{
 		int componentCount = textureComponentCount();
 		bool gather = state.textureFilter == FILTER_GATHER;
@@ -1057,10 +1057,10 @@
 		}
 		else
 		{
-			Color4f c0;
-			Color4f c1;
-			Color4f c2;
-			Color4f c3;
+			Vector4f c0;
+			Vector4f c1;
+			Vector4f c2;
+			Vector4f c3;
 
 			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), (AddressingMode)state.addressingModeU == ADDRESSING_WRAP, gather ? 0 : -1);
 			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), (AddressingMode)state.addressingModeV == ADDRESSING_WRAP, gather ? 0 : -1);
@@ -1075,35 +1075,35 @@
 			if(!gather)   // Blend
 			{
 				// Fractions
-				Float4 fu = Fraction(Float4(As<UShort4>(uuuu0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fWidth)));
-				Float4 fv = Fraction(Float4(As<UShort4>(vvvv0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fHeight)));
+				Float4 fu = Frac(Float4(As<UShort4>(uuuu0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fWidth)));
+				Float4 fv = Frac(Float4(As<UShort4>(vvvv0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fHeight)));
 
-				if(componentCount >= 1) c0.r = c0.r + fu * (c1.r - c0.r);
-				if(componentCount >= 2) c0.g = c0.g + fu * (c1.g - c0.g);
-				if(componentCount >= 3) c0.b = c0.b + fu * (c1.b - c0.b);
-				if(componentCount >= 4) c0.a = c0.a + fu * (c1.a - c0.a);
+				if(componentCount >= 1) c0.x = c0.x + fu * (c1.x - c0.x);
+				if(componentCount >= 2) c0.y = c0.y + fu * (c1.y - c0.y);
+				if(componentCount >= 3) c0.z = c0.z + fu * (c1.z - c0.z);
+				if(componentCount >= 4) c0.w = c0.w + fu * (c1.w - c0.w);
 
-				if(componentCount >= 1) c2.r = c2.r + fu * (c3.r - c2.r);
-				if(componentCount >= 2) c2.g = c2.g + fu * (c3.g - c2.g);
-				if(componentCount >= 3) c2.b = c2.b + fu * (c3.b - c2.b);
-				if(componentCount >= 4) c2.a = c2.a + fu * (c3.a - c2.a);
+				if(componentCount >= 1) c2.x = c2.x + fu * (c3.x - c2.x);
+				if(componentCount >= 2) c2.y = c2.y + fu * (c3.y - c2.y);
+				if(componentCount >= 3) c2.z = c2.z + fu * (c3.z - c2.z);
+				if(componentCount >= 4) c2.w = c2.w + fu * (c3.w - c2.w);
 
-				if(componentCount >= 1) c.r = c0.r + fv * (c2.r - c0.r);
-				if(componentCount >= 2) c.g = c0.g + fv * (c2.g - c0.g);
-				if(componentCount >= 3) c.b = c0.b + fv * (c2.b - c0.b);
-				if(componentCount >= 4) c.a = c0.a + fv * (c2.a - c0.a);
+				if(componentCount >= 1) c.x = c0.x + fv * (c2.x - c0.x);
+				if(componentCount >= 2) c.y = c0.y + fv * (c2.y - c0.y);
+				if(componentCount >= 3) c.z = c0.z + fv * (c2.z - c0.z);
+				if(componentCount >= 4) c.w = c0.w + fv * (c2.w - c0.w);
 			}
 			else
 			{
-				c.r = c1.r;
-				c.g = c2.r;
-				c.b = c3.r;
-				c.a = c0.r;
+				c.x = c1.x;
+				c.y = c2.x;
+				c.z = c3.x;
+				c.w = c0.x;
 			}
 		}
 	}
 
-	void SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, bool secondLOD)
+	void SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, bool secondLOD)
 	{
 		int componentCount = textureComponentCount();
 
@@ -1127,14 +1127,14 @@
 		}
 		else
 		{
-			Color4f &c0 = c;
-			Color4f c1;
-			Color4f c2;
-			Color4f c3;
-			Color4f c4;
-			Color4f c5;
-			Color4f c6;
-			Color4f c7;
+			Vector4f &c0 = c;
+			Vector4f c1;
+			Vector4f c2;
+			Vector4f c3;
+			Vector4f c4;
+			Vector4f c5;
+			Vector4f c6;
+			Vector4f c7;
 
 			Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap,uHalf), (AddressingMode)state.addressingModeU == ADDRESSING_WRAP, -1);
 			Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap,vHalf), (AddressingMode)state.addressingModeV == ADDRESSING_WRAP, -1);
@@ -1153,51 +1153,51 @@
 			sampleTexel(c7, uuuu1, vvvv1, wwww1, w, mipmap, buffer);
 
 			// Fractions
-			Float4 fu = Fraction(Float4(As<UShort4>(uuuu0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fWidth)));
-			Float4 fv = Fraction(Float4(As<UShort4>(vvvv0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fHeight)));
-			Float4 fw = Fraction(Float4(As<UShort4>(wwww0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fDepth)));
+			Float4 fu = Frac(Float4(As<UShort4>(uuuu0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fWidth)));
+			Float4 fv = Frac(Float4(As<UShort4>(vvvv0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fHeight)));
+			Float4 fw = Frac(Float4(As<UShort4>(wwww0)) * *Pointer<Float4>(mipmap + OFFSET(Mipmap,fDepth)));
 
 			// Blend first slice
-			if(componentCount >= 1) c0.r = c0.r + fu * (c1.r - c0.r);
-			if(componentCount >= 2) c0.g = c0.g + fu * (c1.g - c0.g);
-			if(componentCount >= 3) c0.b = c0.b + fu * (c1.b - c0.b);
-			if(componentCount >= 4) c0.a = c0.a + fu * (c1.a - c0.a);
+			if(componentCount >= 1) c0.x = c0.x + fu * (c1.x - c0.x);
+			if(componentCount >= 2) c0.y = c0.y + fu * (c1.y - c0.y);
+			if(componentCount >= 3) c0.z = c0.z + fu * (c1.z - c0.z);
+			if(componentCount >= 4) c0.w = c0.w + fu * (c1.w - c0.w);
 
-			if(componentCount >= 1) c2.r = c2.r + fu * (c3.r - c2.r);
-			if(componentCount >= 2) c2.g = c2.g + fu * (c3.g - c2.g);
-			if(componentCount >= 3) c2.b = c2.b + fu * (c3.b - c2.b);
-			if(componentCount >= 4) c2.a = c2.a + fu * (c3.a - c2.a);
+			if(componentCount >= 1) c2.x = c2.x + fu * (c3.x - c2.x);
+			if(componentCount >= 2) c2.y = c2.y + fu * (c3.y - c2.y);
+			if(componentCount >= 3) c2.z = c2.z + fu * (c3.z - c2.z);
+			if(componentCount >= 4) c2.w = c2.w + fu * (c3.w - c2.w);
 
-			if(componentCount >= 1) c0.r = c0.r + fv * (c2.r - c0.r);
-			if(componentCount >= 2) c0.g = c0.g + fv * (c2.g - c0.g);
-			if(componentCount >= 3) c0.b = c0.b + fv * (c2.b - c0.b);
-			if(componentCount >= 4) c0.a = c0.a + fv * (c2.a - c0.a);
+			if(componentCount >= 1) c0.x = c0.x + fv * (c2.x - c0.x);
+			if(componentCount >= 2) c0.y = c0.y + fv * (c2.y - c0.y);
+			if(componentCount >= 3) c0.z = c0.z + fv * (c2.z - c0.z);
+			if(componentCount >= 4) c0.w = c0.w + fv * (c2.w - c0.w);
 
 			// Blend second slice
-			if(componentCount >= 1) c4.r = c4.r + fu * (c5.r - c4.r);
-			if(componentCount >= 2) c4.g = c4.g + fu * (c5.g - c4.g);
-			if(componentCount >= 3) c4.b = c4.b + fu * (c5.b - c4.b);
-			if(componentCount >= 4) c4.a = c4.a + fu * (c5.a - c4.a);
+			if(componentCount >= 1) c4.x = c4.x + fu * (c5.x - c4.x);
+			if(componentCount >= 2) c4.y = c4.y + fu * (c5.y - c4.y);
+			if(componentCount >= 3) c4.z = c4.z + fu * (c5.z - c4.z);
+			if(componentCount >= 4) c4.w = c4.w + fu * (c5.w - c4.w);
 
-			if(componentCount >= 1) c6.r = c6.r + fu * (c7.r - c6.r);
-			if(componentCount >= 2) c6.g = c6.g + fu * (c7.g - c6.g);
-			if(componentCount >= 3) c6.b = c6.b + fu * (c7.b - c6.b);
-			if(componentCount >= 4) c6.a = c6.a + fu * (c7.a - c6.a);
+			if(componentCount >= 1) c6.x = c6.x + fu * (c7.x - c6.x);
+			if(componentCount >= 2) c6.y = c6.y + fu * (c7.y - c6.y);
+			if(componentCount >= 3) c6.z = c6.z + fu * (c7.z - c6.z);
+			if(componentCount >= 4) c6.w = c6.w + fu * (c7.w - c6.w);
 
-			if(componentCount >= 1) c4.r = c4.r + fv * (c6.r - c4.r);
-			if(componentCount >= 2) c4.g = c4.g + fv * (c6.g - c4.g);
-			if(componentCount >= 3) c4.b = c4.b + fv * (c6.b - c4.b);
-			if(componentCount >= 4) c4.a = c4.a + fv * (c6.a - c4.a);
+			if(componentCount >= 1) c4.x = c4.x + fv * (c6.x - c4.x);
+			if(componentCount >= 2) c4.y = c4.y + fv * (c6.y - c4.y);
+			if(componentCount >= 3) c4.z = c4.z + fv * (c6.z - c4.z);
+			if(componentCount >= 4) c4.w = c4.w + fv * (c6.w - c4.w);
 
 			// Blend slices
-			if(componentCount >= 1) c0.r = c0.r + fw * (c4.r - c0.r);
-			if(componentCount >= 2) c0.g = c0.g + fw * (c4.g - c0.g);
-			if(componentCount >= 3) c0.b = c0.b + fw * (c4.b - c0.b);
-			if(componentCount >= 4) c0.a = c0.a + fw * (c4.a - c0.a);
+			if(componentCount >= 1) c0.x = c0.x + fw * (c4.x - c0.x);
+			if(componentCount >= 2) c0.y = c0.y + fw * (c4.y - c0.y);
+			if(componentCount >= 3) c0.z = c0.z + fw * (c4.z - c0.z);
+			if(componentCount >= 4) c0.w = c0.w + fw * (c4.w - c0.w);
 		}
 	}
 
-	void SamplerCore::computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float &lodBias, Color4f &dsx, Color4f &dsy, bool bias, bool gradients, bool lodProvided)
+	void SamplerCore::computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float &lodBias, Vector4f &dsx, Vector4f &dsy, bool bias, bool gradients, bool lodProvided)
 	{
 		if(!lodProvided)
 		{
@@ -1267,7 +1267,7 @@
 		lod = Min(lod, Float(MIPMAP_LEVELS - 2));   // Trilinear accesses lod+1
 	}
 
-	void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float &lodBias, Color4f &dsx, Color4f &dsy, bool bias, bool gradients, bool lodProvided)
+	void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float &lodBias, Vector4f &dsx, Vector4f &dsy, bool bias, bool gradients, bool lodProvided)
 	{
 		if(state.mipmapFilter == MIPMAP_NONE)
 		{
@@ -1337,9 +1337,9 @@
 
 	void SamplerCore::cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &lodU, Float4 &lodV, Float4 &x, Float4 &y, Float4 &z)
 	{
-		Int4 xp = CmpNLE(x, Float4(0.0f, 0.0f, 0.0f, 0.0f));   // x > 0
-		Int4 yp = CmpNLE(y, Float4(0.0f, 0.0f, 0.0f, 0.0f));   // y > 0
-		Int4 zp = CmpNLE(z, Float4(0.0f, 0.0f, 0.0f, 0.0f));   // z > 0
+		Int4 xp = CmpNLE(x, Float4(0.0f));   // x > 0
+		Int4 yp = CmpNLE(y, Float4(0.0f));   // y > 0
+		Int4 zp = CmpNLE(z, Float4(0.0f));   // z > 0
 
 		Float4 absX = Abs(x);
 		Float4 absY = Abs(y);
@@ -1383,8 +1383,8 @@
 		Float4 M = As<Float4>((xyz & As<Int4>(x)) | (yzx & As<Int4>(y)) | (zxy & As<Int4>(z)));
 		
 		M = reciprocal(M);
-		U *= M * Float4(0.5f, 0.5f, 0.5f, 0.5f);
-		V *= M * Float4(0.5f, 0.5f, 0.5f, 0.5f);
+		U *= M * Float4(0.5f);
+		V *= M * Float4(0.5f);
 
 		// Project coordinates onto one face for consistent LOD calculation
 		{
@@ -1404,8 +1404,8 @@
 			Float4 M = As<Float4>((xyz & As<Int4>(x)) | (yzx & As<Int4>(y)) | (zxy & As<Int4>(z)));
 			
 			M = Rcp_pp(M);
-			lodU *= M * Float4(0.5f, 0.5f, 0.5f, 0.5f);
-			lodV *= M * Float4(0.5f, 0.5f, 0.5f, 0.5f);
+			lodU *= M * Float4(0.5f);
+			lodV *= M * Float4(0.5f);
 		}
 	}
 
@@ -1467,7 +1467,7 @@
 		index[3] = Extract(As<Int2>(uuu2), 1);
 	}
 
-	void SamplerCore::sampleTexel(Color4i &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4])
+	void SamplerCore::sampleTexel(Vector4i &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4])
 	{
 		Int index[4];
 
@@ -1493,32 +1493,32 @@
 					Byte8 c1 = *Pointer<Byte8>(buffer[f1] + 4 * index[1]);
 					Byte8 c2 = *Pointer<Byte8>(buffer[f2] + 4 * index[2]);
 					Byte8 c3 = *Pointer<Byte8>(buffer[f3] + 4 * index[3]);
-					c.r = UnpackLow(c0, c1);
-					c.g = UnpackLow(c2, c3);
+					c.x = UnpackLow(c0, c1);
+					c.y = UnpackLow(c2, c3);
 					
 					switch(state.textureFormat)
 					{
 					case FORMAT_A8R8G8B8:
-						c.b = c.r;
-						c.b = As<Short4>(UnpackLow(c.b, c.g));
-						c.r = As<Short4>(UnpackHigh(c.r, c.g));
-						c.g = c.b;
-						c.a = c.r;
-						c.b = UnpackLow(As<Byte8>(c.b), As<Byte8>(c.b));
-						c.g = UnpackHigh(As<Byte8>(c.g), As<Byte8>(c.g));
-						c.r = UnpackLow(As<Byte8>(c.r), As<Byte8>(c.r));
-						c.a = UnpackHigh(As<Byte8>(c.a), As<Byte8>(c.a));
+						c.z = c.x;
+						c.z = As<Short4>(UnpackLow(c.z, c.y));
+						c.x = As<Short4>(UnpackHigh(c.x, c.y));
+						c.y = c.z;
+						c.w = c.x;
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(c.w));
 						break;
 					case FORMAT_Q8W8V8U8:
-						c.b = c.r;
-						c.r = As<Short4>(UnpackLow(c.r, c.g));
-						c.b = As<Short4>(UnpackHigh(c.b, c.g));
-						c.g = c.r;
-						c.a = c.b;
-						c.r = UnpackLow(As<Byte8>(c.r), As<Byte8>(c.r));
-						c.g = UnpackHigh(As<Byte8>(c.g), As<Byte8>(c.g));
-						c.b = UnpackLow(As<Byte8>(c.b), As<Byte8>(c.b));
-						c.a = UnpackHigh(As<Byte8>(c.a), As<Byte8>(c.a));
+						c.z = c.x;
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.z = As<Short4>(UnpackHigh(c.z, c.y));
+						c.y = c.x;
+						c.w = c.z;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(c.w));
 						break;
 					default:
 						ASSERT(false);
@@ -1531,30 +1531,30 @@
 					Byte8 c1 = *Pointer<Byte8>(buffer[f1] + 4 * index[1]);
 					Byte8 c2 = *Pointer<Byte8>(buffer[f2] + 4 * index[2]);
 					Byte8 c3 = *Pointer<Byte8>(buffer[f3] + 4 * index[3]);
-					c.r = UnpackLow(c0, c1);
-					c.g = UnpackLow(c2, c3);
+					c.x = UnpackLow(c0, c1);
+					c.y = UnpackLow(c2, c3);
 
 					switch(state.textureFormat)
 					{
 					case FORMAT_X8R8G8B8:
-						c.b = c.r;
-						c.b = As<Short4>(UnpackLow(c.b, c.g));
-						c.r = As<Short4>(UnpackHigh(c.r, c.g));
-						c.g = c.b;
-						c.b = UnpackLow(As<Byte8>(c.b), As<Byte8>(c.b));
-						c.g = UnpackHigh(As<Byte8>(c.g), As<Byte8>(c.g));
-						c.r = UnpackLow(As<Byte8>(c.r), As<Byte8>(c.r));
+						c.z = c.x;
+						c.z = As<Short4>(UnpackLow(c.z, c.y));
+						c.x = As<Short4>(UnpackHigh(c.x, c.y));
+						c.y = c.z;
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(c.y));
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(c.x));
 						break;
 					case FORMAT_X8L8V8U8:
-						c.b = c.r;
-						c.r = As<Short4>(UnpackLow(c.r, c.g));
-						c.b = As<Short4>(UnpackHigh(c.b, c.g));
-						c.g = c.r;
-						c.r = UnpackLow(As<Byte8>(c.r), As<Byte8>(Short4(0x0000, 0x0000, 0x0000, 0x0000)));
-						c.r = c.r << 8;
-						c.g = UnpackHigh(As<Byte8>(c.g), As<Byte8>(Short4(0x0000, 0x0000, 0x0000, 0x0000)));
-						c.g = c.g << 8;
-						c.b = UnpackLow(As<Byte8>(c.b), As<Byte8>(c.b));
+						c.z = c.x;
+						c.x = As<Short4>(UnpackLow(c.x, c.y));
+						c.z = As<Short4>(UnpackHigh(c.z, c.y));
+						c.y = c.x;
+						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+						c.x = c.x << 8;
+						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0x0000, 0x0000, 0x0000, 0x0000)));
+						c.y = c.y << 8;
+						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(c.z));
 						break;
 					default:
 						ASSERT(false);
@@ -1562,10 +1562,10 @@
 				}
 				break;
 			case 2:
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f0] + 2 * index[0]), 0);
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f1] + 2 * index[1]), 1);
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f2] + 2 * index[2]), 2);
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f3] + 2 * index[3]), 3);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f0] + 2 * index[0]), 0);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f1] + 2 * index[1]), 1);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f2] + 2 * index[2]), 2);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f3] + 2 * index[3]), 3);
 
 				switch(state.textureFormat)
 				{
@@ -1573,8 +1573,8 @@
 				case FORMAT_V8U8:
 				case FORMAT_A8L8:
 					// FIXME: Unpack properly to 0.16 format
-					c.g = c.r;
-					c.r = c.r << 8;
+					c.y = c.x;
+					c.x = c.x << 8;
 					break;
 				default:
 					ASSERT(false);
@@ -1586,8 +1586,7 @@
 				c2 = Int(*Pointer<Byte>(buffer[f2] + index[2]));
 				c3 = Int(*Pointer<Byte>(buffer[f3] + index[3]));
 				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
-				c.r = As<Short4>(Int2(c0));
-				c.r = UnpackLow(As<Byte8>(c.r), As<Byte8>(c.r));
+				c.x = Unpack(As<Byte4>(c0));
 				break;
 			default:
 				ASSERT(false);
@@ -1598,26 +1597,26 @@
 			switch(textureComponentCount())
 			{
 			case 4:
-				c.r = *Pointer<Short4>(buffer[f0] + 8 * index[0]);
-				c.g = *Pointer<Short4>(buffer[f1] + 8 * index[1]);
-				c.b = *Pointer<Short4>(buffer[f2] + 8 * index[2]);
-				c.a = *Pointer<Short4>(buffer[f3] + 8 * index[3]);
-				transpose4x4(c.r, c.g, c.b, c.a);
+				c.x = *Pointer<Short4>(buffer[f0] + 8 * index[0]);
+				c.y = *Pointer<Short4>(buffer[f1] + 8 * index[1]);
+				c.z = *Pointer<Short4>(buffer[f2] + 8 * index[2]);
+				c.w = *Pointer<Short4>(buffer[f3] + 8 * index[3]);
+				transpose4x4(c.x, c.y, c.z, c.w);
 				break;
 			case 2:
-				c.r = *Pointer<Short4>(buffer[f0] + 4 * index[0]);
-				c.r = As<Short4>(UnpackLow(c.r, *Pointer<Short4>(buffer[f1] + 4 * index[1])));
-				c.b = *Pointer<Short4>(buffer[f2] + 4 * index[2]);
-				c.b = As<Short4>(UnpackLow(c.b, *Pointer<Short4>(buffer[f3] + 4 * index[3])));
-				c.g = c.r;
-				c.r = As<Short4>(UnpackLow(As<Int2>(c.r), As<Int2>(c.b)));
-				c.g = As<Short4>(UnpackHigh(As<Int2>(c.g), As<Int2>(c.b)));
+				c.x = *Pointer<Short4>(buffer[f0] + 4 * index[0]);
+				c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer[f1] + 4 * index[1])));
+				c.z = *Pointer<Short4>(buffer[f2] + 4 * index[2]);
+				c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer[f3] + 4 * index[3])));
+				c.y = c.x;
+				c.x = As<Short4>(UnpackLow(As<Int2>(c.x), As<Int2>(c.z)));
+				c.y = As<Short4>(UnpackHigh(As<Int2>(c.y), As<Int2>(c.z)));
 				break;
 			case 1:
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f0] + 2 * index[0]), 0);
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f1] + 2 * index[1]), 1);
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f2] + 2 * index[2]), 2);
-				c.r = Insert(c.r, *Pointer<Short>(buffer[f3] + 2 * index[3]), 3);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f0] + 2 * index[0]), 0);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f1] + 2 * index[1]), 1);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f2] + 2 * index[2]), 2);
+				c.x = Insert(c.x, *Pointer<Short>(buffer[f3] + 2 * index[3]), 3);
 				break;
 			default:
 				ASSERT(false);
@@ -1625,7 +1624,7 @@
 		}
 	}
 
-	void SamplerCore::sampleTexel(Color4f &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4])
+	void SamplerCore::sampleTexel(Vector4f &c, Short4 &uuuu, Short4 &vvvv, Short4 &wwww, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4])
 	{
 		Int index[4];
 
@@ -1640,34 +1639,34 @@
 		switch(textureComponentCount())
 		{
 		case 4:
-			c.r = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
-			c.g = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
-			c.b = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
-			c.a = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
-			transpose4x4(c.r, c.g, c.b, c.a);
+			c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+			c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+			c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+			c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+			transpose4x4(c.x, c.y, c.z, c.w);
 			break;
 		case 2:
 			// FIXME: Optimal shuffling?
-			c.r.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
-			c.r.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
-			c.b.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
-			c.b.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
-			c.g = c.r;
-			c.r = Float4(c.r.xz, c.b.xz);
-			c.g = Float4(c.g.yw, c.b.yw);
+			c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
+			c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
+			c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
+			c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
+			c.y = c.x;
+			c.x = Float4(c.x.xz, c.z.xz);
+			c.y = Float4(c.y.yw, c.z.yw);
 			break;
 		case 1:
 			// FIXME: Optimal shuffling?
-			c.r.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
-			c.r.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
-			c.r.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
-			c.r.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
+			c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
+			c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
+			c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
+			c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
 
 			if(state.textureFormat == FORMAT_D32F_SHADOW && state.textureFilter != FILTER_GATHER)
 			{
 				Float4 d = Min(Max(z, Float4(0.0f)), Float4(1.0f));
 
-				c.r = As<Float4>(As<Int4>(CmpNLT(c.r, d)) & As<Int4>(Float4(1.0f, 1.0f, 1.0f, 1.0f)));   // FIXME: Only less-equal?
+				c.x = As<Float4>(As<Int4>(CmpNLT(c.x, d)) & As<Int4>(Float4(1.0f)));   // FIXME: Only less-equal?
 			}
 			break;
 		default:
@@ -1746,15 +1745,15 @@
 
 	void SamplerCore::convertFixed12(Short4 &ci, Float4 &cf)
 	{
-		ci = RoundShort4(cf * Float4(0x1000, 0x1000, 0x1000, 0x1000));
+		ci = RoundShort4(cf * Float4(0x1000));   // Multiply by 0x1000 (2^12), then round into the Short4 output
 	}
 
-	void SamplerCore::convertFixed12(Color4i &ci, Color4f &cf)
+	void SamplerCore::convertFixed12(Vector4i &ci, Vector4f &cf)   // Applies convertFixed12 to each of the four components of cf, writing into ci
 	{
-		convertFixed12(ci.r, cf.r);
-		convertFixed12(ci.g, cf.g);
-		convertFixed12(ci.b, cf.b);
-		convertFixed12(ci.a, cf.a);
+		convertFixed12(ci.x, cf.x);   // Members x/y/z/w replace the former r/g/b/a names
+		convertFixed12(ci.y, cf.y);
+		convertFixed12(ci.z, cf.z);
+		convertFixed12(ci.w, cf.w);
 	}
 
 	void SamplerCore::convertSigned12(Float4 &cf, Short4 &ci)
@@ -1762,22 +1761,22 @@
 		cf = Float4(ci) * Float4(1.0f / 0x0FFE);
 	}
 
-//	void SamplerCore::convertSigned12(Color4f &cf, Color4i &ci)
+//	void SamplerCore::convertSigned12(Vector4f &cf, Vector4i &ci)
 //	{
-//		convertSigned12(cf.r, ci.r);
-//		convertSigned12(cf.g, ci.g);
-//		convertSigned12(cf.b, ci.b);
-//		convertSigned12(cf.a, ci.a);
+//		convertSigned12(cf.x, ci.x);
+//		convertSigned12(cf.y, ci.y);
+//		convertSigned12(cf.z, ci.z);
+//		convertSigned12(cf.w, ci.w);
 //	}
 
 	void SamplerCore::convertSigned15(Float4 &cf, Short4 &ci)
 	{
-		cf = Float4(ci) * Float4(1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF, 1.0f / 0x7FFF);
+		cf = Float4(ci) * Float4(1.0f / 0x7FFF);   // Convert the signed shorts to float and scale by 1/32767 (0x7FFF)
 	}
 
 	void SamplerCore::convertUnsigned16(Float4 &cf, Short4 &ci)
 	{
-		cf = Float4(As<UShort4>(ci)) * Float4(1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF, 1.0f / 0xFFFF);
+		cf = Float4(As<UShort4>(ci)) * Float4(1.0f / 0xFFFF);   // Reinterpret the shorts as unsigned, convert to float and scale by 1/65535 (0xFFFF)
 	}
 
 	void SamplerCore::sRGBtoLinear16_12(Short4 &c)
diff --git a/src/Shader/SamplerCore.hpp b/src/Shader/SamplerCore.hpp
index c3c11e9..c03e30c 100644
--- a/src/Shader/SamplerCore.hpp
+++ b/src/Shader/SamplerCore.hpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer

 //

-// Copyright(c) 2005-2011 TransGaming Inc.

+// Copyright(c) 2005-2012 TransGaming Inc.

 //

 // All rights reserved. No part of this software may be copied, distributed, transmitted,

 // transcribed, stored in a retrieval system, translated into any human or computer

@@ -22,34 +22,34 @@
 	public:

 		SamplerCore(Pointer<Byte> &r, const Sampler::State &state);

 

-		void sampleTexture(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool bias = false, bool fixed12 = true, bool gradients = false, bool lodProvided = false);

-		void sampleTexture(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Color4f &dsx, Color4f &dsy, bool bias = false, bool gradients = false, bool lodProvided = false);

+		void sampleTexture(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool bias = false, bool fixed12 = true, bool gradients = false, bool lodProvided = false);

+		void sampleTexture(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float4 &q, Vector4f &dsx, Vector4f &dsy, bool bias = false, bool gradients = false, bool lodProvided = false);

 

 	private:

 		void border(Short4 &mask, Float4 &coordinates);

 		void border(Int4 &mask, Float4 &coordinates);

 		Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count);

-		void sampleFilter(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided);

-		void sampleAniso(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided);

-		void sampleQuad(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD);

-		void sampleQuad2D(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float &lod, Int face[4], bool secondLOD);

-		void sample3D(Pointer<Byte> &texture, Color4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, bool secondLOD);

-		void sampleFloatFilter(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided);

-		void sampleFloatAniso(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided);

-		void sampleFloat(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD);

-		void sampleFloat2D(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &z, Float &lod, Int face[4], bool secondLOD);

-		void sampleFloat3D(Pointer<Byte> &texture, Color4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, bool secondLOD);

-		void computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Float &lodBias, Color4f &dsx, Color4f &dsy, bool bias, bool gradients, bool lodProvided);

-		void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float &lodBias, Color4f &dsx, Color4f &dsy, bool bias, bool gradients, bool lodProvided);

+		void sampleFilter(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided);

+		void sampleAniso(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided);

+		void sampleQuad(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD);

+		void sampleQuad2D(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float &lod, Int face[4], bool secondLOD);

+		void sample3D(Pointer<Byte> &texture, Vector4i &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, bool secondLOD);

+		void sampleFloatFilter(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool lodProvided);

+		void sampleFloatAniso(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Int face[4], bool secondLOD, bool lodProvided);

+		void sampleFloat(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, Int face[4], bool secondLOD);

+		void sampleFloat2D(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &z, Float &lod, Int face[4], bool secondLOD);

+		void sampleFloat3D(Pointer<Byte> &texture, Vector4f &c, Float4 &u, Float4 &v, Float4 &w, Float &lod, bool secondLOD);

+		void computeLod(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Float &lodBias, Vector4f &dsx, Vector4f &dsy, bool bias, bool gradients, bool lodProvided);

+		void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float &lodBias, Vector4f &dsx, Vector4f &dsy, bool bias, bool gradients, bool lodProvided);

 		void cubeFace(Int face[4], Float4 &U, Float4 &V, Float4 &lodU, Float4 &lodV, Float4 &x, Float4 &y, Float4 &z);

 		void computeIndices(Int index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Pointer<Byte> &mipmap);

-		void sampleTexel(Color4i &c, Short4 &u, Short4 &v, Short4 &s, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4]);

-		void sampleTexel(Color4f &c, Short4 &u, Short4 &v, Short4 &s, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4]);

+		void sampleTexel(Vector4i &c, Short4 &u, Short4 &v, Short4 &s, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4]);

+		void sampleTexel(Vector4f &c, Short4 &u, Short4 &v, Short4 &s, Float4 &z, Pointer<Byte> &mipmap, Pointer<Byte> buffer[4]);

 		void selectMipmap(Pointer<Byte> &texture, Pointer<Byte> buffer[4], Pointer<Byte> &mipmap, Float &lod, Int face[4], bool secondLOD);

 		void address(Short4 &uuuu, Float4 &uw, AddressingMode addressingMode);

 

 		void convertFixed12(Short4 &ci, Float4 &cf);

-		void convertFixed12(Color4i &ci, Color4f &cf);

+		void convertFixed12(Vector4i &ci, Vector4f &cf);

 		void convertSigned12(Float4 &cf, Short4 &ci);

 		void convertSigned15(Float4 &cf, Short4 &ci);

 		void convertUnsigned16(Float4 &cf, Short4 &ci);

diff --git a/src/Shader/SetupRoutine.cpp b/src/Shader/SetupRoutine.cpp
index 40a2a18..d77a21e 100644
--- a/src/Shader/SetupRoutine.cpp
+++ b/src/Shader/SetupRoutine.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -15,7 +15,7 @@
 #include "Renderer/Primitive.hpp"
 #include "Renderer/Polygon.hpp"
 #include "Renderer/Renderer.hpp"
-#include "Reactor/Shell.hpp"
+#include "Reactor/Reactor.hpp"
 
 namespace sw
 {
@@ -108,10 +108,16 @@
 
 				if(state.twoSidedStencil)
 				{
-					Byte8 clockwiseMask = IfThenElse(A > Float(0.0f), Byte8(0xFFFFFFFFFFFFFFFF), Byte8(0x0000000000000000));
-
-					*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) =  clockwiseMask;
-					*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = ~clockwiseMask;
+					If(A > Float(0.0f))
+					{
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) =  Byte8(0xFFFFFFFFFFFFFFFF);
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0x0000000000000000);
+					}
+					Else
+					{
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask)) =  Byte8(0x0000000000000000);
+						*Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask)) = Byte8(0xFFFFFFFFFFFFFFFF);
+					}
 				}
 
 				if(state.vFace)
@@ -145,8 +151,8 @@
 					Float w = v.w;
 					Float rhw = IfThenElse(w != Float(0.0f), Float(1.0f) / w, Float(1.0f));
 
-					X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,LLLLx16)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,WWWWx16)));
-					Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,TTTTx16)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,HHHHx16)));
+					X[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float>(data + OFFSET(DrawData,Wx16)));
+					Y[i] = RoundInt(*Pointer<Float>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float>(data + OFFSET(DrawData,Hx16)));
 
 					i++;
 				}
@@ -161,8 +167,8 @@
 			
 			Do
 			{
-				yMin = IfThenElse(Y[i] < yMin, Int(Y[i]), yMin);   // FIXME: Min(Y[i], yMin)
-				yMax = IfThenElse(Y[i] > yMax, Int(Y[i]), yMax);   // FIXME: Max(Y[i], yMax)
+				yMin = Min(Y[i], yMin);
+				yMax = Max(Y[i], yMax);
 
 				i++;
 			}
@@ -183,6 +189,9 @@
 			{
 				Return(false);
 			}
+
+			yMin = Max(yMin, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
+			yMax = Min(yMax, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
 		
 			For(Int q = 0, q < state.multiSample, q++)
 			{
@@ -229,7 +238,7 @@
 
 					Do
 					{
-						edge(primitive, Int(Xq[i + 1 - d]), Int(Yq[i + 1 - d]), Int(Xq[i + d]), Int(Yq[i + d]), q);
+						edge(primitive, data, Int(Xq[i + 1 - d]), Int(Yq[i + 1 - d]), Int(Xq[i + d]), Int(Yq[i + d]), q);
 
 						i++;
 					}
@@ -372,7 +381,7 @@
 			//	M[2].w = 0;
 			}
 
-			if(state.perspective)
+			if(state.interpolateW)
 			{
 				Float4 ABC = M[0] + M[1] + M[2];
 
@@ -385,7 +394,7 @@
 				*Pointer<Float4>(primitive + OFFSET(Primitive,w.C), 16) = C;
 			}
 
-			if(state.interpolateDepth)
+			if(state.interpolateZ)
 			{
 				Float z0 = *Pointer<Float>(v0 + OFFSET(Vertex,Z));
 				Float z1 = *Pointer<Float>(v1 + OFFSET(Vertex,Z));
@@ -491,7 +500,7 @@
 				if(component == 3) i.y = Float(1.0f);
 
 				if(component == 0) i.z = Float(0.5f);
-				if(component == 1) i.z = Float(0.0f);
+				if(component == 1) i.z = Float(1.0f);
 				if(component == 2) i.z = Float(0.0f);
 				if(component == 3) i.z = Float(1.0f);
 				
@@ -518,17 +527,9 @@
 				i *= w012;
 			}
 
-			Float4 A;
-			Float4 B;
-			Float4 C;
-
-			A = i.xxxx;
-			B = i.yyyy;
-			C = i.zzzz;
-
-			A *= m[0];
-			B *= m[1];
-			C *= m[2];
+			Float4 A = i.xxxx * m[0];
+			Float4 B = i.yyyy * m[1];
+			Float4 C = i.zzzz * m[2];
 
 			C = A + B + C;
 
@@ -550,10 +551,13 @@
 		}
 	}
 
-	void SetupRoutine::edge(Pointer<Byte> &primitive, Int &X1, Int &Y1, Int &X2, Int &Y2, Int &q)
+	void SetupRoutine::edge(Pointer<Byte> &primitive, Pointer<Byte> &data, Int &X1, Int &Y1, Int &X2, Int &Y2, Int &q)
 	{
 		If(Y1 != Y2)
 		{
+			Int xMin = *Pointer<Int>(data + OFFSET(DrawData,scissorX0));
+			Int xMax = *Pointer<Int>(data + OFFSET(DrawData,scissorX1));
+
 			Bool swap = Y2 < Y1;
 
 			Pointer<Byte> leftEdge = primitive + q * sizeof(Primitive) + OFFSET(Primitive,outline->left);
@@ -567,10 +571,10 @@
 			Y1 = IfThenElse(swap, Y2, Y1);
 			Y2 = IfThenElse(swap, Y0, Y2);
 
-			Int y1 = (Y1 + 0x0000000F) >> 4;
-			Int y2 = (Y2 + 0x0000000F) >> 4;
+			Int y1 = Max((Y1 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY0)));
+			Int y2 = Min((Y2 + 0x0000000F) >> 4, *Pointer<Int>(data + OFFSET(DrawData,scissorY1)));
 
-			If(y1 != y2)
+			If(y1 < y2)
 			{
 				// Deltas
 				Int DX12 = X2 - X1;
@@ -579,7 +583,7 @@
 				Int FDX12 = DX12 << 4;
 				Int FDY12 = DY12 << 4;
 
-				Int X = DX12 * (-Y1 & 0xF) + X1 * DY12;
+				Int X = DX12 * ((y1 << 4) - Y1) + X1 * DY12;
 				Int x = X / FDY12;     // Edge
 				Int d = X % FDY12;     // Error-term
 				Int ceil = -d >> 31;   // Ceiling division: remainder <= 0
@@ -597,7 +601,7 @@
 
 				Do
 				{
-					*Pointer<Short>(edge + y * sizeof(Primitive::Span)) = Short(x);
+					*Pointer<Short>(edge + y * sizeof(Primitive::Span)) = Short(Clamp(x, xMin, xMax));
 
 					x += Q;
 					d += R;
diff --git a/src/Shader/SetupRoutine.hpp b/src/Shader/SetupRoutine.hpp
index cf40281..dd280c7 100644
--- a/src/Shader/SetupRoutine.hpp
+++ b/src/Shader/SetupRoutine.hpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer

 //

-// Copyright(c) 2005-2011 TransGaming Inc.

+// Copyright(c) 2005-2012 TransGaming Inc.

 //

 // All rights reserved. No part of this software may be copied, distributed, transmitted,

 // transcribed, stored in a retrieval system, translated into any human or computer

@@ -31,7 +31,7 @@
 

 	private:

 		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool sprite, bool perspective, bool wrap, int component);

-		void edge(Pointer<Byte> &primitive, Int &X1, Int &Y1, Int &X2, Int &Y2, Int &q);

+		void edge(Pointer<Byte> &primitive, Pointer<Byte> &data, Int &X1, Int &Y1, Int &X2, Int &Y2, Int &q);

 		void conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);

 		void conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);

 

diff --git a/src/Shader/Shader.cpp b/src/Shader/Shader.cpp
index 43e8bdc..fcbc839 100644
--- a/src/Shader/Shader.cpp
+++ b/src/Shader/Shader.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -11,42 +11,137 @@
 
 #include "Shader.hpp"
 
+#include "VertexShader.hpp"
+#include "PixelShader.hpp"
 #include "Math.hpp"
 #include "Debug.hpp"
 
-#include <stdarg.h>
 #include <fstream>
 #include <sstream>
 
 namespace sw
 {
-	Shader::Instruction::Instruction()
+	volatile int Shader::serialCounter = 1;   // Initial value is 1. NOTE(review): volatile alone does not make updates atomic; confirm how concurrent use is guarded
+
+	Shader::Opcode Shader::OPCODE_DP(int i)   // Maps a component count i (1..4) to OPCODE_DP1..OPCODE_DP4
 	{
-		operation.opcode = Operation::OPCODE_NOP;
-		destinationParameter.type = Parameter::PARAMETER_VOID;
-		sourceParameter[0].type = Parameter::PARAMETER_VOID;
-		sourceParameter[1].type = Parameter::PARAMETER_VOID;
-		sourceParameter[2].type = Parameter::PARAMETER_VOID;
-		sourceParameter[3].type = Parameter::PARAMETER_VOID;
+		switch(i)
+		{
+		default: ASSERT(false);   // Unsupported count; there is no break, so if ASSERT returns this continues into case 1
+		case 1: return OPCODE_DP1;
+		case 2: return OPCODE_DP2;
+		case 3: return OPCODE_DP3;
+		case 4: return OPCODE_DP4;
+		}
 	}
 
-	Shader::Instruction::Instruction(const unsigned long *token, int size, unsigned char majorVersion)
+	Shader::Opcode Shader::OPCODE_LEN(int i)   // Maps a component count i (1..4) to the matching length opcode
+	{
+		switch(i)
+		{
+		default: ASSERT(false);   // Unsupported count; there is no break, so if ASSERT returns this continues into case 1
+		case 1: return OPCODE_ABS;   // One component maps to OPCODE_ABS (presumably because a scalar's length is its absolute value)
+		case 2: return OPCODE_LEN2;
+		case 3: return OPCODE_LEN3;
+		case 4: return OPCODE_LEN4;
+		}
+	}
+
+	Shader::Opcode Shader::OPCODE_DIST(int i)   // Maps a component count i (1..4) to OPCODE_DIST1..OPCODE_DIST4
+	{
+		switch(i)
+		{
+		default: ASSERT(false);   // Unsupported count; there is no break, so if ASSERT returns this continues into case 1
+		case 1: return OPCODE_DIST1;
+		case 2: return OPCODE_DIST2;
+		case 3: return OPCODE_DIST3;
+		case 4: return OPCODE_DIST4;
+		}
+	}
+
+	Shader::Opcode Shader::OPCODE_NRM(int i)	   // Maps a component count i (1..4) to the matching normalize opcode
+	{
+		switch(i)
+		{
+		default: ASSERT(false);   // Unsupported count; there is no break, so if ASSERT returns this continues into case 1
+		case 1: return OPCODE_SGN;   // One component maps to OPCODE_SGN (presumably because normalizing a scalar yields its sign)
+		case 2: return OPCODE_NRM2;
+		case 3: return OPCODE_NRM3;
+		case 4: return OPCODE_NRM4;
+		}
+	}
+
+	Shader::Opcode Shader::OPCODE_FORWARD(int i)   // Maps a component count i (1..4) to OPCODE_FORWARD1..OPCODE_FORWARD4
+	{
+		switch(i)
+		{
+		default: ASSERT(false);   // Unsupported count; there is no break, so if ASSERT returns this continues into case 1
+		case 1: return OPCODE_FORWARD1;
+		case 2: return OPCODE_FORWARD2;
+		case 3: return OPCODE_FORWARD3;
+		case 4: return OPCODE_FORWARD4;
+		}
+	}
+
+	Shader::Opcode Shader::OPCODE_REFLECT(int i)   // Maps a component count i (1..4) to OPCODE_REFLECT1..OPCODE_REFLECT4
+	{
+		switch(i)
+		{
+		default: ASSERT(false);   // Unsupported count; there is no break, so if ASSERT returns this continues into case 1
+		case 1: return OPCODE_REFLECT1;
+		case 2: return OPCODE_REFLECT2;
+		case 3: return OPCODE_REFLECT3;
+		case 4: return OPCODE_REFLECT4;
+		}
+	}
+
+	Shader::Opcode Shader::OPCODE_REFRACT(int i)   // Maps a component count i (1..4) to OPCODE_REFRACT1..OPCODE_REFRACT4
+	{
+		switch(i)
+		{
+		default: ASSERT(false);   // Unsupported count; there is no break, so if ASSERT returns this continues into case 1
+		case 1: return OPCODE_REFRACT1;
+		case 2: return OPCODE_REFRACT2;
+		case 3: return OPCODE_REFRACT3;
+		case 4: return OPCODE_REFRACT4;
+		}
+	}
+
+	Shader::Instruction::Instruction(Opcode opcode) : opcode(opcode), analysis(0)   // Builds an instruction for the given opcode with default control, predicate and declaration fields
+	{
+		control = CONTROL_RESERVED0;
+
+		predicate = false;   // Not predicated
+		predicateNot = false;
+		predicateSwizzle = 0xE4;   // 0xE4 is the swizzle value that swizzleString() prints as an empty suffix
+		
+		coissue = false;
+		samplerType = SAMPLER_UNKNOWN;   // Same declaration defaults the token-parsing constructor assigns before parsing
+		usage = USAGE_POSITION;
+		usageIndex = 0;
+	}
+
+	Shader::Instruction::Instruction(const unsigned long *token, int size, unsigned char majorVersion) : analysis(0)
 	{
 		parseOperationToken(*token++, majorVersion);
 
-		if(operation.opcode == Operation::OPCODE_IF ||
-		   operation.opcode == Operation::OPCODE_IFC ||
-		   operation.opcode == Operation::OPCODE_LOOP ||
-		   operation.opcode == Operation::OPCODE_REP ||
-		   operation.opcode == Operation::OPCODE_BREAKC ||
-		   operation.opcode == Operation::OPCODE_BREAKP)   // No destination operand
+		samplerType = SAMPLER_UNKNOWN;
+		usage = USAGE_POSITION;
+		usageIndex = 0;
+
+		if(opcode == OPCODE_IF ||
+		   opcode == OPCODE_IFC ||
+		   opcode == OPCODE_LOOP ||
+		   opcode == OPCODE_REP ||
+		   opcode == OPCODE_BREAKC ||
+		   opcode == OPCODE_BREAKP)   // No destination operand
 		{
 			if(size > 0) parseSourceToken(0, token++, majorVersion);
 			if(size > 1) parseSourceToken(1, token++, majorVersion);
 			if(size > 2) parseSourceToken(2, token++, majorVersion);
 			if(size > 3) ASSERT(false);
 		}
-		else if(operation.opcode == Operation::OPCODE_DCL)
+		else if(opcode == OPCODE_DCL)
 		{
 			parseDeclarationToken(*token++);
 			parseDestinationToken(token++, majorVersion);
@@ -57,7 +152,7 @@
 			{
 				parseDestinationToken(token, majorVersion);
 
-				if(destinationParameter.relative && majorVersion >= 3)
+				if(dst.rel.type != PARAMETER_VOID && majorVersion >= 3)
 				{
 					token++;
 					size--;
@@ -67,12 +162,12 @@
 				size--;
 			}
 
-			if(operation.predicate)
+			if(predicate)
 			{
 				ASSERT(size != 0);
 
-				operation.predicateNot = (SourceParameter::Modifier)((*token & 0x0F000000) >> 24) == SourceParameter::MODIFIER_NOT;
-				operation.predicateSwizzle = (unsigned char)((*token & 0x00FF0000) >> 16);
+				predicateNot = (Modifier)((*token & 0x0F000000) >> 24) == MODIFIER_NOT;
+				predicateSwizzle = (unsigned char)((*token & 0x00FF0000) >> 16);
 				
 				token++;
 				size--;
@@ -85,7 +180,7 @@
 				token++;
 				size--;
 
-				if(sourceParameter[i].relative && majorVersion >= 2)
+				if(src[i].rel.type != PARAMETER_VOID && majorVersion >= 2)
 				{
 					token++;
 					size--;
@@ -98,105 +193,40 @@
 	{
 	}
 
-	Shader::Instruction::Operation::Opcode Shader::Instruction::getOpcode() const
-	{
-		return operation.opcode;
-	}
-
-	const Shader::Instruction::DestinationParameter &Shader::Instruction::getDestinationParameter() const
-	{
-		return destinationParameter;
-	}
-
-	const Shader::Instruction::SourceParameter &Shader::Instruction::getSourceParameter(int i) const
-	{
-		return sourceParameter[i];
-	}
-
-	bool Shader::Instruction::isCoissue() const
-	{
-		return operation.coissue;
-	}
-
-	bool Shader::Instruction::isProject() const
-	{
-		return operation.project;
-	}
-
-	bool Shader::Instruction::isBias() const
-	{
-		return operation.bias;
-	}
-
-	bool Shader::Instruction::isPredicate() const
-	{
-		return operation.predicate;
-	}
-
-	bool Shader::Instruction::isPredicateNot() const
-	{
-		return operation.predicateNot;
-	}
-
-	unsigned char Shader::Instruction::getPredicateSwizzle() const
-	{
-		return operation.predicateSwizzle;
-	}
-
-	Shader::Instruction::Operation::Control Shader::Instruction::getControl() const
-	{
-		return operation.control;
-	}
-
-	Shader::Instruction::Operation::Usage Shader::Instruction::getUsage() const
-	{
-		return operation.usage;
-	}
-
-	unsigned char Shader::Instruction::getUsageIndex() const
-	{
-		return operation.usageIndex;
-	}
-
-	Shader::Instruction::Operation::SamplerType Shader::Instruction::getSamplerType() const
-	{
-		return operation.samplerType;
-	}
-
 	std::string Shader::Instruction::string(ShaderType shaderType, unsigned short version) const
 	{
 		std::string instructionString;
 		
-		if(operation.opcode != Operation::OPCODE_DCL)
+		if(opcode != OPCODE_DCL)
 		{
-			instructionString += operation.coissue ? "+ " : "";
+			instructionString += coissue ? "+ " : "";
 			
-			if(operation.predicate)
+			if(predicate)
 			{
-				instructionString += operation.predicateNot ? "(!p0" : "(p0";
-				instructionString += swizzleString(Parameter::PARAMETER_PREDICATE, operation.predicateSwizzle);
+				instructionString += predicateNot ? "(!p0" : "(p0";
+				instructionString += swizzleString(PARAMETER_PREDICATE, predicateSwizzle);
 				instructionString += ") ";
 			}
 
-			instructionString += operation.string(version) + operation.controlString() + destinationParameter.shiftString() + destinationParameter.modifierString();
+			instructionString += operationString(version) + controlString() + dst.shiftString() + dst.modifierString();
 
-			if(destinationParameter.type != Parameter::PARAMETER_VOID)
+			if(dst.type != PARAMETER_VOID)
 			{
-				instructionString += " " + destinationParameter.string(shaderType, version) +
-				                           destinationParameter.relativeString() +
-				                           destinationParameter.maskString(); 
+				instructionString += " " + dst.string(shaderType, version) +
+				                           dst.relativeString() +
+				                           dst.maskString(); 
 			}
 
 			for(int i = 0; i < 4; i++)
 			{
-				if(sourceParameter[i].type != Parameter::PARAMETER_VOID)
+				if(src[i].type != PARAMETER_VOID)
 				{
-					instructionString += (destinationParameter.type != Parameter::PARAMETER_VOID || i > 0) ? ", " : " ";
-					instructionString += sourceParameter[i].preModifierString() +
-										 sourceParameter[i].string(shaderType, version) +
-										 sourceParameter[i].relativeString() + 
-										 sourceParameter[i].postModifierString() + 
-										 sourceParameter[i].swizzleString();
+					instructionString += (dst.type != PARAMETER_VOID || i > 0) ? ", " : " ";
+					instructionString += src[i].preModifierString() +
+										 src[i].string(shaderType, version) +
+										 src[i].relativeString() + 
+										 src[i].postModifierString() + 
+										 src[i].swizzleString();
 				}
 			}
 		}
@@ -204,68 +234,68 @@
 		{
 			instructionString += "dcl";
 
-			if(destinationParameter.type == Parameter::PARAMETER_SAMPLER)
+			if(dst.type == PARAMETER_SAMPLER)
 			{
-				switch(operation.samplerType)
+				switch(samplerType)
 				{
-				case Operation::SAMPLER_UNKNOWN:	instructionString += " ";			break;
-				case Operation::SAMPLER_1D:			instructionString += "_1d ";		break;
-				case Operation::SAMPLER_2D:			instructionString += "_2d ";		break;
-				case Operation::SAMPLER_CUBE:		instructionString += "_cube ";		break;
-				case Operation::SAMPLER_VOLUME:		instructionString += "_volume ";	break;
+				case SAMPLER_UNKNOWN: instructionString += " ";        break;
+				case SAMPLER_1D:      instructionString += "_1d ";     break;
+				case SAMPLER_2D:      instructionString += "_2d ";     break;
+				case SAMPLER_CUBE:    instructionString += "_cube ";   break;
+				case SAMPLER_VOLUME:  instructionString += "_volume "; break;
 				default:
 					ASSERT(false);
 				}
 
-				instructionString += destinationParameter.string(shaderType, version);
+				instructionString += dst.string(shaderType, version);
 			}
-			else if(destinationParameter.type == Parameter::PARAMETER_INPUT ||
-				    destinationParameter.type == Parameter::PARAMETER_OUTPUT ||
-				    destinationParameter.type == Parameter::PARAMETER_TEXTURE)
+			else if(dst.type == PARAMETER_INPUT ||
+				    dst.type == PARAMETER_OUTPUT ||
+				    dst.type == PARAMETER_TEXTURE)
 			{
 				if(version >= 0x0300)
 				{
-					switch(operation.usage)
+					switch(usage)
 					{
-					case Operation::USAGE_POSITION:		instructionString += "_position";		break;
-					case Operation::USAGE_BLENDWEIGHT:	instructionString += "_blendweight";	break;
-					case Operation::USAGE_BLENDINDICES:	instructionString += "_blendindices";	break;
-					case Operation::USAGE_NORMAL:		instructionString += "_normal";			break;
-					case Operation::USAGE_PSIZE:		instructionString += "_psize";			break;
-					case Operation::USAGE_TEXCOORD:		instructionString += "_texcoord";		break;
-					case Operation::USAGE_TANGENT:		instructionString += "_tangent";		break;
-					case Operation::USAGE_BINORMAL:		instructionString += "_binormal";		break;
-					case Operation::USAGE_TESSFACTOR:	instructionString += "_tessfactor";		break;
-					case Operation::USAGE_POSITIONT:	instructionString += "_positiont";		break;
-					case Operation::USAGE_COLOR:		instructionString += "_color";			break;
-					case Operation::USAGE_FOG:			instructionString += "_fog";			break;
-					case Operation::USAGE_DEPTH:		instructionString += "_depth";			break;
-					case Operation::USAGE_SAMPLE:		instructionString += "_sample";			break;
+					case USAGE_POSITION:     instructionString += "_position";     break;
+					case USAGE_BLENDWEIGHT:  instructionString += "_blendweight";  break;
+					case USAGE_BLENDINDICES: instructionString += "_blendindices"; break;
+					case USAGE_NORMAL:       instructionString += "_normal";       break;
+					case USAGE_PSIZE:        instructionString += "_psize";        break;
+					case USAGE_TEXCOORD:     instructionString += "_texcoord";     break;
+					case USAGE_TANGENT:      instructionString += "_tangent";      break;
+					case USAGE_BINORMAL:     instructionString += "_binormal";     break;
+					case USAGE_TESSFACTOR:   instructionString += "_tessfactor";   break;
+					case USAGE_POSITIONT:    instructionString += "_positiont";    break;
+					case USAGE_COLOR:        instructionString += "_color";        break;
+					case USAGE_FOG:          instructionString += "_fog";          break;
+					case USAGE_DEPTH:        instructionString += "_depth";        break;
+					case USAGE_SAMPLE:       instructionString += "_sample";       break;
 					default:
 						ASSERT(false);
 					}
 
-					if(operation.usageIndex > 0)
+					if(usageIndex > 0)
 					{
 						std::ostringstream buffer;
 
-						buffer << (int)operation.usageIndex;
+						buffer << (int)usageIndex;
 
 						instructionString += buffer.str();
 					}
 				}
-				else ASSERT(destinationParameter.type != Parameter::PARAMETER_OUTPUT);
+				else ASSERT(dst.type != PARAMETER_OUTPUT);
 
 				instructionString += " ";
 
-				instructionString += destinationParameter.string(shaderType, version);
-				instructionString += destinationParameter.maskString();
+				instructionString += dst.string(shaderType, version);
+				instructionString += dst.maskString();
 			}
-			else if(destinationParameter.type == Parameter::PARAMETER_MISCTYPE)   // vPos and vFace
+			else if(dst.type == PARAMETER_MISCTYPE)   // vPos and vFace
 			{
 				instructionString += " ";
 
-				instructionString += destinationParameter.string(shaderType, version);
+				instructionString += dst.string(shaderType, version);
 			}
 			else ASSERT(false);
 		}
@@ -273,145 +303,7 @@
 		return instructionString;
 	}
 
-	std::string Shader::Instruction::Operation::string(unsigned short version) const
-	{
-		switch(opcode)
-		{
-		case OPCODE_NOP:			return "nop";
-		case OPCODE_MOV:			return "mov";
-		case OPCODE_ADD:			return "add";
-		case OPCODE_SUB:			return "sub";
-		case OPCODE_MAD:			return "mad";
-		case OPCODE_MUL:			return "mul";
-		case OPCODE_RCP:			return "rcp";
-		case OPCODE_RSQ:			return "rsq";
-		case OPCODE_DP3:			return "dp3";
-		case OPCODE_DP4:			return "dp4";
-		case OPCODE_MIN:			return "min";
-		case OPCODE_MAX:			return "max";
-		case OPCODE_SLT:			return "slt";
-		case OPCODE_SGE:			return "sge";
-		case OPCODE_EXP:			return "exp";
-		case OPCODE_LOG:			return "log";
-		case OPCODE_LIT:			return "lit";
-		case OPCODE_DST:			return "dst";
-		case OPCODE_LRP:			return "lrp";
-		case OPCODE_FRC:			return "frc";
-		case OPCODE_M4X4:			return "m4x4";
-		case OPCODE_M4X3:			return "m4x3";
-		case OPCODE_M3X4:			return "m3x4";
-		case OPCODE_M3X3:			return "m3x3";
-		case OPCODE_M3X2:			return "m3x2";
-		case OPCODE_CALL:			return "call";
-		case OPCODE_CALLNZ:			return "callnz";
-		case OPCODE_LOOP:			return "loop";
-		case OPCODE_RET:			return "ret";
-		case OPCODE_ENDLOOP:		return "endloop";
-		case OPCODE_LABEL:			return "label";
-		case OPCODE_DCL:			return "dcl";
-		case OPCODE_POW:			return "pow";
-		case OPCODE_CRS:			return "crs";
-		case OPCODE_SGN:			return "sgn";
-		case OPCODE_ABS:			return "abs";
-		case OPCODE_NRM:			return "nrm";
-		case OPCODE_SINCOS:			return "sincos";
-		case OPCODE_REP:			return "rep";
-		case OPCODE_ENDREP:			return "endrep";
-		case OPCODE_IF:				return "if";
-		case OPCODE_IFC:			return "ifc";
-		case OPCODE_ELSE:			return "else";
-		case OPCODE_ENDIF:			return "endif";
-		case OPCODE_BREAK:			return "break";
-		case OPCODE_BREAKC:			return "breakc";
-		case OPCODE_MOVA:			return "mova";
-		case OPCODE_DEFB:			return "defb";
-		case OPCODE_DEFI:			return "defi";
-		case OPCODE_TEXCOORD:		return "texcoord";
-		case OPCODE_TEXKILL:		return "texkill";
-		case OPCODE_TEX:
-			if(version < 0x0104)	return "tex";
-			else					return "texld";
-		case OPCODE_TEXBEM:			return "texbem";
-		case OPCODE_TEXBEML:		return "texbeml";
-		case OPCODE_TEXREG2AR:		return "texreg2ar";
-		case OPCODE_TEXREG2GB:		return "texreg2gb";
-		case OPCODE_TEXM3X2PAD:		return "texm3x2pad";
-		case OPCODE_TEXM3X2TEX:		return "texm3x2tex";
-		case OPCODE_TEXM3X3PAD:		return "texm3x3pad";
-		case OPCODE_TEXM3X3TEX:		return "texm3x3tex";
-		case OPCODE_RESERVED0:		return "reserved0";
-		case OPCODE_TEXM3X3SPEC:	return "texm3x3spec";
-		case OPCODE_TEXM3X3VSPEC:	return "texm3x3vspec";
-		case OPCODE_EXPP:			return "expp";
-		case OPCODE_LOGP:			return "logp";
-		case OPCODE_CND:			return "cnd";
-		case OPCODE_DEF:			return "def";
-		case OPCODE_TEXREG2RGB:		return "texreg2rgb";
-		case OPCODE_TEXDP3TEX:		return "texdp3tex";
-		case OPCODE_TEXM3X2DEPTH:	return "texm3x2depth";
-		case OPCODE_TEXDP3:			return "texdp3";
-		case OPCODE_TEXM3X3:		return "texm3x3";
-		case OPCODE_TEXDEPTH:		return "texdepth";
-		case OPCODE_CMP:			return "cmp";
-		case OPCODE_BEM:			return "bem";
-		case OPCODE_DP2ADD:			return "dp2add";
-		case OPCODE_DSX:			return "dsx";
-		case OPCODE_DSY:			return "dsy";
-		case OPCODE_TEXLDD:			return "texldd";
-		case OPCODE_SETP:			return "setp";
-		case OPCODE_TEXLDL:			return "texldl";
-		case OPCODE_BREAKP:			return "breakp";
-		case OPCODE_PHASE:			return "phase";
-		case OPCODE_COMMENT:		return "comment";
-		case OPCODE_END:			return "end";
-		case OPCODE_PS_1_0:			return "ps_1_0";
-		case OPCODE_PS_1_1:			return "ps_1_1";
-		case OPCODE_PS_1_2:			return "ps_1_2";
-		case OPCODE_PS_1_3:			return "ps_1_3";
-		case OPCODE_PS_1_4:			return "ps_1_4";
-		case OPCODE_PS_2_0:			return "ps_2_0";
-		case OPCODE_PS_2_x:			return "ps_2_x";
-		case OPCODE_PS_3_0:			return "ps_3_0";
-		case OPCODE_VS_1_0:			return "vs_1_0";
-		case OPCODE_VS_1_1:			return "vs_1_1";
-		case OPCODE_VS_2_0:			return "vs_2_0";
-		case OPCODE_VS_2_x:			return "vs_2_x";
-		case OPCODE_VS_2_sw:		return "vs_2_sw";
-		case OPCODE_VS_3_0:			return "vs_3_0";
-		case OPCODE_VS_3_sw:		return "vs_3_sw";
-		default:
-			ASSERT(false);
-		}
-
-		return "<unknown>";
-	}
-
-	std::string Shader::Instruction::Operation::controlString() const
-	{
-		if(opcode != OPCODE_LOOP && opcode != OPCODE_BREAKC && opcode != OPCODE_IFC && opcode != OPCODE_SETP)
-		{
-			if(project) return "p";
-
-			if(bias) return "b";
-
-			// FIXME: LOD
-		}
-
-		switch(control)
-		{
-		case 1: return "_gt";
-		case 2: return "_eq";
-		case 3: return "_ge";
-		case 4: return "_lt";
-		case 5: return "_ne";
-		case 6: return "_le";
-		default:
-			return "";
-		//	ASSERT(false);   // FIXME
-		}
-	}
-
-	std::string Shader::Instruction::DestinationParameter::modifierString() const
+	std::string Shader::DestinationParameter::modifierString() const
 	{
 		if(type == PARAMETER_VOID || type == PARAMETER_LABEL)
 		{
@@ -420,6 +312,11 @@
 
 		std::string modifierString;
 
+		if(integer)
+		{
+			modifierString += "_int";
+		}
+
 		if(saturate)
 		{
 			modifierString += "_sat";
@@ -438,7 +335,7 @@
 		return modifierString;
 	}
 
-	std::string Shader::Instruction::DestinationParameter::shiftString() const
+	std::string Shader::DestinationParameter::shiftString() const
 	{
 		if(type == PARAMETER_VOID || type == PARAMETER_LABEL)
 		{
@@ -460,7 +357,7 @@
 		}
 	}
 
-	std::string Shader::Instruction::DestinationParameter::maskString() const
+	std::string Shader::DestinationParameter::maskString() const
 	{
 		if(type == PARAMETER_VOID || type == PARAMETER_LABEL)
 		{
@@ -492,7 +389,7 @@
 		return "";
 	}
 
-	std::string Shader::Instruction::SourceParameter::preModifierString() const
+	std::string Shader::SourceParameter::preModifierString() const
 	{
 		if(type == PARAMETER_VOID)
 		{
@@ -522,30 +419,48 @@
 		return "";
 	}
 
-	std::string Shader::Instruction::Parameter::relativeString() const
+	std::string Shader::Parameter::relativeString() const   // Returns the relative-addressing suffix ("[a0.x]", "[r<index>.x]", "[aL]"), or "" when there is none
 	{
-		if(!relative) return "";
-
-		if(relativeType == Parameter::PARAMETER_ADDR)
+		if(type == PARAMETER_CONST || type == PARAMETER_INPUT || type == PARAMETER_OUTPUT || type == PARAMETER_TEMP)   // Any other register type yields ""
 		{
-			switch(relativeSwizzle & 0x03)
+			if(rel.type == PARAMETER_VOID)   // No relative register
 			{
-			case 0: return "[a0.x]";
-			case 1: return "[a0.y]";
-			case 2: return "[a0.z]";
-			case 3: return "[a0.w]";
+				return "";
 			}
+			else if(rel.type == PARAMETER_ADDR)
+			{
+				switch(rel.swizzle & 0x03)   // Low two swizzle bits pick the component
+				{
+				case 0: return "[a0.x]";
+				case 1: return "[a0.y]";
+				case 2: return "[a0.z]";
+				case 3: return "[a0.w]";
+				}
+			}
+			else if(rel.type == PARAMETER_TEMP)
+			{
+				std::ostringstream buffer;
+				buffer << rel.index;   // Decimal index of the register used for addressing
+
+				switch(rel.swizzle & 0x03)   // Low two swizzle bits pick the component
+				{
+				case 0: return "[r" + buffer.str() + ".x]";
+				case 1: return "[r" + buffer.str() + ".y]";
+				case 2: return "[r" + buffer.str() + ".z]";
+				case 3: return "[r" + buffer.str() + ".w]";
+				}
+			}
+			else if(rel.type == PARAMETER_LOOP)
+			{
+				return "[aL]";
+			}
+			else ASSERT(false);   // Relative register type with no string form here
 		}
-		else if(relativeType == Parameter::PARAMETER_LOOP)
-		{
-			return "[aL]";
-		}
-		else ASSERT(false);
 
 		return "";
 	}
 
-	std::string Shader::Instruction::SourceParameter::postModifierString() const
+	std::string Shader::SourceParameter::postModifierString() const
 	{
 		if(type == PARAMETER_VOID)
 		{
@@ -575,7 +490,7 @@
 		return "";
 	}
 
-	std::string Shader::Instruction::SourceParameter::swizzleString() const
+	std::string Shader::SourceParameter::swizzleString() const   // Formats this parameter's swizzle by forwarding its type and swizzle to Instruction::swizzleString
 	{
 		return Instruction::swizzleString(type, swizzle);
 	}
@@ -584,19 +499,21 @@
 	{
 		if((token & 0xFFFF0000) == 0xFFFF0000 || (token & 0xFFFF0000) == 0xFFFE0000)   // Version token
 		{
-			operation.opcode = (Operation::Opcode)token;
-			operation.predicate = false;
-			operation.coissue = false;
+			opcode = (Opcode)token;
+
+			control = CONTROL_RESERVED0;
+			predicate = false;
+			coissue = false;
 		}
 		else
 		{
-			operation.opcode = (Operation::Opcode)(token & 0x0000FFFF);
-			operation.control = (Operation::Control)((token & 0x00FF0000) >> 16);
+			opcode = (Opcode)(token & 0x0000FFFF);
+			control = (Control)((token & 0x00FF0000) >> 16);
 
 			int size = (token & 0x0F000000) >> 24;
 
-			operation.predicate = (token & 0x10000000) != 0x00000000;
-			operation.coissue = (token & 0x40000000) != 0x00000000;
+			predicate = (token & 0x10000000) != 0x00000000;
+			coissue = (token & 0x40000000) != 0x00000000;
 
 			if(majorVersion < 2)
 			{
@@ -608,7 +525,7 @@
 
 			if(majorVersion < 2)
 			{
-				if(operation.predicate)
+				if(predicate)
 				{
 					ASSERT(false);
 				}
@@ -621,7 +538,7 @@
 
 			if(majorVersion >= 2)
 			{
-				if(operation.coissue)
+				if(coissue)
 				{
 					ASSERT(false);   // Reserved
 				}
@@ -636,43 +553,44 @@
 
 	void Shader::Instruction::parseDeclarationToken(unsigned long token)
 	{
-		operation.samplerType = (Operation::SamplerType)((token & 0x78000000) >> 27);
-		operation.usage = (Operation::Usage)(token & 0x0000001F);
-		operation.usageIndex = (unsigned char)((token & 0x000F0000) >> 16);
+		samplerType = (SamplerType)((token & 0x78000000) >> 27);   // Declaration token bits 27-30
+		usage = (Usage)(token & 0x0000001F);   // Declaration token bits 0-4
+		usageIndex = (unsigned char)((token & 0x000F0000) >> 16);   // Declaration token bits 16-19
 	}
 
 	void Shader::Instruction::parseDestinationToken(const unsigned long *token, unsigned char majorVersion)
 	{
-		destinationParameter.index = (unsigned short)(token[0] & 0x000007FF);
-		destinationParameter.type = (Parameter::Type)(((token[0] & 0x00001800) >> 8) | ((token[0] & 0x70000000) >> 28));
+		dst.index = (unsigned short)(token[0] & 0x000007FF);
+		dst.type = (ParameterType)(((token[0] & 0x00001800) >> 8) | ((token[0] & 0x70000000) >> 28));
 
 		// TODO: Check type and index range
 
-		destinationParameter.relative = (token[0] & 0x00002000) != 0x00000000;
-		destinationParameter.relativeType = Parameter::PARAMETER_ADDR;
-		destinationParameter.relativeSwizzle = 0x00;
+		bool relative = (token[0] & 0x00002000) != 0x00000000;
+		dst.rel.type = relative ? PARAMETER_ADDR : PARAMETER_VOID;
+		dst.rel.swizzle = 0x00;
+		dst.rel.scale = 1;
 
-		if(destinationParameter.relative && majorVersion >= 3)
+		if(relative && majorVersion >= 3)
 		{
-			destinationParameter.relativeType = (Parameter::Type)(((token[1] & 0x00001800) >> 8) | ((token[1] & 0x70000000) >> 28));
-			destinationParameter.relativeSwizzle = (unsigned char)((token[1] & 0x00FF0000) >> 16);
+			dst.rel.type = (ParameterType)(((token[1] & 0x00001800) >> 8) | ((token[1] & 0x70000000) >> 28));
+			dst.rel.swizzle = (unsigned char)((token[1] & 0x00FF0000) >> 16);
 		}
-		else if(destinationParameter.relative) ASSERT(false);   // Reserved
+		else if(relative) ASSERT(false);   // Reserved
 
 		if((token[0] & 0x0000C000) != 0x00000000)
 		{
 			ASSERT(false);   // Reserved
 		}
 
-		destinationParameter.mask = (unsigned char)((token[0] & 0x000F0000) >> 16);
-		destinationParameter.saturate = (token[0] & 0x00100000) != 0;
-		destinationParameter.partialPrecision = (token[0] & 0x00200000) != 0;
-		destinationParameter.centroid = (token[0] & 0x00400000) != 0;
-		destinationParameter.shift = (signed char)((token[0] & 0x0F000000) >> 20) >> 4;
+		dst.mask = (unsigned char)((token[0] & 0x000F0000) >> 16);
+		dst.saturate = (token[0] & 0x00100000) != 0;
+		dst.partialPrecision = (token[0] & 0x00200000) != 0;
+		dst.centroid = (token[0] & 0x00400000) != 0;
+		dst.shift = (signed char)((token[0] & 0x0F000000) >> 20) >> 4;
 
 		if(majorVersion >= 2)
 		{
-			if(destinationParameter.shift)
+			if(dst.shift)
 			{
 				ASSERT(false);   // Reserved
 			}
@@ -687,67 +605,71 @@
 	void Shader::Instruction::parseSourceToken(int i, const unsigned long *token, unsigned char majorVersion)
 	{
 		// Defaults
-		sourceParameter[i].value = (float&)*token;
-		sourceParameter[i].type = Parameter::PARAMETER_VOID;
-		sourceParameter[i].modifier = SourceParameter::MODIFIER_NONE;
-		sourceParameter[i].swizzle = 0xE4;
-		sourceParameter[i].relative = false;
-		sourceParameter[i].relativeType = Parameter::PARAMETER_ADDR;
-		sourceParameter[i].relativeSwizzle = 0x00;
+		src[i].index = 0;   // Reset operand i before decoding; the literal opcodes below write into src[0] instead
+		src[i].type = PARAMETER_VOID;
+		src[i].modifier = MODIFIER_NONE;
+		src[i].swizzle = 0xE4;   // Value that swizzleString() prints as an empty suffix
+		src[i].rel.type = PARAMETER_VOID;   // No relative register by default
+		src[i].rel.swizzle = 0x00;
+		src[i].rel.scale = 1;
 		
-		switch(operation.opcode)
+		switch(opcode)   // DEF/DEFB/DEFI tokens are read as literal values; every other opcode is decoded as a register token
 		{
-		case Instruction::Operation::OPCODE_DEF:
-			sourceParameter[i].type = Parameter::PARAMETER_FLOATLITERAL;
+		case OPCODE_DEF:
+			src[0].type = PARAMETER_FLOAT4LITERAL;   // All literal tokens land in src[0]; i selects the component
+			src[0].value[i] = *(float*)token;   // Token bits reinterpreted as a float
 			break;
-		case Instruction::Operation::OPCODE_DEFB:
-			sourceParameter[i].type = Parameter::PARAMETER_BOOLLITERAL;
+		case OPCODE_DEFB:
+			src[0].type = PARAMETER_BOOL1LITERAL;
+			src[0].boolean[0] = *(int*)token;   // Always element 0, independent of i
 			break;
-		case Instruction::Operation::OPCODE_DEFI:
-			sourceParameter[i].type = Parameter::PARAMETER_INTLITERAL;
+		case OPCODE_DEFI:
+			src[0].type = PARAMETER_INT4LITERAL;
+			src[0].integer[i] = *(int*)token;   // Token bits read as an int into component i
 			break;
 		default:
-			sourceParameter[i].index = (unsigned short)(token[0] & 0x000007FF);
-			sourceParameter[i].type = (Parameter::Type)(((token[0] & 0x00001800) >> 8) | ((token[0] & 0x70000000) >> 28));
+			src[i].index = (unsigned short)(token[0] & 0x000007FF);   // Register index: token bits 0-10
+			src[i].type = (ParameterType)(((token[0] & 0x00001800) >> 8) | ((token[0] & 0x70000000) >> 28));   // Type value: token bits 28-30 become bits 0-2, token bits 11-12 become bits 3-4
 
 			// FIXME: Check type and index range
 
-			sourceParameter[i].relative = (token[0] & 0x00002000) != 0x00000000;
+			bool relative = (token[0] & 0x00002000) != 0x00000000;   // Token bit 13: relative addressing flag
+			src[i].rel.type = relative ? PARAMETER_ADDR : PARAMETER_VOID;   // PARAMETER_ADDR unless replaced from token[1] below
 
 			if((token[0] & 0x0000C000) != 0x00000000)
 			{
-				if(operation.opcode != Operation::OPCODE_DEF &&
-				   operation.opcode != Operation::OPCODE_DEFI &&
-				   operation.opcode != Operation::OPCODE_DEFB)
+				if(opcode != OPCODE_DEF &&   // NOTE(review): always true in this default branch, since DEF/DEFI/DEFB are handled by the cases above
+				   opcode != OPCODE_DEFI &&
+				   opcode != OPCODE_DEFB)
 				{
 					ASSERT(false);
 				}
 			}
 
-			sourceParameter[i].swizzle = (unsigned char)((token[0] & 0x00FF0000) >> 16);
-			sourceParameter[i].modifier = (SourceParameter::Modifier)((token[0] & 0x0F000000) >> 24);
+			src[i].swizzle = (unsigned char)((token[0] & 0x00FF0000) >> 16);   // Token bits 16-23
+			src[i].modifier = (Modifier)((token[0] & 0x0F000000) >> 24);   // Token bits 24-27
 
 			if((token[0] & 0x80000000) != 0x80000000)
 			{
-				if(operation.opcode != Operation::OPCODE_DEF &&
-				   operation.opcode != Operation::OPCODE_DEFI &&
-				   operation.opcode != Operation::OPCODE_DEFB)
+				if(opcode != OPCODE_DEF &&   // NOTE(review): likewise always true in this default branch
+				   opcode != OPCODE_DEFI &&
+				   opcode != OPCODE_DEFB)
 				{
 					ASSERT(false);
 				}
 			}
 
-			if(sourceParameter[i].relative && majorVersion >= 2)
+			if(relative && majorVersion >= 2)   // For major version 2 and up, token[1] describes the relative register
 			{
-				sourceParameter[i].relativeType = (Parameter::Type)(((token[1] & 0x00001800) >> 8) | ((token[1] & 0x70000000) >> 28));
-				sourceParameter[i].relativeSwizzle = (unsigned char)((token[1] & 0x00FF0000) >> 16);
+				src[i].rel.type = (ParameterType)(((token[1] & 0x00001800) >> 8) | ((token[1] & 0x70000000) >> 28));   // Same bit layout as the register type above, taken from token[1]
+				src[i].rel.swizzle = (unsigned char)((token[1] & 0x00FF0000) >> 16);   // Bits 16-23 of token[1]
 			}
 		}
 	}
 
-	std::string Shader::Instruction::swizzleString(Parameter::Type type, unsigned char swizzle)
+	std::string Shader::Instruction::swizzleString(ParameterType type, unsigned char swizzle)
 	{
-		if(type == Parameter::PARAMETER_VOID || type == Parameter::PARAMETER_LABEL || swizzle == 0xE4)
+		if(type == PARAMETER_VOID || type == PARAMETER_LABEL || swizzle == 0xE4)
 		{
 			return "";
 		}
@@ -803,32 +725,230 @@
 		return swizzleString;
 	}
 
-	std::string Shader::Instruction::Parameter::string(ShaderType shaderType, unsigned short version) const
+	std::string Shader::Instruction::operationString(unsigned short version) const
 	{
-		std::ostringstream buffer;
-
-		if(type == PARAMETER_FLOATLITERAL)
+		switch(opcode)
 		{
-			buffer << value;
-
-			return buffer.str();
+		case OPCODE_NOP:			return "nop";
+		case OPCODE_MOV:			return "mov";
+		case OPCODE_ADD:			return "add";
+		case OPCODE_SUB:			return "sub";
+		case OPCODE_MAD:			return "mad";
+		case OPCODE_MUL:			return "mul";
+		case OPCODE_RCPX:			return "rcpx";
+		case OPCODE_DIV:			return "div";
+		case OPCODE_MOD:			return "mod";
+		case OPCODE_RSQX:			return "rsqx";
+		case OPCODE_SQRT:			return "sqrt";
+		case OPCODE_RSQ:			return "rsq";
+		case OPCODE_LEN2:			return "len2";
+		case OPCODE_LEN3:			return "len3";
+		case OPCODE_LEN4:			return "len4";
+		case OPCODE_DIST1:			return "dist1";
+		case OPCODE_DIST2:			return "dist2";
+		case OPCODE_DIST3:			return "dist3";
+		case OPCODE_DIST4:			return "dist4";
+		case OPCODE_DP3:			return "dp3";
+		case OPCODE_DP4:			return "dp4";
+		case OPCODE_MIN:			return "min";
+		case OPCODE_MAX:			return "max";
+		case OPCODE_SLT:			return "slt";
+		case OPCODE_SGE:			return "sge";
+		case OPCODE_EXP2X:			return "exp2x";
+		case OPCODE_LOG2X:			return "log2x";
+		case OPCODE_LIT:			return "lit";
+		case OPCODE_ATT:			return "att";
+		case OPCODE_LRP:			return "lrp";
+		case OPCODE_STEP:			return "step";
+		case OPCODE_SMOOTH:			return "smooth";
+		case OPCODE_FRC:			return "frc";
+		case OPCODE_M4X4:			return "m4x4";
+		case OPCODE_M4X3:			return "m4x3";
+		case OPCODE_M3X4:			return "m3x4";
+		case OPCODE_M3X3:			return "m3x3";
+		case OPCODE_M3X2:			return "m3x2";
+		case OPCODE_CALL:			return "call";
+		case OPCODE_CALLNZ:			return "callnz";
+		case OPCODE_LOOP:			return "loop";
+		case OPCODE_RET:			return "ret";
+		case OPCODE_ENDLOOP:		return "endloop";
+		case OPCODE_LABEL:			return "label";
+		case OPCODE_DCL:			return "dcl";
+		case OPCODE_POWX:			return "powx";
+		case OPCODE_CRS:			return "crs";
+		case OPCODE_SGN:			return "sgn";
+		case OPCODE_ABS:			return "abs";
+		case OPCODE_NRM2:			return "nrm2";
+		case OPCODE_NRM3:			return "nrm3";
+		case OPCODE_NRM4:			return "nrm4";
+		case OPCODE_SINCOS:			return "sincos";
+		case OPCODE_REP:			return "rep";
+		case OPCODE_ENDREP:			return "endrep";
+		case OPCODE_IF:				return "if";
+		case OPCODE_IFC:			return "ifc";
+		case OPCODE_ELSE:			return "else";
+		case OPCODE_ENDIF:			return "endif";
+		case OPCODE_BREAK:			return "break";
+		case OPCODE_BREAKC:			return "breakc";
+		case OPCODE_MOVA:			return "mova";
+		case OPCODE_DEFB:			return "defb";
+		case OPCODE_DEFI:			return "defi";
+		case OPCODE_TEXCOORD:		return "texcoord";
+		case OPCODE_TEXKILL:		return "texkill";
+		case OPCODE_DISCARD:		return "discard";
+		case OPCODE_TEX:
+			if(version < 0x0104)	return "tex";
+			else					return "texld";
+		case OPCODE_TEXBEM:			return "texbem";
+		case OPCODE_TEXBEML:		return "texbeml";
+		case OPCODE_TEXREG2AR:		return "texreg2ar";
+		case OPCODE_TEXREG2GB:		return "texreg2gb";
+		case OPCODE_TEXM3X2PAD:		return "texm3x2pad";
+		case OPCODE_TEXM3X2TEX:		return "texm3x2tex";
+		case OPCODE_TEXM3X3PAD:		return "texm3x3pad";
+		case OPCODE_TEXM3X3TEX:		return "texm3x3tex";
+		case OPCODE_RESERVED0:		return "reserved0";
+		case OPCODE_TEXM3X3SPEC:	return "texm3x3spec";
+		case OPCODE_TEXM3X3VSPEC:	return "texm3x3vspec";
+		case OPCODE_EXPP:			return "expp";
+		case OPCODE_LOGP:			return "logp";
+		case OPCODE_CND:			return "cnd";
+		case OPCODE_DEF:			return "def";
+		case OPCODE_TEXREG2RGB:		return "texreg2rgb";
+		case OPCODE_TEXDP3TEX:		return "texdp3tex";
+		case OPCODE_TEXM3X2DEPTH:	return "texm3x2depth";
+		case OPCODE_TEXDP3:			return "texdp3";
+		case OPCODE_TEXM3X3:		return "texm3x3";
+		case OPCODE_TEXDEPTH:		return "texdepth";
+		case OPCODE_CMP0:			return "cmp0";
+		case OPCODE_ICMP:			return "icmp";
+		case OPCODE_SELECT:			return "select";
+		case OPCODE_EXTRACT:		return "extract";
+		case OPCODE_INSERT:			return "insert";
+		case OPCODE_BEM:			return "bem";
+		case OPCODE_DP2ADD:			return "dp2add";
+		case OPCODE_DFDX:			return "dFdx";
+		case OPCODE_DFDY:			return "dFdy";
+		case OPCODE_FWIDTH:			return "fwidth";
+		case OPCODE_TEXLDD:			return "texldd";
+		case OPCODE_CMP:			return "cmp";
+		case OPCODE_TEXLDL:			return "texldl";
+		case OPCODE_BREAKP:			return "breakp";
+		case OPCODE_PHASE:			return "phase";
+		case OPCODE_COMMENT:		return "comment";
+		case OPCODE_END:			return "end";
+		case OPCODE_PS_1_0:			return "ps_1_0";
+		case OPCODE_PS_1_1:			return "ps_1_1";
+		case OPCODE_PS_1_2:			return "ps_1_2";
+		case OPCODE_PS_1_3:			return "ps_1_3";
+		case OPCODE_PS_1_4:			return "ps_1_4";
+		case OPCODE_PS_2_0:			return "ps_2_0";
+		case OPCODE_PS_2_x:			return "ps_2_x";
+		case OPCODE_PS_3_0:			return "ps_3_0";
+		case OPCODE_VS_1_0:			return "vs_1_0";
+		case OPCODE_VS_1_1:			return "vs_1_1";
+		case OPCODE_VS_2_0:			return "vs_2_0";
+		case OPCODE_VS_2_x:			return "vs_2_x";
+		case OPCODE_VS_2_sw:		return "vs_2_sw";
+		case OPCODE_VS_3_0:			return "vs_3_0";
+		case OPCODE_VS_3_sw:		return "vs_3_sw";
+		case OPCODE_WHILE:          return "while";
+		case OPCODE_ENDWHILE:       return "endwhile";
+		case OPCODE_COS:            return "cos";
+		case OPCODE_SIN:            return "sin";
+		case OPCODE_TAN:            return "tan";
+		case OPCODE_ACOS:           return "acos";
+		case OPCODE_ASIN:           return "asin";
+		case OPCODE_ATAN:           return "atan";
+		case OPCODE_ATAN2:          return "atan2";
+		case OPCODE_DP1:            return "dp1";
+		case OPCODE_DP2:            return "dp2";
+		case OPCODE_TRUNC:          return "trunc";
+		case OPCODE_FLOOR:          return "floor";
+		case OPCODE_CEIL:           return "ceil";
+		case OPCODE_EXP2:           return "exp2";
+		case OPCODE_LOG2:           return "log2";
+		case OPCODE_EXP:            return "exp";
+		case OPCODE_LOG:            return "log";
+		case OPCODE_POW:            return "pow";
+		case OPCODE_F2B:            return "f2b";
+		case OPCODE_B2F:            return "b2f";
+		case OPCODE_ALL:            return "all";
+		case OPCODE_ANY:            return "any";
+		case OPCODE_NOT:            return "not";
+		case OPCODE_OR:             return "or";
+		case OPCODE_XOR:            return "xor";
+		case OPCODE_AND:            return "and";
+		case OPCODE_FORWARD1:       return "forward1";
+		case OPCODE_FORWARD2:       return "forward2";
+		case OPCODE_FORWARD3:       return "forward3";
+		case OPCODE_FORWARD4:       return "forward4";
+		case OPCODE_REFLECT1:       return "reflect1";
+		case OPCODE_REFLECT2:       return "reflect2";
+		case OPCODE_REFLECT3:       return "reflect3";
+		case OPCODE_REFLECT4:       return "reflect4";
+		case OPCODE_REFRACT1:       return "refract1";
+		case OPCODE_REFRACT2:       return "refract2";
+		case OPCODE_REFRACT3:       return "refract3";
+		case OPCODE_REFRACT4:       return "refract4";
+		case OPCODE_LEAVE:          return "leave";
+		case OPCODE_CONTINUE:       return "continue";
+		case OPCODE_TEST:           return "test";
+		default:
+			ASSERT(false);
 		}
-		else
-		{
-			if(type != PARAMETER_RASTOUT && !(type == PARAMETER_ADDR && shaderType == SHADER_VERTEX) && type != PARAMETER_LOOP && type != PARAMETER_PREDICATE && type != PARAMETER_MISCTYPE)
-			{
-				buffer << index;
 
-				return typeString(shaderType, version) + buffer.str();
-			}
-			else
-			{
-				return typeString(shaderType, version);
-			}
+		return "<unknown>";
+	}
+
+	std::string Shader::Instruction::controlString() const
+	{
+		if(opcode != OPCODE_LOOP && opcode != OPCODE_BREAKC && opcode != OPCODE_IFC && opcode != OPCODE_CMP)
+		{
+			if(project) return "p";
+
+			if(bias) return "b";
+
+			// FIXME: LOD
+		}
+
+		switch(control)
+		{
+		case 1: return "_gt";
+		case 2: return "_eq";
+		case 3: return "_ge";
+		case 4: return "_lt";
+		case 5: return "_ne";
+		case 6: return "_le";
+		default:
+			return "";
+		//	ASSERT(false);   // FIXME
 		}
 	}
 
-	std::string Shader::Instruction::Parameter::typeString(ShaderType shaderType, unsigned short version) const
+	std::string Shader::Parameter::string(ShaderType shaderType, unsigned short version) const
+	{
+		std::ostringstream buffer;
+
+		if(type == PARAMETER_FLOAT4LITERAL)
+		{
+			buffer << '{' << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << '}';
+
+			return buffer.str();
+		}
+		else if(type != PARAMETER_RASTOUT && !(type == PARAMETER_ADDR && shaderType == SHADER_VERTEX) && type != PARAMETER_LOOP && type != PARAMETER_PREDICATE && type != PARAMETER_MISCTYPE)
+		{
+			buffer << index;
+			
+			return typeString(shaderType, version) + buffer.str();
+		}
+		else
+		{
+			return typeString(shaderType, version);
+		}
+	}
+
+	std::string Shader::Parameter::typeString(ShaderType shaderType, unsigned short version) const
 	{
 		switch(type)
 		{
@@ -865,9 +985,9 @@
 			else						ASSERT(false);
 		case PARAMETER_LABEL:			return "l";
 		case PARAMETER_PREDICATE:		return "p0";
-		case PARAMETER_FLOATLITERAL:	return "";
-		case PARAMETER_BOOLLITERAL:		return "";
-		case PARAMETER_INTLITERAL:		return "";
+		case PARAMETER_FLOAT4LITERAL:	return "";
+		case PARAMETER_BOOL1LITERAL:	return "";
+		case PARAMETER_INT4LITERAL:		return "";
 	//	case PARAMETER_VOID:			return "";
 		default:
 			ASSERT(false);
@@ -876,55 +996,83 @@
 		return "";
 	}
 
-	Shader::Shader(const unsigned long *shaderToken)
+	bool Shader::Instruction::isBranch() const
 	{
-		instruction = 0;
-		length = 0;
+		return opcode == OPCODE_IF || opcode == OPCODE_IFC;
+	}
+	
+	bool Shader::Instruction::isCall() const
+	{
+		return opcode == OPCODE_CALL || opcode == OPCODE_CALLNZ;
+	}
 
-		tokenCount = 0;
+	bool Shader::Instruction::isBreak() const
+	{
+		return opcode == OPCODE_BREAK || opcode == OPCODE_BREAKC || opcode == OPCODE_BREAKP;
+	}
 
-		while(shaderToken[tokenCount] != 0x0000FFFF)
-		{
-			tokenCount += sw::Shader::size(shaderToken[tokenCount], (unsigned short)(shaderToken[0] & 0xFFFF)) + 1;
-		}
+	bool Shader::Instruction::isLoop() const
+	{
+		return opcode == OPCODE_LOOP || opcode == OPCODE_REP || opcode == OPCODE_WHILE;
+	}
 
-		tokenCount += 1;
+	bool Shader::Instruction::isEndLoop() const
+	{
+		return opcode == OPCODE_ENDLOOP || opcode == OPCODE_ENDREP || opcode == OPCODE_ENDWHILE;
+	}
 
-		this->shaderToken = new unsigned long[tokenCount];
-		memcpy(this->shaderToken, shaderToken, tokenCount * sizeof(unsigned long));
-
-		unsigned long *hashTokens = new unsigned long[tokenCount];
-		memcpy(hashTokens, shaderToken, tokenCount * sizeof(unsigned long));
-		removeComments(hashTokens, tokenCount);
-		hash = FNV_1((unsigned char*)hashTokens, tokenCount * sizeof(unsigned long));
-		delete[] hashTokens;
+	Shader::Shader() : serialID(serialCounter++)
+	{
+		usedSamplers = 0;
 	}
 
 	Shader::~Shader()
 	{
-		delete[] shaderToken;
-		shaderToken = 0;
-
-		for(int i = 0; i < length; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
 			delete instruction[i];
 			instruction[i] = 0;
 		}
-
-		delete[] instruction;
-		instruction = 0;
 	}
 
-	void Shader::getFunction(void *data, unsigned int *size)
+	void Shader::parse(const unsigned long *token)   // Decodes the version token, then builds one Instruction per validated instruction
 	{
-		if(data)
+		minorVersion = (unsigned char)(token[0] & 0x000000FF);
+		majorVersion = (unsigned char)((token[0] & 0x0000FF00) >> 8);
+		shaderType = (ShaderType)((token[0] & 0xFFFF0000) >> 16);
+
+		int length = 0;   // Stays zero (nothing parsed) for an unrecognized shader type when ASSERT is compiled out
+
+		if(shaderType == SHADER_VERTEX)
 		{
-			memcpy(data, shaderToken, tokenCount * 4);
+			length = VertexShader::validate(token);
 		}
+		else if(shaderType == SHADER_PIXEL)
+		{
+			length = PixelShader::validate(token);
+		}
+		else ASSERT(false);
 
-		*size = tokenCount * 4;
+		ASSERT(length != 0);
+		instruction.resize(length);
+
+		for(int i = 0; i < length; i++)
+		{
+			while((*token & 0x0000FFFF) == 0x0000FFFE)   // Comment token
+			{
+				int commentLength = (*token & 0x7FFF0000) >> 16;   // Payload size in tokens; named so it does not shadow the instruction count
+
+				token += commentLength + 1;
+			}
+
+			int tokenCount = size(*token);
+
+			instruction[i] = new Instruction(token, tokenCount, majorVersion);
+
+			token += 1 + tokenCount;
+		}
 	}
-
+	
 	int Shader::size(unsigned long opcode) const
 	{
 		return size(opcode, version);
@@ -1056,28 +1204,28 @@
 
 		int length = 0;
 
-		if((opcode & 0x0000FFFF) == ShaderOperation::OPCODE_COMMENT)
+		if((opcode & 0x0000FFFF) == OPCODE_COMMENT)
 		{
 			return (opcode & 0x7FFF0000) >> 16;
 		}
 
-		if(opcode != ShaderOperation::OPCODE_PS_1_0 &&
-		   opcode != ShaderOperation::OPCODE_PS_1_1 &&
-		   opcode != ShaderOperation::OPCODE_PS_1_2 &&
-		   opcode != ShaderOperation::OPCODE_PS_1_3 &&
-		   opcode != ShaderOperation::OPCODE_PS_1_4 &&
-		   opcode != ShaderOperation::OPCODE_PS_2_0 &&
-		   opcode != ShaderOperation::OPCODE_PS_2_x &&
-		   opcode != ShaderOperation::OPCODE_PS_3_0 &&
-		   opcode != ShaderOperation::OPCODE_VS_1_0 &&
-		   opcode != ShaderOperation::OPCODE_VS_1_1 &&
-		   opcode != ShaderOperation::OPCODE_VS_2_0 &&
-		   opcode != ShaderOperation::OPCODE_VS_2_x &&
-		   opcode != ShaderOperation::OPCODE_VS_2_sw &&
-		   opcode != ShaderOperation::OPCODE_VS_3_0 &&
-		   opcode != ShaderOperation::OPCODE_VS_3_sw &&
-		   opcode != ShaderOperation::OPCODE_PHASE &&
-		   opcode != ShaderOperation::OPCODE_END)
+		if(opcode != OPCODE_PS_1_0 &&
+		   opcode != OPCODE_PS_1_1 &&
+		   opcode != OPCODE_PS_1_2 &&
+		   opcode != OPCODE_PS_1_3 &&
+		   opcode != OPCODE_PS_1_4 &&
+		   opcode != OPCODE_PS_2_0 &&
+		   opcode != OPCODE_PS_2_x &&
+		   opcode != OPCODE_PS_3_0 &&
+		   opcode != OPCODE_VS_1_0 &&
+		   opcode != OPCODE_VS_1_1 &&
+		   opcode != OPCODE_VS_2_0 &&
+		   opcode != OPCODE_VS_2_x &&
+		   opcode != OPCODE_VS_2_sw &&
+		   opcode != OPCODE_VS_3_0 &&
+		   opcode != OPCODE_VS_3_sw &&
+		   opcode != OPCODE_PHASE &&
+		   opcode != OPCODE_END)
 		{
 			if(version >= 0x0200)
 			{
@@ -1098,10 +1246,10 @@
 		{
 			switch(opcode & 0x0000FFFF)
 			{
-			case ShaderOperation::OPCODE_TEX:
+			case OPCODE_TEX:
 				length += 1;
 				break;
-			case ShaderOperation::OPCODE_TEXCOORD:
+			case OPCODE_TEXCOORD:
 				length += 1;
 				break;
 			default:
@@ -1142,19 +1290,34 @@
 		return dynamicBranching;
 	}
 
-	bool Shader::usesSampler(int index) const
+	bool Shader::containsBreakInstruction() const
 	{
-		return (sampler & (1 << index)) != 0;
+		return containsBreak;
 	}
 
-	int64_t Shader::getHash() const
+	bool Shader::containsContinueInstruction() const
 	{
-		return hash;
+		return containsContinue;
+	}
+
+	bool Shader::containsLeaveInstruction() const
+	{
+		return containsLeave;
+	}
+
+	bool Shader::usesSampler(int index) const
+	{
+		return (usedSamplers & (1 << index)) != 0;
+	}
+
+	int Shader::getSerialID() const
+	{
+		return serialID;
 	}
 
 	int Shader::getLength() const
 	{
-		return length;
+		return instruction.size();
 	}
 
 	Shader::ShaderType Shader::getShaderType() const
@@ -1176,9 +1339,9 @@
 		vsnprintf(fullName, 1024, fileName, vararg);
 		va_end(vararg);
 
-		std::ofstream file(fullName, std::ofstream::out | std::ofstream::app);
+		std::ofstream file(fullName, std::ofstream::out);
 
-		for(int i = 0; i < length; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
 			file << instruction[i]->string(shaderType, version) << std::endl;
 		}
@@ -1191,12 +1354,19 @@
 		file << instruction[index]->string(shaderType, version) << std::endl;
 	}
 
-	const ShaderInstruction *Shader::getInstruction(int i) const
+	void Shader::append(Instruction *instruction)
 	{
-		if(i < 0 || i >= length)
-		{
-			ASSERT(false);
-		}
+		this->instruction.push_back(instruction);
+	}
+
+	void Shader::declareSampler(int i)
+	{
+		usedSamplers |= 1 << i;
+	}
+
+	const Shader::Instruction *Shader::getInstruction(unsigned int i) const
+	{
+		ASSERT(i < instruction.size());
 
 		return instruction[i];
 	}
@@ -1207,26 +1377,26 @@
 		dirtyConstantsI = 0;
 		dirtyConstantsB = 0;
 
-		for(int i = 0; i < length; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
-			switch(instruction[i]->operation.opcode)
+			switch(instruction[i]->opcode)
 			{
-			case ShaderOperation::OPCODE_DEF:
-				if(instruction[i]->destinationParameter.index + 1 > dirtyConstantsF)
+			case OPCODE_DEF:
+				if(instruction[i]->dst.index + 1 > dirtyConstantsF)
 				{
-					dirtyConstantsF = instruction[i]->destinationParameter.index + 1;
+					dirtyConstantsF = instruction[i]->dst.index + 1;
 				}
 				break;
-			case ShaderOperation::OPCODE_DEFI:
-				if(instruction[i]->destinationParameter.index + 1 > dirtyConstantsI)
+			case OPCODE_DEFI:
+				if(instruction[i]->dst.index + 1 > dirtyConstantsI)
 				{
-					dirtyConstantsI = instruction[i]->destinationParameter.index + 1;
+					dirtyConstantsI = instruction[i]->dst.index + 1;
 				}
 				break;
-			case ShaderOperation::OPCODE_DEFB:
-				if(instruction[i]->destinationParameter.index + 1 > dirtyConstantsB)
+			case OPCODE_DEFB:
+				if(instruction[i]->dst.index + 1 > dirtyConstantsB)
 				{
-					dirtyConstantsB = instruction[i]->destinationParameter.index + 1;
+					dirtyConstantsB = instruction[i]->dst.index + 1;
 				}
 				break;
 			}
@@ -1236,61 +1406,205 @@
 	void Shader::analyzeDynamicBranching()
 	{
 		dynamicBranching = false;
+		containsLeave = false;
+		containsBreak = false;
+		containsContinue = false;
 
-		for(int i = 0; i < length; i++)
+		// Determine global presence of branching instructions
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
-			switch(instruction[i]->getOpcode())
+			switch(instruction[i]->opcode)
 			{
-			case ShaderOperation::OPCODE_CALLNZ:
-			case ShaderOperation::OPCODE_IF:
-			case ShaderOperation::OPCODE_IFC:
-			case ShaderOperation::OPCODE_BREAK:
-			case ShaderOperation::OPCODE_BREAKC:
-			case ShaderOperation::OPCODE_SETP:
-			case ShaderOperation::OPCODE_BREAKP:
-				if(instruction[i]->sourceParameter[0].type != ShaderParameter::PARAMETER_CONSTBOOL)
+			case OPCODE_CALLNZ:
+			case OPCODE_IF:
+			case OPCODE_IFC:
+			case OPCODE_BREAK:
+			case OPCODE_BREAKC:
+			case OPCODE_CMP:
+			case OPCODE_BREAKP:
+			case OPCODE_LEAVE:
+			case OPCODE_CONTINUE:
+				if(instruction[i]->src[0].type != PARAMETER_CONSTBOOL)
 				{
 					dynamicBranching = true;
+				}
+
+				if(instruction[i]->opcode == OPCODE_LEAVE)
+				{
+					containsLeave = true;
+				}
+				
+				if(instruction[i]->isBreak())
+				{
+					containsBreak = true;
+				}
+
+				if(instruction[i]->opcode == OPCODE_CONTINUE)
+				{
+					containsContinue = true;
+				}
+			}
+		}
+
+		// Conservatively determine which instructions are affected by dynamic branching
+		int branchDepth = 0;
+		int breakDepth = 0;
+		int continueDepth = 0;
+		bool leaveReturn = false;
+
+		for(unsigned int i = 0; i < instruction.size(); i++)
+		{
+			// If statements
+			if(instruction[i]->isBranch())
+			{
+				branchDepth++;
+			}
+			else if(instruction[i]->opcode == OPCODE_ENDIF)
+			{
+				branchDepth--;
+			}
+
+			if(branchDepth > 0)
+			{
+				instruction[i]->analysisBranch = true;
+
+				if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_BRANCH);
+				}
+			}
+
+			// Break statement
+			if(instruction[i]->isBreak())
+			{
+				breakDepth++;
+			}
+			else if(breakDepth > 0 && instruction[i]->isEndLoop())   // Never go negative: a loop without a break must not cancel a later break
+			{
+				breakDepth--;
+			}
+
+			if(breakDepth > 0)
+			{
+				if(instruction[i]->isLoop())   // Nested loop, don't make the end of it disable the break execution mask
+				{
+					breakDepth++;
+				}
+
+				instruction[i]->analysisBreak = true;
+
+				if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_BREAK);   // Callee runs under the break mask
+				}
+			}
+
+			// Continue statement
+			if(instruction[i]->opcode == OPCODE_CONTINUE)
+			{
+				continueDepth++;
+			}
+			else if(continueDepth > 0 && instruction[i]->isEndLoop())   // Never go negative: a loop without a continue must not cancel a later continue
+			{
+				continueDepth--;
+			}
+
+			if(continueDepth > 0)
+			{
+				if(instruction[i]->isLoop())   // Nested loop, don't make the end of it disable the continue execution mask
+				{
+					continueDepth++;
+				}
+
+				instruction[i]->analysisContinue = true;
+
+				if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_CONTINUE);
+				}
+			}
+
+			// Return (leave) statement
+			if(instruction[i]->opcode == OPCODE_LEAVE)
+			{
+				leaveReturn = true;
+			}
+			else if(instruction[i]->opcode == OPCODE_RET)   // End of the function
+			{
+				leaveReturn = false;
+			}
+
+			if(leaveReturn)
+			{
+				instruction[i]->analysisLeave = true;
+
+				if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, ANALYSIS_LEAVE);
+				}
+			}
+		}
+	}
+
+	void Shader::markFunctionAnalysis(int functionLabel, Analysis flag)
+	{
+		bool marker = false;
+		for(unsigned int i = 0; i < instruction.size(); i++)
+		{
+			if(!marker)
+			{
+				if(instruction[i]->opcode == OPCODE_LABEL && instruction[i]->dst.label == functionLabel)
+				{
+					marker = true;
+				}
+			}
+			else
+			{
+				if(instruction[i]->opcode == OPCODE_RET)
+				{
 					break;
 				}
+				else if(instruction[i]->isCall())
+				{
+					markFunctionAnalysis(instruction[i]->dst.label, flag);
+				}
+
+				instruction[i]->analysis |= flag;
 			}
 		}
 	}
 
 	void Shader::analyzeSamplers()
 	{
-		sampler = 0;
-
-		for(int i = 0; i < length; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
-			switch(instruction[i]->getOpcode())
+			switch(instruction[i]->opcode)
 			{
-			case ShaderOperation::OPCODE_TEX:
-			case ShaderOperation::OPCODE_TEXBEM:
-			case ShaderOperation::OPCODE_TEXBEML:
-			case ShaderOperation::OPCODE_TEXREG2AR:
-			case ShaderOperation::OPCODE_TEXREG2GB:
-			case ShaderOperation::OPCODE_TEXM3X2TEX:
-			case ShaderOperation::OPCODE_TEXM3X3TEX:
-			case ShaderOperation::OPCODE_TEXM3X3SPEC:
-			case ShaderOperation::OPCODE_TEXM3X3VSPEC:
-			case ShaderOperation::OPCODE_TEXREG2RGB:
-			case ShaderOperation::OPCODE_TEXDP3TEX:
-			case ShaderOperation::OPCODE_TEXM3X2DEPTH:
-			case ShaderOperation::OPCODE_TEXLDD:
-			case ShaderOperation::OPCODE_TEXLDL:
+			case OPCODE_TEX:
+			case OPCODE_TEXBEM:
+			case OPCODE_TEXBEML:
+			case OPCODE_TEXREG2AR:
+			case OPCODE_TEXREG2GB:
+			case OPCODE_TEXM3X2TEX:
+			case OPCODE_TEXM3X3TEX:
+			case OPCODE_TEXM3X3SPEC:
+			case OPCODE_TEXM3X3VSPEC:
+			case OPCODE_TEXREG2RGB:
+			case OPCODE_TEXDP3TEX:
+			case OPCODE_TEXM3X2DEPTH:
+			case OPCODE_TEXLDD:
+			case OPCODE_TEXLDL:
 				{
-					ShaderParameter &dst = instruction[i]->destinationParameter;
-					ShaderParameter &src1 = instruction[i]->sourceParameter[1];
+					Parameter &dst = instruction[i]->dst;
+					Parameter &src1 = instruction[i]->src[1];
 
 					if(majorVersion >= 2)
 					{
-						ASSERT(src1.type == ShaderParameter::PARAMETER_SAMPLER);
-						sampler |= 1 << src1.index;
+						usedSamplers |= 1 << src1.index;
 					}
 					else
 					{
-						sampler |= 1 << dst.index;
+						usedSamplers |= 1 << dst.index;
 					}
 				}
 				break;
@@ -1298,21 +1612,57 @@
 		}
 	}
 
-	void Shader::removeComments(unsigned long *shaderToken, int tokenCount)
+	// Assigns a unique index to each call instruction, on a per label basis.
+	// This is used to know what basic block to return to.
+	void Shader::analyzeCallSites()
 	{
-		for(int i = 0; i < tokenCount; )
-		{
-			int instructionSize = sw::Shader::size(shaderToken[i], (unsigned short)(shaderToken[0] & 0xFFFF)) + 1;
+		int callSiteIndex[2048] = {0};
 
-			if((shaderToken[i] & 0x0000FFFF) == ShaderOperation::OPCODE_COMMENT)
+		for(unsigned int i = 0; i < instruction.size(); i++)
+		{
+			if(instruction[i]->opcode == OPCODE_CALL || instruction[i]->opcode == OPCODE_CALLNZ)
 			{
-				for(int j = 0; j < instructionSize; j++)
+				int label = instruction[i]->dst.label;
+
+				instruction[i]->dst.callSite = callSiteIndex[label]++;
+			}
+		}
+	}
+
+	void Shader::analyzeDynamicIndexing()
+	{
+		dynamicallyIndexedTemporaries = false;
+		dynamicallyIndexedInput = false;
+		dynamicallyIndexedOutput = false;
+
+		for(unsigned int i = 0; i < instruction.size(); i++)
+		{
+			if(instruction[i]->dst.rel.type == PARAMETER_ADDR ||
+			   instruction[i]->dst.rel.type == PARAMETER_LOOP ||
+			   instruction[i]->dst.rel.type == PARAMETER_TEMP)
+			{
+				switch(instruction[i]->dst.type)
 				{
-					shaderToken[i + j] = ShaderOperation::OPCODE_NOP;
+				case PARAMETER_TEMP:   dynamicallyIndexedTemporaries = true; break;
+				case PARAMETER_INPUT:  dynamicallyIndexedInput = true;       break;
+				case PARAMETER_OUTPUT: dynamicallyIndexedOutput = true;      break;
 				}
 			}
 
-			i += instructionSize;
+			for(int j = 0; j < 3; j++)
+			{
+				if(instruction[i]->src[j].rel.type == PARAMETER_ADDR ||
+				   instruction[i]->src[j].rel.type == PARAMETER_LOOP ||
+				   instruction[i]->src[j].rel.type == PARAMETER_TEMP)
+				{
+					switch(instruction[i]->src[j].type)
+					{
+					case PARAMETER_TEMP:   dynamicallyIndexedTemporaries = true; break;
+					case PARAMETER_INPUT:  dynamicallyIndexedInput = true;       break;
+					case PARAMETER_OUTPUT: dynamicallyIndexedOutput = true;      break;
+					}
+				}
+			}
 		}
 	}
 }
diff --git a/src/Shader/Shader.hpp b/src/Shader/Shader.hpp
index d2d6fe3..b08b56e 100644
--- a/src/Shader/Shader.hpp
+++ b/src/Shader/Shader.hpp
@@ -1,459 +1,572 @@
-// SwiftShader Software Renderer
-//
-// Copyright(c) 2005-2011 TransGaming Inc.
-//
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
-//
-
-#ifndef sw_Shader_hpp
-#define sw_Shader_hpp
-
-#include "Common/Types.hpp"
-
-#include <string>
-
-namespace sw
-{
-	class Shader
-	{
-	public:
-		enum ShaderType
-		{
-			SHADER_PIXEL = 0xFFFF,
-			SHADER_VERTEX = 0xFFFE,
-			SHADER_GEOMETRY = 0xFFFD
-		};
-
-		class Instruction
-		{
-			friend Shader;
-
-		public:
-			Instruction();
-			Instruction(const unsigned long *token, int size, unsigned char majorVersion);
-
-			virtual ~Instruction();
-
-			struct Operation
-			{
-				enum Opcode
-				{
-					// Extracted from d3d9types.h
-					OPCODE_NOP = 0,
-					OPCODE_MOV,
-					OPCODE_ADD,
-					OPCODE_SUB,
-					OPCODE_MAD,
-					OPCODE_MUL,
-					OPCODE_RCP,
-					OPCODE_RSQ,
-					OPCODE_DP3,
-					OPCODE_DP4,
-					OPCODE_MIN,
-					OPCODE_MAX,
-					OPCODE_SLT,
-					OPCODE_SGE,
-					OPCODE_EXP,
-					OPCODE_LOG,
-					OPCODE_LIT,
-					OPCODE_DST,
-					OPCODE_LRP,
-					OPCODE_FRC,
-					OPCODE_M4X4,
-					OPCODE_M4X3,
-					OPCODE_M3X4,
-					OPCODE_M3X3,
-					OPCODE_M3X2,
-					OPCODE_CALL,
-					OPCODE_CALLNZ,
-					OPCODE_LOOP,
-					OPCODE_RET,
-					OPCODE_ENDLOOP,
-					OPCODE_LABEL,
-					OPCODE_DCL,
-					OPCODE_POW,
-					OPCODE_CRS,
-					OPCODE_SGN,
-					OPCODE_ABS,
-					OPCODE_NRM,
-					OPCODE_SINCOS,
-					OPCODE_REP,
-					OPCODE_ENDREP,
-					OPCODE_IF,
-					OPCODE_IFC,
-					OPCODE_ELSE,
-					OPCODE_ENDIF,
-					OPCODE_BREAK,
-					OPCODE_BREAKC,
-					OPCODE_MOVA,
-					OPCODE_DEFB,
-					OPCODE_DEFI,
-
-					OPCODE_TEXCOORD = 64,
-					OPCODE_TEXKILL,
-					OPCODE_TEX,
-					OPCODE_TEXBEM,
-					OPCODE_TEXBEML,
-					OPCODE_TEXREG2AR,
-					OPCODE_TEXREG2GB,
-					OPCODE_TEXM3X2PAD,
-					OPCODE_TEXM3X2TEX,
-					OPCODE_TEXM3X3PAD,
-					OPCODE_TEXM3X3TEX,
-					OPCODE_RESERVED0,
-					OPCODE_TEXM3X3SPEC,
-					OPCODE_TEXM3X3VSPEC,
-					OPCODE_EXPP,
-					OPCODE_LOGP,
-					OPCODE_CND,
-					OPCODE_DEF,
-					OPCODE_TEXREG2RGB,
-					OPCODE_TEXDP3TEX,
-					OPCODE_TEXM3X2DEPTH,
-					OPCODE_TEXDP3,
-					OPCODE_TEXM3X3,
-					OPCODE_TEXDEPTH,
-					OPCODE_CMP,
-					OPCODE_BEM,
-					OPCODE_DP2ADD,
-					OPCODE_DSX,
-					OPCODE_DSY,
-					OPCODE_TEXLDD,
-					OPCODE_SETP,
-					OPCODE_TEXLDL,
-					OPCODE_BREAKP,
-
-					OPCODE_PHASE = 0xFFFD,
-					OPCODE_COMMENT = 0xFFFE,
-					OPCODE_END = 0xFFFF,
-
-					OPCODE_PS_1_0 = 0xFFFF0100,
-					OPCODE_PS_1_1 = 0xFFFF0101,
-					OPCODE_PS_1_2 = 0xFFFF0102,
-					OPCODE_PS_1_3 = 0xFFFF0103,
-					OPCODE_PS_1_4 = 0xFFFF0104,
-					OPCODE_PS_2_0 = 0xFFFF0200,
-					OPCODE_PS_2_x = 0xFFFF0201,
-					OPCODE_PS_3_0 = 0xFFFF0300,
-					
-					OPCODE_VS_1_0 = 0xFFFE0100,
-					OPCODE_VS_1_1 = 0xFFFE0101,
-					OPCODE_VS_2_0 = 0xFFFE0200,
-					OPCODE_VS_2_x = 0xFFFE0201,
-					OPCODE_VS_2_sw = 0xFFFE02FF,
-					OPCODE_VS_3_0 = 0xFFFE0300,
-					OPCODE_VS_3_sw = 0xFFFE03FF,
-				};
-
-				enum Control
-				{
-					CONTROL_RESERVED0,
-					CONTROL_GT,
-					CONTROL_EQ,
-					CONTROL_GE,
-					CONTROL_LT,
-					CONTROL_NE,
-					CONTROL_LE,
-					CONTROL_RESERVED1
-				};
-
-				enum SamplerType
-				{
-					SAMPLER_UNKNOWN,
-					SAMPLER_1D,
-					SAMPLER_2D,
-					SAMPLER_CUBE,
-					SAMPLER_VOLUME
-				};
-
-				enum Usage   // For vertex input/output declarations
-				{
-					USAGE_POSITION = 0,
-					USAGE_BLENDWEIGHT = 1,
-					USAGE_BLENDINDICES = 2,
-					USAGE_NORMAL = 3,
-					USAGE_PSIZE = 4,
-					USAGE_TEXCOORD = 5,
-					USAGE_TANGENT = 6,
-					USAGE_BINORMAL = 7,
-					USAGE_TESSFACTOR = 8,
-					USAGE_POSITIONT = 9,
-					USAGE_COLOR = 10,
-					USAGE_FOG = 11,
-					USAGE_DEPTH = 12,
-					USAGE_SAMPLE = 13
-				};
-
-				Operation() : opcode(OPCODE_NOP), control(CONTROL_RESERVED0), predicate(false), predicateNot(false), predicateSwizzle(0xE4), coissue(false), samplerType(SAMPLER_UNKNOWN), usage(USAGE_POSITION), usageIndex(0)
-				{
-				}
-
-				std::string string(unsigned short version) const;
-				std::string controlString() const;
-
-				Opcode opcode;
-				
-				union
-				{
-					Control control;
-					
-					struct
-					{
-						unsigned char project : 1;
-						unsigned char bias : 1;
-					};
-				};
-
-				bool predicate;
-				bool predicateNot;   // Negative predicate
-				unsigned char predicateSwizzle;
-
-				bool coissue;
-				SamplerType samplerType;
-				Usage usage;
-				unsigned char usageIndex;
-			};
-
-			struct Parameter
-			{
-				enum Type
-				{
-					PARAMETER_TEMP = 0,
-					PARAMETER_INPUT = 1,
-					PARAMETER_CONST = 2,
-					PARAMETER_TEXTURE = 3,
-					PARAMETER_ADDR = 3,
-					PARAMETER_RASTOUT = 4,
-					PARAMETER_ATTROUT = 5,
-					PARAMETER_TEXCRDOUT = 6,
-					PARAMETER_OUTPUT = 6,
-					PARAMETER_CONSTINT = 7,
-					PARAMETER_COLOROUT = 8,
-					PARAMETER_DEPTHOUT = 9,
-					PARAMETER_SAMPLER = 10,
-					PARAMETER_CONST2 = 11,
-					PARAMETER_CONST3 = 12,
-					PARAMETER_CONST4 = 13,
-					PARAMETER_CONSTBOOL = 14,
-					PARAMETER_LOOP = 15,
-					PARAMETER_TEMPFLOAT16 = 16,
-					PARAMETER_MISCTYPE = 17,
-					PARAMETER_LABEL = 18,
-					PARAMETER_PREDICATE = 19,
-
-					// Internally used
-					PARAMETER_FLOATLITERAL = 20,
-					PARAMETER_BOOLLITERAL = 21,
-					PARAMETER_INTLITERAL = 22,
-
-					PARAMETER_VOID
-				};
-
-				union
-				{
-					unsigned int index;   // For registers
-					float value;          // For float constants
-					int integer;          // For integer constants
-					bool boolean;         // For boolean constants
-				};
-
-				Parameter() : type(PARAMETER_VOID), index(0), relative(false), relativeType(PARAMETER_VOID), relativeSwizzle(0xE4)
-				{
-				}
-
-				std::string string(ShaderType shaderType, unsigned short version) const;
-				std::string typeString(ShaderType shaderType, unsigned short version) const;
-				std::string relativeString() const;
-
-				Type type;
-				bool relative;
-				Type relativeType;
-				unsigned char relativeSwizzle;
-			};
-
-			struct DestinationParameter : Parameter
-			{
-				union
-				{
-					unsigned char mask;
-
-					struct
-					{
-						bool x : 1;
-						bool y : 1;
-						bool z : 1;
-						bool w : 1;
-					};
-				};
-
-				DestinationParameter() : mask(0xF), saturate(false), partialPrecision(false), centroid(false), shift(0)
-				{
-				}
-
-				std::string modifierString() const;
-				std::string shiftString() const;
-				std::string maskString() const;
-
-				bool saturate;
-				bool partialPrecision;
-				bool centroid;
-				signed char shift;
-			};
-
-			struct SourceParameter : Parameter
-			{
-				enum Modifier
-				{
-					MODIFIER_NONE,
-					MODIFIER_NEGATE,
-					MODIFIER_BIAS,
-					MODIFIER_BIAS_NEGATE,
-					MODIFIER_SIGN,
-					MODIFIER_SIGN_NEGATE,
-					MODIFIER_COMPLEMENT,
-					MODIFIER_X2,
-					MODIFIER_X2_NEGATE,
-					MODIFIER_DZ,
-					MODIFIER_DW,
-					MODIFIER_ABS,
-					MODIFIER_ABS_NEGATE,
-					MODIFIER_NOT
-				};
-
-				SourceParameter() : swizzle(0xE4), modifier(MODIFIER_NONE)
-				{
-				}
-
-				std::string swizzleString() const;
-				std::string preModifierString() const;
-				std::string postModifierString() const;
-
-				unsigned char swizzle;
-				Modifier modifier;
-			};
-
-			void parseOperationToken(unsigned long token, unsigned char majorVersion);
-			void parseDeclarationToken(unsigned long token);
-			void parseDestinationToken(const unsigned long *token, unsigned char majorVersion);
-			void parseSourceToken(int i, const unsigned long *token, unsigned char majorVersion);
-
-			Operation::Opcode getOpcode() const;
-			const DestinationParameter &getDestinationParameter() const;
-			const SourceParameter &getSourceParameter(int i) const;
-
-			bool isCoissue() const;
-			bool isProject() const;
-			bool isBias() const;
-			bool isPredicate() const;
-			bool isPredicateNot() const;
-			unsigned char getPredicateSwizzle() const;
-			Operation::Control getControl() const;
-			Operation::Usage getUsage() const;
-			unsigned char getUsageIndex() const;
-			Operation::SamplerType getSamplerType() const;
-
-			std::string string(ShaderType shaderType, unsigned short version) const;
-
-		protected:
-			Operation operation;
-			DestinationParameter destinationParameter;
-			SourceParameter sourceParameter[4];
-
-		private:
-			static std::string swizzleString(Parameter::Type type, unsigned char swizzle);
-		};
-
-		Shader(const unsigned long *shaderToken);
-
-		~Shader();
-
-		void getFunction(void *data, unsigned int *size);
-
-		int64_t getHash() const;
-		int getLength() const;
-		ShaderType getShaderType() const;
-		unsigned short getVersion() const;
-
-		const Instruction *getInstruction(int i) const;
-		int size(unsigned long opcode) const;
-		static int size(unsigned long opcode, unsigned short version);
-
-		void print(const char *fileName, ...) const;
-		void printInstruction(int index, const char *fileName) const;
-
-		static bool maskContainsComponent(int mask, int component);
-		static bool swizzleContainsComponent(int swizzle, int component);
-		static bool swizzleContainsComponentMasked(int swizzle, int component, int mask);
-
-		bool containsDynamicBranching() const;
-		bool usesSampler(int i) const;
-
-		struct Semantic
-		{
-			Semantic(unsigned char usage = 0xFF, unsigned char index = 0xFF) : usage(usage), index(index), centroid(false)
-			{
-			}
-
-			bool operator==(const Semantic &semantic) const
-			{
-				return usage == semantic.usage && index == semantic.index;
-			}
-
-			bool active() const
-			{
-				return usage != 0xFF;
-			}
-
-			unsigned char usage;
-			unsigned char index;
-			bool centroid;
-		};
-
-		unsigned int dirtyConstantsF;   // FIXME: Private
-		unsigned int dirtyConstantsI;   // FIXME: Private
-		unsigned int dirtyConstantsB;   // FIXME: Private
-
-	protected:
-		void analyzeDirtyConstants();
-		void analyzeDynamicBranching();
-		void analyzeSamplers();
-
-		ShaderType shaderType;
-
-		union
-		{
-			unsigned short version;
-
-			struct
-			{
-				unsigned char minorVersion;
-				unsigned char majorVersion;
-			};
-		};
-
-		int length;
-		Instruction **instruction;
-
-	private:
-		static void removeComments(unsigned long *shaderToken, int tokenCount);
-
-		int64_t hash;
-
-		bool dynamicBranching;
-		unsigned short sampler;
-
-		unsigned long *shaderToken;
-		int tokenCount;
-	};
-
-	typedef Shader::Instruction::Operation ShaderOperation;
-	typedef Shader::Instruction ShaderInstruction;
-	typedef Shader::Instruction::Parameter ShaderParameter;
-	typedef Shader::Instruction::Operation::Opcode ShaderOpcode;
-}
-
-#endif   // sw_Shader_hpp
+// SwiftShader Software Renderer

+//

+// Copyright(c) 2005-2012 TransGaming Inc.

+//

+// All rights reserved. No part of this software may be copied, distributed, transmitted,

+// transcribed, stored in a retrieval system, translated into any human or computer

+// language by any means, or disclosed to third parties without the explicit written

+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express

+// or implied, including but not limited to any patent rights, are granted to you.

+//

+

+#ifndef sw_Shader_hpp

+#define sw_Shader_hpp

+

+#include "Common/Types.hpp"

+

+#include <string>

+#include <vector>

+

+namespace sw

+{

+	class Shader

+	{

+	public:

+		enum ShaderType

+		{

+			SHADER_PIXEL = 0xFFFF,

+			SHADER_VERTEX = 0xFFFE,

+			SHADER_GEOMETRY = 0xFFFD

+		};

+

+		enum Opcode

+		{

+			// Matches order in d3d9types.h

+			OPCODE_NOP = 0,

+			OPCODE_MOV,

+			OPCODE_ADD,

+			OPCODE_SUB,

+			OPCODE_MAD,

+			OPCODE_MUL,

+			OPCODE_RCPX,

+			OPCODE_RSQX,

+			OPCODE_DP3,

+			OPCODE_DP4,

+			OPCODE_MIN,

+			OPCODE_MAX,

+			OPCODE_SLT,

+			OPCODE_SGE,

+			OPCODE_EXP2X,   // D3DSIO_EXP

+			OPCODE_LOG2X,   // D3DSIO_LOG

+			OPCODE_LIT,

+			OPCODE_ATT,   // D3DSIO_DST

+			OPCODE_LRP,

+			OPCODE_FRC,

+			OPCODE_M4X4,

+			OPCODE_M4X3,

+			OPCODE_M3X4,

+			OPCODE_M3X3,

+			OPCODE_M3X2,

+			OPCODE_CALL,

+			OPCODE_CALLNZ,

+			OPCODE_LOOP,

+			OPCODE_RET,

+			OPCODE_ENDLOOP,

+			OPCODE_LABEL,

+			OPCODE_DCL,

+			OPCODE_POWX,

+			OPCODE_CRS,

+			OPCODE_SGN,

+			OPCODE_ABS,

+			OPCODE_NRM3,   // D3DSIO_NRM

+			OPCODE_SINCOS,

+			OPCODE_REP,

+			OPCODE_ENDREP,

+			OPCODE_IF,

+			OPCODE_IFC,

+			OPCODE_ELSE,

+			OPCODE_ENDIF,

+			OPCODE_BREAK,

+			OPCODE_BREAKC,

+			OPCODE_MOVA,

+			OPCODE_DEFB,

+			OPCODE_DEFI,

+

+			OPCODE_TEXCOORD = 64,

+			OPCODE_TEXKILL,

+			OPCODE_TEX,

+			OPCODE_TEXBEM,

+			OPCODE_TEXBEML,

+			OPCODE_TEXREG2AR,

+			OPCODE_TEXREG2GB,

+			OPCODE_TEXM3X2PAD,

+			OPCODE_TEXM3X2TEX,

+			OPCODE_TEXM3X3PAD,

+			OPCODE_TEXM3X3TEX,

+			OPCODE_RESERVED0,

+			OPCODE_TEXM3X3SPEC,

+			OPCODE_TEXM3X3VSPEC,

+			OPCODE_EXPP,

+			OPCODE_LOGP,

+			OPCODE_CND,

+			OPCODE_DEF,

+			OPCODE_TEXREG2RGB,

+			OPCODE_TEXDP3TEX,

+			OPCODE_TEXM3X2DEPTH,

+			OPCODE_TEXDP3,

+			OPCODE_TEXM3X3,

+			OPCODE_TEXDEPTH,

+			OPCODE_CMP0,   // D3DSIO_CMP

+			OPCODE_BEM,

+			OPCODE_DP2ADD,

+			OPCODE_DFDX,   // D3DSIO_DSX

+			OPCODE_DFDY,   // D3DSIO_DSY

+			OPCODE_TEXLDD,

+			OPCODE_CMP,   // D3DSIO_SETP

+			OPCODE_TEXLDL,

+			OPCODE_BREAKP,

+

+			OPCODE_PHASE = 0xFFFD,

+			OPCODE_COMMENT = 0xFFFE,

+			OPCODE_END = 0xFFFF,

+

+			OPCODE_PS_1_0 = 0xFFFF0100,

+			OPCODE_PS_1_1 = 0xFFFF0101,

+			OPCODE_PS_1_2 = 0xFFFF0102,

+			OPCODE_PS_1_3 = 0xFFFF0103,

+			OPCODE_PS_1_4 = 0xFFFF0104,

+			OPCODE_PS_2_0 = 0xFFFF0200,

+			OPCODE_PS_2_x = 0xFFFF0201,

+			OPCODE_PS_3_0 = 0xFFFF0300,

+					

+			OPCODE_VS_1_0 = 0xFFFE0100,

+			OPCODE_VS_1_1 = 0xFFFE0101,

+			OPCODE_VS_2_0 = 0xFFFE0200,

+			OPCODE_VS_2_x = 0xFFFE0201,

+			OPCODE_VS_2_sw = 0xFFFE02FF,

+			OPCODE_VS_3_0 = 0xFFFE0300,

+			OPCODE_VS_3_sw = 0xFFFE03FF,

+

+			OPCODE_WHILE = 0x80000001,

+			OPCODE_ENDWHILE,

+			OPCODE_COS,

+			OPCODE_SIN,

+			OPCODE_TAN,

+			OPCODE_ACOS,

+			OPCODE_ASIN,

+			OPCODE_ATAN,

+			OPCODE_ATAN2,

+			OPCODE_DP1,

+			OPCODE_DP2,

+			OPCODE_TRUNC,

+			OPCODE_FLOOR,

+			OPCODE_CEIL,

+			OPCODE_SQRT,

+			OPCODE_RSQ,

+			OPCODE_LEN2,

+			OPCODE_LEN3,

+			OPCODE_LEN4,

+			OPCODE_DIST1,

+			OPCODE_DIST2,

+			OPCODE_DIST3,

+			OPCODE_DIST4,

+			OPCODE_NRM2,

+			OPCODE_NRM4,

+			OPCODE_DIV,

+			OPCODE_MOD,

+			OPCODE_EXP2,

+			OPCODE_LOG2,

+			OPCODE_EXP,

+			OPCODE_LOG,

+			OPCODE_POW,

+			OPCODE_F2B,   // Float to bool

+			OPCODE_B2F,   // Bool to float

+			OPCODE_ALL,

+			OPCODE_ANY,

+			OPCODE_NOT,

+			OPCODE_OR,

+			OPCODE_XOR,

+			OPCODE_AND,

+			OPCODE_STEP,

+			OPCODE_SMOOTH,

+			OPCODE_FORWARD1,

+			OPCODE_FORWARD2,

+			OPCODE_FORWARD3,

+			OPCODE_FORWARD4,

+			OPCODE_REFLECT1,

+			OPCODE_REFLECT2,

+			OPCODE_REFLECT3,

+			OPCODE_REFLECT4,

+			OPCODE_REFRACT1,

+			OPCODE_REFRACT2,

+			OPCODE_REFRACT3,

+			OPCODE_REFRACT4,

+			OPCODE_ICMP,

+			OPCODE_SELECT,

+			OPCODE_EXTRACT,

+			OPCODE_INSERT,

+			OPCODE_DISCARD,

+			OPCODE_FWIDTH,

+			OPCODE_LEAVE,   // Return before the end of the function

+			OPCODE_CONTINUE,

+			OPCODE_TEST,   // Marks the end of the code that can be skipped by 'continue'

+		};

+

+		static Opcode OPCODE_DP(int);

+		static Opcode OPCODE_LEN(int);

+		static Opcode OPCODE_DIST(int);

+		static Opcode OPCODE_NRM(int);

+		static Opcode OPCODE_FORWARD(int);

+		static Opcode OPCODE_REFLECT(int);

+		static Opcode OPCODE_REFRACT(int);

+

+		enum Control

+		{

+			CONTROL_RESERVED0,

+			CONTROL_GT,

+			CONTROL_EQ,

+			CONTROL_GE,

+			CONTROL_LT,

+			CONTROL_NE,

+			CONTROL_LE,

+			CONTROL_RESERVED1

+		};

+

+		enum SamplerType

+		{

+			SAMPLER_UNKNOWN,

+			SAMPLER_1D,

+			SAMPLER_2D,

+			SAMPLER_CUBE,

+			SAMPLER_VOLUME

+		};

+

+		enum Usage   // For vertex input/output declarations

+		{

+			USAGE_POSITION = 0,

+			USAGE_BLENDWEIGHT = 1,

+			USAGE_BLENDINDICES = 2,

+			USAGE_NORMAL = 3,

+			USAGE_PSIZE = 4,

+			USAGE_TEXCOORD = 5,

+			USAGE_TANGENT = 6,

+			USAGE_BINORMAL = 7,

+			USAGE_TESSFACTOR = 8,

+			USAGE_POSITIONT = 9,

+			USAGE_COLOR = 10,

+			USAGE_FOG = 11,

+			USAGE_DEPTH = 12,

+			USAGE_SAMPLE = 13

+		};

+

+		enum ParameterType

+		{

+			PARAMETER_TEMP = 0,

+			PARAMETER_INPUT = 1,

+			PARAMETER_CONST = 2,

+			PARAMETER_TEXTURE = 3,

+			PARAMETER_ADDR = 3,

+			PARAMETER_RASTOUT = 4,

+			PARAMETER_ATTROUT = 5,

+			PARAMETER_TEXCRDOUT = 6,

+			PARAMETER_OUTPUT = 6,

+			PARAMETER_CONSTINT = 7,

+			PARAMETER_COLOROUT = 8,

+			PARAMETER_DEPTHOUT = 9,

+			PARAMETER_SAMPLER = 10,

+			PARAMETER_CONST2 = 11,

+			PARAMETER_CONST3 = 12,

+			PARAMETER_CONST4 = 13,

+			PARAMETER_CONSTBOOL = 14,

+			PARAMETER_LOOP = 15,

+			PARAMETER_TEMPFLOAT16 = 16,

+			PARAMETER_MISCTYPE = 17,

+			PARAMETER_LABEL = 18,

+			PARAMETER_PREDICATE = 19,

+

+		//	PARAMETER_FLOAT1LITERAL,

+		//	PARAMETER_FLOAT2LITERAL,

+		//	PARAMETER_FLOAT3LITERAL,

+			PARAMETER_FLOAT4LITERAL,

+			PARAMETER_BOOL1LITERAL,

+		//	PARAMETER_BOOL2LITERAL,

+		//	PARAMETER_BOOL3LITERAL,

+		//	PARAMETER_BOOL4LITERAL,

+		//	PARAMETER_INT1LITERAL,

+		//	PARAMETER_INT2LITERAL,

+		//	PARAMETER_INT3LITERAL,

+			PARAMETER_INT4LITERAL,

+

+			PARAMETER_VOID

+		};

+

+		enum Modifier

+		{

+			MODIFIER_NONE,

+			MODIFIER_NEGATE,

+			MODIFIER_BIAS,

+			MODIFIER_BIAS_NEGATE,

+			MODIFIER_SIGN,

+			MODIFIER_SIGN_NEGATE,

+			MODIFIER_COMPLEMENT,

+			MODIFIER_X2,

+			MODIFIER_X2_NEGATE,

+			MODIFIER_DZ,

+			MODIFIER_DW,

+			MODIFIER_ABS,

+			MODIFIER_ABS_NEGATE,

+			MODIFIER_NOT

+		};

+

+		enum Analysis

+		{

+			// Flags indicating whether an instruction is affected by an execution enable mask

+			ANALYSIS_BRANCH   = 0x00000001,

+			ANALYSIS_BREAK    = 0x00000002,

+			ANALYSIS_CONTINUE = 0x00000004,

+			ANALYSIS_LEAVE    = 0x00000008,

+		};

+

+		struct RelativeAddress   // Relative-addressing descriptor; stored as Parameter::rel

+		{

+			RelativeAddress() : type(PARAMETER_VOID), index(0), swizzle(0), scale(1), deterministic(false)   // Defaults: no register, index 0, swizzle 0, scale 1, not deterministic

+			{

+			}

+

+			ParameterType type : 8;

+			unsigned int index;

+			unsigned int swizzle : 8;

+			unsigned int scale;

+			bool deterministic;   // Equal across shader instances run in lockstep (e.g. unrollable loop counters)

+		};

+

+		struct Parameter

+		{

+			union

+			{

+				struct

+				{

+					unsigned int index;   // For registers types

+

+					RelativeAddress rel;

+				};

+

+				float value[4];       // For float constants

+				int integer[4];       // For integer constants

+				int boolean[4];       // For boolean constants

+

+				struct

+				{

+					unsigned int label;      // Label index

+					unsigned int callSite;   // Call index (per label)

+				};

+			};

+

+			Parameter() : type(PARAMETER_VOID), index(0)

+			{

+			}

+

+			std::string string(ShaderType shaderType, unsigned short version) const;

+			std::string typeString(ShaderType shaderType, unsigned short version) const;

+			std::string relativeString() const;

+

+			ParameterType type : 8;

+		};

+

+		struct DestinationParameter : Parameter

+		{

+			union

+			{

+				unsigned char mask;

+

+				struct

+				{

+					bool x : 1;

+					bool y : 1;

+					bool z : 1;

+					bool w : 1;

+				};

+			};

+

+			DestinationParameter() : mask(0xF), integer(false), saturate(false), partialPrecision(false), centroid(false), shift(0)

+			{

+			}

+

+			std::string modifierString() const;

+			std::string shiftString() const;

+			std::string maskString() const;

+

+			bool integer          : 1;

+			bool saturate         : 1;

+			bool partialPrecision : 1;

+			bool centroid         : 1;

+			signed char shift     : 4;

+		};

+

+		struct SourceParameter : Parameter

+		{

+			SourceParameter() : swizzle(0xE4), modifier(MODIFIER_NONE)

+			{

+			}

+

+			std::string swizzleString() const;

+			std::string preModifierString() const;

+			std::string postModifierString() const;

+

+			unsigned int swizzle : 8;

+			Modifier modifier : 8;

+		};

+

+		struct Instruction

+		{

+			explicit Instruction(Opcode opcode);

+			Instruction(const unsigned long *token, int size, unsigned char majorVersion);

+

+			virtual ~Instruction();

+

+			void parseOperationToken(unsigned long token, unsigned char majorVersion);

+			void parseDeclarationToken(unsigned long token);

+			void parseDestinationToken(const unsigned long *token, unsigned char majorVersion);

+			void parseSourceToken(int i, const unsigned long *token, unsigned char majorVersion);

+

+			std::string string(ShaderType shaderType, unsigned short version) const;

+			static std::string swizzleString(ParameterType type, unsigned char swizzle);

+			std::string operationString(unsigned short version) const;

+			std::string controlString() const;

+

+			bool isBranch() const;

+			bool isCall() const;

+			bool isBreak() const;

+			bool isLoop() const;

+			bool isEndLoop() const;

+

+			Opcode opcode;

+			

+			union

+			{

+				Control control;

+				

+				struct

+				{

+					unsigned char project : 1;

+					unsigned char bias : 1;

+				};

+			};

+

+			bool predicate;

+			bool predicateNot;   // Negative predicate

+			unsigned char predicateSwizzle;

+

+			bool coissue;

+			SamplerType samplerType;

+			Usage usage;

+			unsigned char usageIndex;

+

+			DestinationParameter dst;

+			SourceParameter src[4];

+

+			union

+			{

+				unsigned int analysis;

+

+				struct

+				{

+					// Keep in sync with Shader::Analysis flags

+					unsigned int analysisBranch : 1;

+					unsigned int analysisBreak : 1;

+					unsigned int analysisContinue : 1;

+					unsigned int analysisLeave : 1;

+				};

+			};

+		};

+

+		Shader();

+

+		virtual ~Shader();   // Virtual: Shader declares the pure virtual analyze(), so deletion through a Shader* must run the derived destructor

+

+		int getSerialID() const;

+		int getLength() const;

+		ShaderType getShaderType() const;

+		unsigned short getVersion() const;

+

+		void append(Instruction *instruction);

+		void declareSampler(int i);

+

+		const Instruction *getInstruction(unsigned int i) const;

+		int size(unsigned long opcode) const;

+		static int size(unsigned long opcode, unsigned short version);

+

+		void print(const char *fileName, ...) const;

+		void printInstruction(int index, const char *fileName) const;

+

+		static bool maskContainsComponent(int mask, int component);

+		static bool swizzleContainsComponent(int swizzle, int component);

+		static bool swizzleContainsComponentMasked(int swizzle, int component, int mask);

+

+		bool containsDynamicBranching() const;

+		bool containsBreakInstruction() const;

+		bool containsContinueInstruction() const;

+		bool containsLeaveInstruction() const;

+		bool usesSampler(int i) const;

+

+		struct Semantic

+		{

+			Semantic(unsigned char usage = 0xFF, unsigned char index = 0xFF) : usage(usage), index(index), centroid(false)

+			{

+			}

+

+			bool operator==(const Semantic &semantic) const

+			{

+				return usage == semantic.usage && index == semantic.index;

+			}

+

+			bool active() const

+			{

+				return usage != 0xFF;

+			}

+

+			unsigned char usage;

+			unsigned char index;

+			bool centroid;

+		};

+

+		virtual void analyze() = 0;

+

+		// FIXME: Private

+		unsigned int dirtyConstantsF;

+		unsigned int dirtyConstantsI;

+		unsigned int dirtyConstantsB;

+

+		bool dynamicallyIndexedTemporaries;

+		bool dynamicallyIndexedInput;

+		bool dynamicallyIndexedOutput;

+

+	protected:

+		void parse(const unsigned long *token);

+

+		void analyzeDirtyConstants();

+		void analyzeDynamicBranching();

+		void analyzeSamplers();

+		void analyzeCallSites();

+		void analyzeDynamicIndexing();

+		void markFunctionAnalysis(int functionLabel, Analysis flag);

+

+		ShaderType shaderType;

+

+		union

+		{

+			unsigned short version;

+

+			struct

+			{

+				unsigned char minorVersion;

+				unsigned char majorVersion;

+			};

+		};

+

+		std::vector<Instruction*> instruction;

+

+		unsigned short usedSamplers;   // Bit flags

+

+	private:

+		const int serialID;

+		static volatile int serialCounter;

+

+		bool dynamicBranching;

+		bool containsBreak;

+		bool containsContinue;

+		bool containsLeave;

+	};

+}

+

+#endif   // sw_Shader_hpp

diff --git a/src/Shader/ShaderCore.cpp b/src/Shader/ShaderCore.cpp
index 26facac..1fc2b2a 100644
--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -11,11 +11,459 @@
 
 #include "ShaderCore.hpp"
 
-#include "Debug.hpp"
+#include "Renderer/Renderer.hpp"
+#include "Common/Debug.hpp"
 
 namespace sw
 {
-	void ShaderCore::mov(Color4f &dst, Color4f &src, bool floorToInteger)
+	extern TranscendentalPrecision logPrecision;
+	extern TranscendentalPrecision expPrecision;
+	extern TranscendentalPrecision rcpPrecision;
+	extern TranscendentalPrecision rsqPrecision;
+
+	Vector4i::Vector4i()   // Default constructor: performs no explicit initialization of x, y, z, w
+	{
+	}
+
+	Vector4i::Vector4i(unsigned short x, unsigned short y, unsigned short z, unsigned short w)   // Builds each component from its scalar argument via Short4(value)
+	{
+		this->x = Short4(x);   // 'this->' selects the member; the parameter of the same name hides it
+		this->y = Short4(y);
+		this->z = Short4(z);
+		this->w = Short4(w);
+	}
+
+	Vector4i::Vector4i(const Vector4i &rhs)   // Copy constructor: assigns all four components from rhs
+	{
+		x = rhs.x;
+		y = rhs.y;
+		z = rhs.z;
+		w = rhs.w;
+	}
+
+	Vector4i &Vector4i::operator=(const Vector4i &rhs)   // Component-wise assignment from rhs
+	{
+		x = rhs.x;
+		y = rhs.y;
+		z = rhs.z;
+		w = rhs.w;
+
+		return *this;   // Returning the object itself allows chained assignment
+	}
+
+	Short4 &Vector4i::operator[](int i)   // Component access by index: 0 -> x, 1 -> y, 2 -> z, 3 -> w
+	{
+		if(i == 1) return y;
+		if(i == 2) return z;
+		if(i == 3) return w;
+
+		// Index 0 selects x. Every other value (negative or
+		// greater than 3) also yields x, so the function
+		// always returns a valid reference.
+
+		return x;
+	}
+
+	Vector4f::Vector4f()   // Default constructor: performs no explicit initialization of x, y, z, w
+	{
+	}
+
+	Vector4f::Vector4f(float x, float y, float z, float w)   // Builds each component from its scalar argument via Float4(value)
+	{
+		this->x = Float4(x);   // 'this->' selects the member; the parameter of the same name hides it
+		this->y = Float4(y);
+		this->z = Float4(z);
+		this->w = Float4(w);
+	}
+
+	Vector4f::Vector4f(const Vector4f &rhs)   // Copy constructor: assigns all four components from rhs
+	{
+		x = rhs.x;
+		y = rhs.y;
+		z = rhs.z;
+		w = rhs.w;
+	}
+
+	Vector4f &Vector4f::operator=(const Vector4f &rhs)   // Component-wise assignment from rhs
+	{
+		x = rhs.x;
+		y = rhs.y;
+		z = rhs.z;
+		w = rhs.w;
+
+		return *this;   // Returning the object itself allows chained assignment
+	}
+
+	Float4 &Vector4f::operator[](int i)   // Component access by index: 0 -> x, 1 -> y, 2 -> z, 3 -> w
+	{
+		if(i == 1) return y;
+		if(i == 2) return z;
+		if(i == 3) return w;
+
+		// Index 0 selects x. Every other value (negative or
+		// greater than 3) also yields x, so the function
+		// always returns a valid reference.
+
+		return x;
+	}
+
+	Float4 exponential2(RValue<Float4> x, bool pp)   // Approximates 2^x per lane; 'pp' is not referenced by this implementation
+	{
+		Float4 x0;
+		Float4 x1;
+		Int4 x2;
+	
+		x0 = x;
+
+		x0 = Min(x0, As<Float4>(Int4(0x43010000)));   // 129.00000e+0f
+		x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));   // -126.99999e+0f
+		x1 = x0;
+		x1 -= Float4(0.5f);
+		x2 = RoundInt(x1);   // Integer part, obtained by rounding x - 0.5
+		x1 = Float4(x2);   // Same integer part, as a float
+		x2 += Int4(0x0000007F);   // 127
+		x2 = x2 << 23;   // Biased exponent moved into the exponent field: bit pattern of 2^(integer part)
+		x0 -= x1;   // Remainder left after removing the integer part
+		x1 = As<Float4>(Int4(0x3AF61905));   // 1.8775767e-3f
+		x1 *= x0;   // Polynomial in the remainder, evaluated in Horner form
+		x1 += As<Float4>(Int4(0x3C134806));   // 8.9893397e-3f
+		x1 *= x0;
+		x1 += As<Float4>(Int4(0x3D64AA23));   // 5.5826318e-2f
+		x1 *= x0;
+		x1 += As<Float4>(Int4(0x3E75EAD4));   // 2.4015361e-1f
+		x1 *= x0;
+		x1 += As<Float4>(Int4(0x3F31727B));   // 6.9315308e-1f
+		x1 *= x0;
+		x1 += As<Float4>(Int4(0x3F7FFFFF));   // 9.9999994e-1f
+		x1 *= As<Float4>(x2);   // Scale the polynomial result by 2^(integer part)
+			
+		return x1;
+	}
+
+	Float4 logarithm2(RValue<Float4> x, bool absolute, bool pp)   // Approximates log2(x) per lane; 'absolute' and 'pp' are not referenced by this implementation
+	{
+		Float4 x0;
+		Float4 x1;
+		Float4 x2;
+		Float4 x3;
+		
+		x0 = x;
+		
+		x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));   // Keep only the exponent bits (sign and mantissa cleared)
+		x1 = As<Float4>(As<UInt4>(x1) >> 8);   // Move the biased exponent into the top mantissa bits
+		x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));   // Bits of 1.0f OR'ed in: value is 1 + biasedExponent / 256
+		x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);   // FIXME: (x1 - 1.4960938f) * 256.0f;   (1.4960938 = 1 + 127/256, so this yields the unbiased exponent)
+		x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));   // Mantissa bits with the exponent of 1.0f: a value in [1, 2)
+
+		x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);   // Quadratic numerator
+		x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);   // Cubic denominator
+		x2 /= x3;
+
+		x1 += (x0 - Float4(1.0f)) * x2;   // Exponent plus the mantissa's contribution
+				
+		return x1;
+	}
+
+	Float4 exponential(RValue<Float4> x, bool pp)   // e^x computed as exponential2(x / ln(2)); pp is forwarded
+	{
+		// FIXME: Propagate the constant
+		return exponential2(Float4(1.44269541f) * x, pp);   // 1/ln(2)
+	}
+
+	Float4 logarithm(RValue<Float4> x, bool absolute, bool pp)   // Natural logarithm computed as ln(2) * logarithm2(x); absolute and pp are forwarded
+	{
+		// FIXME: Propagate the constant
+		return Float4(6.93147181e-1f) * logarithm2(x, absolute, pp);   // ln(2)
+	}
+
+	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)   // x^y computed as exponential2(y * logarithm2(x))
+	{
+		Float4 log = logarithm2(x, true, pp);   // Passes absolute = true
+		log *= y;
+		return exponential2(log, pp);
+	}
+
+	Float4 reciprocal(RValue<Float4> x, bool pp, bool finite)   // 1/x per lane; pp requests the Rcp_pp estimate, finite clamps the result to the largest finite float
+	{
+		Float4 rcp;
+
+		if(!pp && rcpPrecision >= WHQL)
+		{
+			rcp = Float4(1.0f) / x;   // Plain division when full precision is configured
+		}
+		else
+		{
+			rcp = Rcp_pp(x);   // Partial-precision estimate
+
+			if(!pp)
+			{
+				rcp = (rcp + rcp) - (x * rcp * rcp);   // One Newton-Raphson refinement: r' = 2r - x * r^2
+			}
+		}
+
+		if(finite)
+		{
+			// Clamp to the largest finite float; As<> bit-casts 0x7F7FFFFF without reading an int through a float&
+			rcp = Min(rcp, As<Float4>(Int4(0x7F7FFFFF)));
+		}
+
+		return rcp;
+	}
+
+	Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)   // 1/sqrt(x) per lane, always clamped to the largest finite float
+	{
+		Float4 abs = x;
+
+		if(absolute)
+		{
+			abs = Abs(abs);   // Operate on |x| when requested
+		}
+
+		Float4 rsq;
+
+		if(!pp && rsqPrecision >= IEEE)
+		{
+			rsq = Float4(1.0f) / Sqrt(abs);   // Division by the square root when IEEE precision is configured
+		}
+		else
+		{
+			rsq = RcpSqrt_pp(abs);   // Partial-precision estimate
+
+			if(!pp)
+			{
+				rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);   // One Newton-Raphson refinement: r' = r * (3 - a * r^2) / 2
+			}
+		}
+
+		// Clamp to the largest finite float; As<> bit-casts 0x7F7FFFFF without reading an int through a float&
+		rsq = Min(rsq, As<Float4>(Int4(0x7F7FFFFF)));
+
+		return rsq;
+	}
+
+	Float4 modulo(RValue<Float4> x, RValue<Float4> y)   // Floored modulo per lane: x - y * floor(x / y)
+	{
+		return x - y * Floor(x / y);
+	}
+
+	Float4 sine_pi(RValue<Float4> x, bool pp)   // Sine approximation without range reduction (presumably for x in [-pi, pi] - TODO confirm); 'pp' is not referenced
+	{
+		const Float4 A = Float4(-4.05284734e-1f);   // -4/pi^2
+		const Float4 B = Float4(1.27323954e+0f);    // 4/pi
+		const Float4 C = Float4(7.75160950e-1f);
+		const Float4 D = Float4(2.24839049e-1f);
+
+		// Parabola approximating sine
+		Float4 sin = x * (Abs(x) * A + B);
+
+		// Improve precision from 0.06 to 0.001
+		if(true)   // The refinement step is always applied
+		{
+			sin = sin * (Abs(sin) * D + C);
+		}
+
+		return sin;
+	}
+
+	Float4 cosine_pi(RValue<Float4> x, bool pp)   // Cosine via sine_pi(x + pi/2), with a single wrap-around by 2*pi
+	{
+		// cos(x) = sin(x + pi/2)
+		Float4 y = x + Float4(1.57079632e+0f);
+		
+		// Wrap around
+		y -= As<Float4>(CmpNLT(y, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));   // Subtract 2*pi only in lanes where y is not less than pi
+
+		return sine_pi(y, pp);
+	}
+
+	Float4 sine(RValue<Float4> x, bool pp)   // Sine approximation with range reduction; 'pp' is not referenced by this implementation
+	{
+		// Reduce to [-0.5, 0.5] range
+		Float4 y = x * Float4(1.59154943e-1f);   // 1/2pi
+		y = y - Round(y);   // Remove whole turns, keeping the signed fraction of a turn
+
+		const Float4 A = Float4(-16.0f);
+		const Float4 B = Float4(8.0f);
+		const Float4 C = Float4(7.75160950e-1f);
+		const Float4 D = Float4(2.24839049e-1f);
+
+		// Parabola approximating sine
+		Float4 sin = y * (Abs(y) * A + B);
+
+		// Improve precision from 0.06 to 0.001
+		if(true)   // The refinement step is always applied
+		{
+			sin = sin * (Abs(sin) * D + C);
+		}
+
+		return sin;
+	}
+
+	Float4 cosine(RValue<Float4> x, bool pp)   // Cosine via sine(x + pi/2); pp is forwarded
+	{
+		// cos(x) = sin(x + pi/2)
+		Float4 y = x + Float4(1.57079632e+0f);
+		return sine(y, pp);
+	}
+
+	Float4 tangent(RValue<Float4> x, bool pp)   // tan(x) as sine(x) / cosine(x); no guard where cosine is zero
+	{
+		return sine(x, pp) / cosine(x, pp);
+	}
+
+	Float4 arccos(RValue<Float4> x, bool pp)   // Arc cosine derived from arcsin: acos(x) = pi/2 - asin(x)
+	{
+		// pi/2 - arcsin(x)
+		return Float4(1.57079632e+0f) - arcsin(x, pp);   // Forward pp explicitly instead of relying on a default argument
+	}
+
+	Float4 arcsin(RValue<Float4> x, bool pp)   // Arc sine approximation; 'pp' is not referenced by this implementation
+	{
+		// x*(pi/2-sqrt(1-x*x)*pi/5)
+		return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));   // 6.28318531e-1 = pi/5
+	}
+
+	Float4 arctan(RValue<Float4> x, bool pp)   // Arc tangent approximation; 'pp' is not referenced by this implementation
+	{
+		Int4 O = CmpNLT(Abs(x), Float4(1.0f));   // Lanes where |x| is not less than 1
+		Float4 y = As<Float4>(O & As<Int4>(Float4(1.0f) / x) | ~O & As<Int4>(x));   // FIXME: Vector select
+
+		// Approximation of atan in [-1..1]
+		Float4 theta = y * (Float4(-0.27f) * Abs(y) + Float4(1.05539816f));
+		
+		// +/-pi/2 depending on sign of x
+		Float4 sgnPi_2 = As<Float4>(As<Int4>(Float4(1.57079632e+0f)) ^ (As<Int4>(x) & Int4(0x80000000)));
+
+		theta = As<Float4>(O & As<Int4>(sgnPi_2 - theta) | ~O & As<Int4>(theta));   // FIXME: Vector select
+
+		return theta;
+	}
+
+	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)   // Two-argument arc tangent built from successive rotations; 'pp' is not referenced
+	{
+		// Rotate to upper semicircle when in lower semicircle
+		Int4 S = CmpLT(y, Float4(0.0f));
+		Float4 theta = As<Float4>(S & As<Int4>(Float4(-3.14159265e+0f)));   // -pi
+		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));   // x with its sign flipped where y's sign bit is set
+		Float4 y0 = Abs(y);
+
+		// Rotate to right quadrant when in left quadrant
+		Int4 Q = CmpLT(x0, Float4(0.0f));
+		theta += As<Float4>(Q & As<Int4>(Float4(1.57079632e+0f)));   // pi/2
+		Float4 x1 = As<Float4>(Q & As<Int4>(y0) | ~Q & As<Int4>(x0));    // FIXME: Vector select
+		Float4 y1 = As<Float4>(Q & As<Int4>(-x0) | ~Q & As<Int4>(y0));   // FIXME: Vector select
+
+		// Rotate to first octant when in second octant
+		Int4 O = CmpNLT(y1, x1);
+		theta += As<Float4>(O & As<Int4>(Float4(7.85398163e-1f)));   // pi/4
+		Float4 x2 = As<Float4>(O & As<Int4>(Float4(7.07106781e-1f) * x1 + Float4(7.07106781e-1f) * y1) | ~O & As<Int4>(x1));   // sqrt(2)/2   // FIXME: Vector select
+		Float4 y2 = As<Float4>(O & As<Int4>(Float4(7.07106781e-1f) * y1 - Float4(7.07106781e-1f) * x1) | ~O & As<Int4>(y1));   // FIXME: Vector select
+
+		// Approximation of atan in [0..1]
+		Float4 y_x = y2 / x2;
+		theta += y_x * (Float4(-0.27f) * y_x + Float4(1.05539816f));
+
+		return theta;
+	}
+
+	Float4 dot2(Vector4f &v0, Vector4f &v1)   // Dot product over the x and y components
+	{
+		return v0.x * v1.x + v0.y * v1.y;
+	}
+
+	Float4 dot3(Vector4f &v0, Vector4f &v1)   // Dot product over the x, y and z components
+	{
+		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
+	}
+
+	Float4 dot4(Vector4f &v0, Vector4f &v1)   // Dot product over all four components
+	{
+		return v0.x * v1.x + v0.y * v1.y + v0.z * v1.z + v0.w * v1.w;
+	}
+
+	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)   // In-place rearrangement of four Short4 rows using two unpack stages
+	{
+		Int2 tmp0 = UnpackHigh(row0, row1);   // First stage: interleave rows pairwise
+		Int2 tmp1 = UnpackHigh(row2, row3);
+		Int2 tmp2 = UnpackLow(row0, row1);
+		Int2 tmp3 = UnpackLow(row2, row3);
+
+		row0 = As<Short4>(UnpackLow(tmp2, tmp3));   // Second stage: interleave the Int2 pairs, then view them as Short4 again
+		row1 = As<Short4>(UnpackHigh(tmp2, tmp3));
+		row2 = As<Short4>(UnpackLow(tmp0, tmp1));
+		row3 = As<Short4>(UnpackHigh(tmp0, tmp1));
+	}
+
+	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)   // In-place 4x4 transpose of four Float4 rows
+	{
+		Float4 tmp0 = UnpackLow(row0, row1);   // Interleave rows pairwise into temporaries so the inputs stay intact
+		Float4 tmp1 = UnpackLow(row2, row3);
+		Float4 tmp2 = UnpackHigh(row0, row1);
+		Float4 tmp3 = UnpackHigh(row2, row3);
+
+		row0 = Float4(tmp0.xy, tmp1.xy);   // Each output row joins two-component halves of two temporaries
+		row1 = Float4(tmp0.zw, tmp1.zw);
+		row2 = Float4(tmp2.xy, tmp3.xy);
+		row3 = Float4(tmp2.zw, tmp3.zw);
+	}
+
+	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)   // Same steps as the Float4 transpose4x4, but only row0..row2 are written; row3 is left unchanged
+	{
+		Float4 tmp0 = UnpackLow(row0, row1);
+		Float4 tmp1 = UnpackLow(row2, row3);
+		Float4 tmp2 = UnpackHigh(row0, row1);
+		Float4 tmp3 = UnpackHigh(row2, row3);
+
+		row0 = Float4(tmp0.xy, tmp1.xy);
+		row1 = Float4(tmp0.zw, tmp1.zw);
+		row2 = Float4(tmp2.xy, tmp3.xy);
+	}
+
+	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)   // Writes only row0 and row1; row2 and row3 are read but left unchanged
+	{
+		Float4 tmp0 = UnpackLow(row0, row1);
+		Float4 tmp1 = UnpackLow(row2, row3);
+
+		row0 = Float4(tmp0.xy, tmp1.xy);
+		row1 = Float4(tmp0.zw, tmp1.zw);
+	}
+
+	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)   // Writes only row0; row1..row3 are read but left unchanged
+	{
+		Float4 tmp0 = UnpackLow(row0, row1);
+		Float4 tmp1 = UnpackLow(row2, row3);
+
+		row0 = Float4(tmp0.xy, tmp1.xy);
+	}
+
+	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)   // Spreads input rows row0/row1 over row0..row3; row1 and row3 keep their own .zw halves
+	{
+		row2 = UnpackHigh(row0, row1);   // Must run first: reads row0 and row1 before either is overwritten
+		row0 = UnpackLow(row0, row1);
+		row1 = Float4(row0.zw, row1.zw);   // Upper half of the low interleave, followed by row1's original .zw
+		row3 = Float4(row2.zw, row3.zw);   // Upper half of the high interleave, followed by row3's original .zw
+	}
+
+	void transpose2x4h(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)   // Spreads input rows row2/row3 over row0..row3; row1 and row3 keep their own .zw halves
+	{
+		row0 = UnpackLow(row2, row3);
+		row1 = Float4(row0.zw, row1.zw);   // Upper half of the low interleave, followed by row1's original .zw
+		row2 = UnpackHigh(row2, row3);   // row2 and row3 still hold their input values here
+		row3 = Float4(row2.zw, row3.zw);   // Upper half of the high interleave, followed by row3's original .zw
+	}
+
+	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N)   // Forwards to transpose4x1..transpose4x4 according to N
+	{
+		// Exactly one helper runs for N in 1..4.
+		// Any other N matches no branch, so the rows are left untouched.
+		if(N == 1)      transpose4x1(row0, row1, row2, row3);
+		else if(N == 2) transpose4x2(row0, row1, row2, row3);
+		else if(N == 3) transpose4x3(row0, row1, row2, row3);
+		else if(N == 4) transpose4x4(row0, row1, row2, row3);
+
+	}
+
+	void ShaderCore::mov(Vector4f &dst, Vector4f &src, bool floorToInteger)
 	{
 		if(floorToInteger)
 		{
@@ -27,7 +475,23 @@
 		}
 	}
 
-	void ShaderCore::add(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::f2b(Vector4f &dst, Vector4f &src)   // Float to bool: stores the CmpNEQ(src, 0) result of each component, reinterpreted as Float4
+	{
+		dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));   // As<> reinterprets the comparison result; no numeric conversion
+		dst.y = As<Float4>(CmpNEQ(src.y, Float4(0.0f)));
+		dst.z = As<Float4>(CmpNEQ(src.z, Float4(0.0f)));
+		dst.w = As<Float4>(CmpNEQ(src.w, Float4(0.0f)));
+	}
+
+	void ShaderCore::b2f(Vector4f &dst, Vector4f &src)   // Bool to float: ANDs each component's bits with the bit pattern of 1.0f
+	{
+		dst.x = As<Float4>(As<Int4>(src.x) & As<Int4>(Float4(1.0f)));   // All-ones input bits give 1.0f, all-zero bits give 0.0f
+		dst.y = As<Float4>(As<Int4>(src.y) & As<Int4>(Float4(1.0f)));
+		dst.z = As<Float4>(As<Int4>(src.z) & As<Int4>(Float4(1.0f)));
+		dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
+	}
+
+	void ShaderCore::add(Vector4f &dst, Vector4f &src0, Vector4f &src1)
 	{
 		dst.x = src0.x + src1.x;
 		dst.y = src0.y + src1.y;
@@ -35,7 +499,7 @@
 		dst.w = src0.w + src1.w;
 	}
 
-	void ShaderCore::sub(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::sub(Vector4f &dst, Vector4f &src0, Vector4f &src1)
 	{
 		dst.x = src0.x - src1.x;
 		dst.y = src0.y - src1.y;
@@ -43,7 +507,7 @@
 		dst.w = src0.w - src1.w;
 	}
 
-	void ShaderCore::mad(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2)
+	void ShaderCore::mad(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2)
 	{
 		dst.x = src0.x * src1.x + src2.x;
 		dst.y = src0.y * src1.y + src2.y;
@@ -51,7 +515,7 @@
 		dst.w = src0.w * src1.w + src2.w;
 	}
 
-	void ShaderCore::mul(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::mul(Vector4f &dst, Vector4f &src0, Vector4f &src1)
 	{
 		dst.x = src0.x * src1.x;
 		dst.y = src0.y * src1.y;
@@ -59,7 +523,7 @@
 		dst.w = src0.w * src1.w;
 	}
 
-	void ShaderCore::rcp(Color4f &dst, Color4f &src, bool pp)
+	void ShaderCore::rcpx(Vector4f &dst, Vector4f &src, bool pp)
 	{
 		Float4 rcp = reciprocal(src.x, pp, true);
 
@@ -69,17 +533,126 @@
 		dst.w = rcp;
 	}
 
-	void ShaderCore::rsq(Color4f &dst, Color4f &src, bool pp)
+	void ShaderCore::div(Vector4f &dst, Vector4f &src0, Vector4f &src1)   // Component-wise quotient src0 / src1
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = src0[i] / src1[i];
+		}
+	}
+
+	void ShaderCore::mod(Vector4f &dst, Vector4f &src0, Vector4f &src1)   // Component-wise modulo(src0, src1)
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = modulo(src0[i], src1[i]);
+		}
+	}
+
+	void ShaderCore::rsqx(Vector4f &dst, Vector4f &src, bool pp)   // Reciprocal square root of src.x only, replicated to all of dst
 	{
 		Float4 rsq = reciprocalSquareRoot(src.x, true, pp);
 
-		dst.r = rsq;
-		dst.g = rsq;
-		dst.b = rsq;
-		dst.a = rsq;
+		for(int i = 0; i < 4; i++)   // Broadcast the scalar result
+		{
+			dst[i] = rsq;
+		}
 	}
 
-	void ShaderCore::dp3(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::sqrt(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise Sqrt; pp is not consulted
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = Sqrt(src[i]);
+		}
+	}
+
+	void ShaderCore::rsq(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise reciprocal square root (abs flag off)
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = reciprocalSquareRoot(src[i], false, pp);
+		}
+	}
+
+	void ShaderCore::len2(Float4 &dst, Vector4f &src, bool pp)   // dst = Sqrt(dot2(src, src)); pp is not used here
+	{
+		dst = Sqrt(dot2(src, src));
+	}
+
+	void ShaderCore::len3(Float4 &dst, Vector4f &src, bool pp)   // dst = Sqrt(dot3(src, src)); pp is not used here
+	{
+		dst = Sqrt(dot3(src, src));
+	}
+
+	void ShaderCore::len4(Float4 &dst, Vector4f &src, bool pp)   // dst = Sqrt(dot4(src, src)); pp is not used here
+	{
+		dst = Sqrt(dot4(src, src));
+	}
+
+	void ShaderCore::dist1(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp)   // Absolute difference of the x components; pp is not used here
+	{
+		dst = Abs(src0.x - src1.x);
+	}
+
+	void ShaderCore::dist2(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp)   // Distance between src0 and src1 over x and y; pp is not used here
+	{
+		Float4 deltaX = src0.x - src1.x;
+		Float4 deltaY = src0.y - src1.y;
+		Float4 squaredLength = deltaX * deltaX + deltaY * deltaY;
+		dst = Sqrt(squaredLength);
+	}
+
+	void ShaderCore::dist3(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp)   // Distance between src0 and src1 over x, y and z; pp is not used here
+	{
+		Float4 deltaX = src0.x - src1.x;
+		Float4 deltaY = src0.y - src1.y;
+		Float4 deltaZ = src0.z - src1.z;
+		Float4 squaredLength = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ;
+		dst = Sqrt(squaredLength);
+	}
+
+	void ShaderCore::dist4(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp)   // Distance between src0 and src1 over all four components; pp is not used here
+	{
+		Float4 deltaX = src0.x - src1.x;
+		Float4 deltaY = src0.y - src1.y;
+		Float4 deltaZ = src0.z - src1.z;
+		Float4 deltaW = src0.w - src1.w;
+		Float4 squaredLength = deltaX * deltaX + deltaY * deltaY + deltaZ * deltaZ + deltaW * deltaW;
+		dst = Sqrt(squaredLength);
+	}
+
+	void ShaderCore::dp1(Vector4f &dst, Vector4f &src0, Vector4f &src1)   // src0.x * src1.x, replicated to all of dst
+	{
+		Float4 product = src0.x * src1.x;   // Computed before any write to dst
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = product;
+		}
+	}
+
+	void ShaderCore::dp2(Vector4f &dst, Vector4f &src0, Vector4f &src1)   // dot2(src0, src1), replicated to all of dst
+	{
+		Float4 dot = dot2(src0, src1);   // Computed before any write to dst
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = dot;
+		}
+	}
+
+	void ShaderCore::dp2add(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2)   // dot2(src0, src1) + src2.x, replicated to all of dst
+	{
+		Float4 sum = dot2(src0, src1) + src2.x;   // Only the x component of src2 is added
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = sum;
+		}
+	}
+
+	void ShaderCore::dp3(Vector4f &dst, Vector4f &src0, Vector4f &src1)
 	{
 		Float4 dot = dot3(src0, src1);
 
@@ -89,7 +662,7 @@
 		dst.w = dot;
 	}
 
-	void ShaderCore::dp4(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::dp4(Vector4f &dst, Vector4f &src0, Vector4f &src1)
 	{
 		Float4 dot = dot4(src0, src1);
 
@@ -99,7 +672,7 @@
 		dst.w = dot;
 	}
 
-	void ShaderCore::min(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::min(Vector4f &dst, Vector4f &src0, Vector4f &src1)
 	{
 		dst.x = Min(src0.x, src1.x);
 		dst.y = Min(src0.y, src1.y);
@@ -107,7 +680,7 @@
 		dst.w = Min(src0.w, src1.w);
 	}
 
-	void ShaderCore::max(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::max(Vector4f &dst, Vector4f &src0, Vector4f &src1)
 	{
 		dst.x = Max(src0.x, src1.x);
 		dst.y = Max(src0.y, src1.y);
@@ -115,39 +688,25 @@
 		dst.w = Max(src0.w, src1.w);
 	}
 
-	void ShaderCore::slt(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::slt(Vector4f &dst, Vector4f &src0, Vector4f &src1)   // Set-on-less-than: CmpLT mask ANDed with the bits of 1.0f
 	{
-		Int4 xMask = As<Int4>(CmpLT(src0.x, src1.x));
-		Int4 yMask = As<Int4>(CmpLT(src0.y, src1.y));
-		Int4 zMask = As<Int4>(CmpLT(src0.z, src1.z));
-		Int4 wMask = As<Int4>(CmpLT(src0.w, src1.w));
-
-		Int4 iOne = As<Int4>(Float4(1, 1, 1, 1));
-
-		dst.x = As<Float4>(xMask & iOne);
-		dst.y = As<Float4>(yMask & iOne);
-		dst.z = As<Float4>(zMask & iOne);
-		dst.w = As<Float4>(wMask & iOne);
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = As<Float4>(As<Int4>(CmpLT(src0[i], src1[i])) & As<Int4>(Float4(1.0f)));
+		}
 	}
 
-	void ShaderCore::sge(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::step(Vector4f &dst, Vector4f &edge, Vector4f &x)   // CmpNLT(x, edge) mask ANDed with the bits of 1.0f, per component
 	{
-		Int4 xMask = As<Int4>(CmpNLT(src0.x, src1.x));
-		Int4 yMask = As<Int4>(CmpNLT(src0.y, src1.y));
-		Int4 zMask = As<Int4>(CmpNLT(src0.z, src1.z));
-		Int4 wMask = As<Int4>(CmpNLT(src0.w, src1.w));
-
-		Int4 iOne = As<Int4>(Float4(1, 1, 1, 1));
-
-		dst.x = As<Float4>(xMask & iOne);
-		dst.y = As<Float4>(yMask & iOne);
-		dst.z = As<Float4>(zMask & iOne);
-		dst.w = As<Float4>(wMask & iOne);
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = As<Float4>(CmpNLT(x[i], edge[i]) & As<Int4>(Float4(1.0f)));
+		}
 	}
 
-	void ShaderCore::exp(Color4f &dst, Color4f &src, bool pp)
+	void ShaderCore::exp2x(Vector4f &dst, Vector4f &src, bool pp)
 	{ 
-		Float4 exp = exponential(src.x, pp);
+		Float4 exp = exponential2(src.x, pp);
 
 		dst.x = exp;
 		dst.y = exp;
@@ -155,9 +714,25 @@
 		dst.w = exp;
 	}
 
-	void ShaderCore::log(Color4f &dst, Color4f &src, bool pp)
+	void ShaderCore::exp2(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise exponential2
 	{
-		Float4 log = logarithm(src.x, true, pp);
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = exponential2(src[i], pp);
+		}
+	}
+
+	void ShaderCore::exp(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise exponential
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = exponential(src[i], pp);
+		}
+	}
+
+	void ShaderCore::log2x(Vector4f &dst, Vector4f &src, bool pp)
+	{
+		Float4 log = logarithm2(src.x, true, pp);
 
 		dst.x = log;
 		dst.y = log;
@@ -165,33 +740,50 @@
 		dst.w = log;
 	}
 
-	void ShaderCore::lit(Color4f &dst, Color4f &src)
+	void ShaderCore::log2(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise logarithm2 with the abs flag off
 	{
-		dst.x = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-		dst.y = Max(src.x, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+		dst.x = logarithm2(src.x, false, pp);   // logarithm2 takes (x, abs, pp): pass abs explicitly so pp lands in the pp slot
+		dst.y = logarithm2(src.y, false, pp);
+		dst.z = logarithm2(src.z, false, pp);
+		dst.w = logarithm2(src.w, false, pp);
+	}
+
+	void ShaderCore::log(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise logarithm with the abs flag off
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = logarithm(src[i], false, pp);
+		}
+	}
+
+	void ShaderCore::lit(Vector4f &dst, Vector4f &src)   // dst = (1, max(src.x, 0), masked power(src.y, src.w), 1)
+	{
+		dst.x = Float4(1.0f);
+		dst.y = Max(src.x, Float4(0.0f));   // Negative src.x clamps to zero
 
 		Float4 pow;
 
 		pow = src.w;
-		pow = Min(pow, Float4(127.9961f, 127.9961f, 127.9961f, 127.9961f));
-		pow = Max(pow, Float4(-127.9961f, -127.9961f, -127.9961f, -127.9961f));
+		pow = Min(pow, Float4(127.9961f));   // Exponent is clamped to [-127.9961, 127.9961]
+		pow = Max(pow, Float4(-127.9961f));
 
 		dst.z = power(src.y, pow);
-		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f, 0.0f, 0.0f, 0.0f)));
-		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f, 0.0f, 0.0f, 0.0f)));
+		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.x, Float4(0.0f)));   // Cleared to zero bits where src.x < 0
+		dst.z = As<Float4>(As<Int4>(dst.z) & CmpNLT(src.y, Float4(0.0f)));   // Cleared to zero bits where src.y < 0
 
-		dst.w = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+		dst.w = Float4(1.0f);
 	}
 
-	void ShaderCore::dst(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::att(Vector4f &dst, Vector4f &src0, Vector4f &src1)   // Reads only src0.y, src0.z, src1.y and src1.w
 	{
+		// Computes attenuation factors (1, d, d^2, 1/d) assuming src0 = d^2, src1 = 1/d
 		dst.x = 1;
 		dst.y = src0.y * src1.y;
 		dst.z = src0.z;
 		dst.w = src1.w;
 	}
 
-	void ShaderCore::lrp(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2)
+	void ShaderCore::lrp(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2)
 	{
 		dst.x = src0.x * (src1.x - src2.x) + src2.x;
 		dst.y = src0.y * (src1.y - src2.y) + src2.y;
@@ -199,15 +791,47 @@
 		dst.w = src0.w * (src1.w - src2.w) + src2.w;
 	}
 
-	void ShaderCore::frc(Color4f &dst, Color4f &src)
+	void ShaderCore::smooth(Vector4f &dst, Vector4f &edge0, Vector4f &edge1, Vector4f &x)   // Per component: t = clamp((x - edge0) / (edge1 - edge0), 0, 1); dst = t * t * (3 - 2 * t)
 	{
-		dst.x = Fraction(src.x);
-		dst.y = Fraction(src.y);
-		dst.z = Fraction(src.z);
-		dst.w = Fraction(src.w);
+		for(int i = 0; i < 4; i++)
+		{
+			Float4 t = Min(Max((x[i] - edge0[i]) / (edge1[i] - edge0[i]), Float4(0.0f)), Float4(1.0f)); dst[i] = t * t * (Float4(3.0f) - Float4(2.0f) * t);
+		}
 	}
 
-	void ShaderCore::pow(Color4f &dst, Color4f &src0, Color4f &src1, bool pp)
+	void ShaderCore::frc(Vector4f &dst, Vector4f &src)   // Component-wise Frac
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = Frac(src[i]);
+		}
+	}
+
+	void ShaderCore::trunc(Vector4f &dst, Vector4f &src)   // Component-wise Trunc
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = Trunc(src[i]);
+		}
+	}
+
+	void ShaderCore::floor(Vector4f &dst, Vector4f &src)   // Component-wise Floor
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = Floor(src[i]);
+		}
+	}
+
+	void ShaderCore::ceil(Vector4f &dst, Vector4f &src)   // Component-wise Ceil
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = Ceil(src[i]);
+		}
+	}
+
+	void ShaderCore::powx(Vector4f &dst, Vector4f &src0, Vector4f &src1, bool pp)
 	{
 		Float4 pow = power(src0.x, src1.x, pp);
 
@@ -217,14 +841,136 @@
 		dst.w = pow;
 	}
 
-	void ShaderCore::crs(Color4f &dst, Color4f &src0, Color4f &src1)
+	void ShaderCore::pow(Vector4f &dst, Vector4f &src0, Vector4f &src1, bool pp)   // Component-wise power(src0, src1)
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = power(src0[i], src1[i], pp);
+		}
+	}
+
+	void ShaderCore::crs(Vector4f &dst, Vector4f &src0, Vector4f &src1)   // Cross product into dst.xyz; dst.w is left untouched. dst.x is written before src0.x/src1.x are read again, so dst must not alias a source
 	{
 		dst.x = src0.y * src1.z - src0.z * src1.y;
 		dst.y = src0.z * src1.x - src0.x * src1.z;
 		dst.z = src0.x * src1.y - src0.y * src1.x;
 	}
 
-	void ShaderCore::sgn(Color4f &dst, Color4f &src)
+	void ShaderCore::forward1(Vector4f &dst, Vector4f &N, Vector4f &I, Vector4f &Nref)   // dst.x = N.x, sign-flipped unless Nref.x * I.x < 0
+	{
+		Int4 signFlip = CmpNLT(Nref.x * I.x, Float4(0.0f)) & Int4(0x80000000);   // Sign bit where the product is not less than zero
+
+		dst.x = As<Float4>(signFlip ^ As<Int4>(N.x));
+	}
+
+	void ShaderCore::forward2(Vector4f &dst, Vector4f &N, Vector4f &I, Vector4f &Nref)   // dst.xy = N.xy, sign-flipped unless dot2(Nref, I) < 0
+	{
+		Int4 signFlip = CmpNLT(dot2(Nref, I), Float4(0.0f)) & Int4(0x80000000);   // Sign bit where the dot product is not less than zero
+
+		dst.x = As<Float4>(signFlip ^ As<Int4>(N.x));
+		dst.y = As<Float4>(signFlip ^ As<Int4>(N.y));
+	}
+
+	void ShaderCore::forward3(Vector4f &dst, Vector4f &N, Vector4f &I, Vector4f &Nref)   // dst.xyz = N.xyz, sign-flipped unless dot3(Nref, I) < 0
+	{
+		Int4 signFlip = CmpNLT(dot3(Nref, I), Float4(0.0f)) & Int4(0x80000000);   // Sign bit where the dot product is not less than zero
+
+		dst.x = As<Float4>(signFlip ^ As<Int4>(N.x));
+		dst.y = As<Float4>(signFlip ^ As<Int4>(N.y));
+		dst.z = As<Float4>(signFlip ^ As<Int4>(N.z));
+	}
+
+	void ShaderCore::forward4(Vector4f &dst, Vector4f &N, Vector4f &I, Vector4f &Nref)   // dst = N, sign-flipped unless dot4(Nref, I) < 0
+	{
+		Int4 signFlip = CmpNLT(dot4(Nref, I), Float4(0.0f)) & Int4(0x80000000);   // Sign bit where the dot product is not less than zero
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = As<Float4>(signFlip ^ As<Int4>(N[i]));
+		}
+	}
+	
+	void ShaderCore::reflect1(Vector4f &dst, Vector4f &I, Vector4f &N)   // dst.x = I.x - 2 * (N.x * I.x) * N.x
+	{
+		Float4 NdotI = N.x * I.x;
+
+		dst.x = I.x - Float4(2.0f) * NdotI * N.x;
+	}
+
+	void ShaderCore::reflect2(Vector4f &dst, Vector4f &I, Vector4f &N)   // dst.xy = I - 2 * dot2(N, I) * N
+	{
+		Float4 NdotI = dot2(N, I);   // Computed before any write to dst
+
+		dst.x = I.x - Float4(2.0f) * NdotI * N.x;
+		dst.y = I.y - Float4(2.0f) * NdotI * N.y;
+	}
+
+	void ShaderCore::reflect3(Vector4f &dst, Vector4f &I, Vector4f &N)   // dst.xyz = I - 2 * dot3(N, I) * N
+	{
+		Float4 NdotI = dot3(N, I);   // Computed before any write to dst
+
+		dst.x = I.x - Float4(2.0f) * NdotI * N.x;
+		dst.y = I.y - Float4(2.0f) * NdotI * N.y;
+		dst.z = I.z - Float4(2.0f) * NdotI * N.z;
+	}
+
+	void ShaderCore::reflect4(Vector4f &dst, Vector4f &I, Vector4f &N)   // dst = I - 2 * dot4(N, I) * N
+	{
+		Float4 NdotI = dot4(N, I);   // Computed before any write to dst
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = I[i] - Float4(2.0f) * NdotI * N[i];
+		}
+	}
+
+	void ShaderCore::refract1(Vector4f &dst, Vector4f &I, Vector4f &N, Float4 &eta)   // dst.x = eta * I.x - scale * N.x, or zero bits where the radicand is negative
+	{
+		Float4 NdotI = N.x * I.x;
+		Float4 radicand = Float4(1.0f) - eta * eta * (Float4(1.0f) - NdotI * NdotI);
+		Int4 valid = CmpNLT(radicand, Float4(0.0f));   // Mask of lanes whose radicand is not less than zero
+		Float4 scale = (eta * NdotI + Sqrt(radicand));
+
+		dst.x = As<Float4>(valid & As<Int4>(eta * I.x - scale * N.x));
+	}
+
+	void ShaderCore::refract2(Vector4f &dst, Vector4f &I, Vector4f &N, Float4 &eta)   // dst.xy = eta * I - scale * N, or zero bits where the radicand is negative
+	{
+		Float4 NdotI = dot2(N, I);
+		Float4 radicand = Float4(1.0f) - eta * eta * (Float4(1.0f) - NdotI * NdotI);
+		Int4 valid = CmpNLT(radicand, Float4(0.0f));   // Mask of lanes whose radicand is not less than zero
+		Float4 scale = (eta * NdotI + Sqrt(radicand));
+
+		dst.x = As<Float4>(valid & As<Int4>(eta * I.x - scale * N.x));
+		dst.y = As<Float4>(valid & As<Int4>(eta * I.y - scale * N.y));
+	}
+
+	void ShaderCore::refract3(Vector4f &dst, Vector4f &I, Vector4f &N, Float4 &eta)   // dst.xyz = eta * I - scale * N, or zero bits where the radicand is negative
+	{
+		Float4 NdotI = dot3(N, I);
+		Float4 radicand = Float4(1.0f) - eta * eta * (Float4(1.0f) - NdotI * NdotI);
+		Int4 valid = CmpNLT(radicand, Float4(0.0f));   // Mask of lanes whose radicand is not less than zero
+		Float4 scale = (eta * NdotI + Sqrt(radicand));
+
+		dst.x = As<Float4>(valid & As<Int4>(eta * I.x - scale * N.x));
+		dst.y = As<Float4>(valid & As<Int4>(eta * I.y - scale * N.y));
+		dst.z = As<Float4>(valid & As<Int4>(eta * I.z - scale * N.z));
+	}
+
+	void ShaderCore::refract4(Vector4f &dst, Vector4f &I, Vector4f &N, Float4 &eta)   // dst = eta * I - scale * N, or zero bits where the radicand is negative
+	{
+		Float4 NdotI = dot4(N, I);
+		Float4 radicand = Float4(1.0f) - eta * eta * (Float4(1.0f) - NdotI * NdotI);
+		Int4 valid = CmpNLT(radicand, Float4(0.0f));   // Mask of lanes whose radicand is not less than zero
+		Float4 scale = (eta * NdotI + Sqrt(radicand));
+
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = As<Float4>(valid & As<Int4>(eta * I[i] - scale * N[i]));
+		}
+	}
+
+	void ShaderCore::sgn(Vector4f &dst, Vector4f &src)
 	{
 		sgn(dst.x, src.x);
 		sgn(dst.y, src.y);
@@ -232,15 +978,26 @@
 		sgn(dst.w, src.w);
 	}
 
-	void ShaderCore::abs(Color4f &dst, Color4f &src)
+	void ShaderCore::abs(Vector4f &dst, Vector4f &src)   // Component-wise Abs
 	{
 		dst.x = Abs(src.x);
 		dst.y = Abs(src.y);
 		dst.z = Abs(src.z);
 		dst.w = Abs(src.w);
 	}
+	
+	void ShaderCore::nrm2(Vector4f &dst, Vector4f &src, bool pp)   // Scales all four components by 1 / sqrt(dot2(src, src))
+	{
+		Float4 squaredLength = dot2(src, src);
+		Float4 scale = reciprocalSquareRoot(squaredLength, false, pp);
 
-	void ShaderCore::nrm(Color4f &dst, Color4f &src, bool pp)
+		for(int i = 0; i < 4; i++)   // z and w are scaled too, not only x and y
+		{
+			dst[i] = src[i] * scale;
+		}
+	}
+
+	void ShaderCore::nrm3(Vector4f &dst, Vector4f &src, bool pp)
 	{
 		Float4 dot = dot3(src, src);
 		Float4 rsq = reciprocalSquareRoot(dot, false, pp);
@@ -250,42 +1007,99 @@
 		dst.z = src.z * rsq;
 		dst.w = src.w * rsq;
 	}
-	
-	void ShaderCore::sincos(Color4f &dst, Color4f &src, bool pp)
+
+	void ShaderCore::nrm4(Vector4f &dst, Vector4f &src, bool pp)   // Scales all four components by 1 / sqrt(dot4(src, src))
 	{
-		Float4 tmp0;
-		Float4 tmp1;
+		Float4 squaredLength = dot4(src, src);
+		Float4 scale = reciprocalSquareRoot(squaredLength, false, pp);
 
-		tmp0 = src.x;
-
-		// cos(x) = sin(x + pi/2)
-		tmp0 += Float4(1.57079632e+0f);
-		tmp1 = As<Float4>(CmpNLT(tmp0, Float4(3.14159265e+0f)) & As<Int4>(Float4(6.28318530e+0f)));
-		tmp0 -= tmp1;
-
-		dst.x = sine(tmp0, pp);
-		dst.y = sine(src.x, pp);
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = src[i] * scale;
+		}
+	}
+	
+	void ShaderCore::sincos(Vector4f &dst, Vector4f &src, bool pp)   // dst.x = cosine, dst.y = sine of src.x; dst.z and dst.w are not written
+	{
+		dst.x = cosine_pi(src.x, pp);   // The *_pi variants are declared as limited to the [-pi, pi] range
+		dst.y = sine_pi(src.x, pp);   // Reads src.x after dst.x was written, so dst must not alias src
 	}
 
-	void ShaderCore::expp(Color4f &dst, Color4f &src, unsigned short version)
+	void ShaderCore::cos(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise cosine
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = cosine(src[i], pp);
+		}
+	}
+
+	void ShaderCore::sin(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise sine
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = sine(src[i], pp);
+		}
+	}
+
+	void ShaderCore::tan(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise tangent
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = tangent(src[i], pp);
+		}
+	}
+
+	void ShaderCore::acos(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise arccos
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = arccos(src[i], pp);
+		}
+	}
+
+	void ShaderCore::asin(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise arcsin
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = arcsin(src[i], pp);
+		}
+	}
+
+	void ShaderCore::atan(Vector4f &dst, Vector4f &src, bool pp)   // Component-wise single-argument arctan
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = arctan(src[i], pp);
+		}
+	}
+
+	void ShaderCore::atan2(Vector4f &dst, Vector4f &src0, Vector4f &src1, bool pp)   // Component-wise two-argument arctan(src0, src1)
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = arctan(src0[i], src1[i], pp);
+		}
+	}
+
+	void ShaderCore::expp(Vector4f &dst, Vector4f &src, unsigned short version)   // Partial-precision exponential of src.x; result layout depends on the shader version
 	{
 		if(version < 0x0200)
 		{
-			Float4 frc = Fraction(src.x);
+			Float4 frc = Frac(src.x);   // Fractional part of src.x
 			Float4 floor = src.x - frc;
 
-			dst.x = exponential(floor, true);
+			dst.x = exponential2(floor, true);   // exponential2 of (src.x - frc)
 			dst.y = frc;
-			dst.z = exponential(src.x, true);
-			dst.w = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+			dst.z = exponential2(src.x, true);   // exponential2 of the full src.x
+			dst.w = Float4(1.0f);
 		}
 		else   // Version >= 2.0
 		{
-			exp(dst, src, true);   // FIXME: 10-bit precision suffices
+			exp2x(dst, src, true);   // FIXME: 10-bit precision suffices
 		}
 	}
 	
-	void ShaderCore::logp(Color4f &dst, Color4f &src, unsigned short version)
+	void ShaderCore::logp(Vector4f &dst, Vector4f &src, unsigned short version)
 	{
 		if(version < 0x0200)
 		{
@@ -298,92 +1112,109 @@
 			tmp1 = tmp0;
 
 			// X component
-			r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127, 127, 127, 127);
+			r = As<Int4>(As<UInt4>(tmp0) >> 23) - Int4(127);
 			dst.x = Float4(r);
 
 			// Y component
 			dst.y = As<Float4>((As<Int4>(tmp1) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
 
 			// Z component
-			dst.z = logarithm(src.x, true, true);
+			dst.z = logarithm2(src.x, true, true);
 
 			// W component
 			dst.w = 1.0f;
 		}
 		else
 		{
-			log(dst, src, true);
+			log2x(dst, src, true);
 		}
 	}
 	
-	void ShaderCore::cmp(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2)
+	void ShaderCore::cmp0(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2)   // Applies the scalar cmp0 to each component
 	{
-		cmp(dst.x, src0.x, src1.x, src2.x);
-		cmp(dst.y, src0.y, src1.y, src2.y);
-		cmp(dst.z, src0.z, src1.z, src2.z);
-		cmp(dst.w, src0.w, src1.w, src2.w);
+		for(int i = 0; i < 4; i++)
+		{
+			cmp0(dst[i], src0[i], src1[i], src2[i]);
+		}
 	}
-	
-	void ShaderCore::dp2add(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2)
-	{
-		Float4 t = src0.x * src1.x + src0.y * src1.y + src2.x;
 
-		dst.x = t;
-		dst.y = t;
-		dst.z = t;
-		dst.w = t;
+	void ShaderCore::select(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2)   // Per component: src0 reinterpreted as an integer mask chooses between src1 and src2
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			select(dst[i], As<Int4>(src0[i]), src1[i], src2[i]);
+		}
+	}
+
+	void ShaderCore::extract(Float4 &dst, Vector4f &src0, Float4 &src1)   // Picks the component of src0 whose index equals src1 (compared as a float)
+	{
+		select(dst, CmpEQ(src1, Float4(1.0f)), src0.y, src0.x);   // src0.x is the fallback for any index other than 1, 2 or 3
+		select(dst, CmpEQ(src1, Float4(2.0f)), src0.z, dst);   // Each later step keeps the previous dst unless its index matches
+		select(dst, CmpEQ(src1, Float4(3.0f)), src0.w, dst);
+	}
+
+	void ShaderCore::insert(Vector4f &dst, Vector4f &src, Float4 &element, Float4 &index)   // Copy of src with the component whose index equals 'index' replaced by element
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			select(dst[i], CmpEQ(index, Float4(static_cast<float>(i))), element, src[i]);
+		}
+	}
 
 	void ShaderCore::sgn(Float4 &dst, Float4 &src)
 	{
-		Int4 neg = As<Int4>(CmpLT(src, Float4(0, 0, 0, 0))) & As<Int4>(Float4(-1, -1, -1, -1));
-		Int4 pos = As<Int4>(CmpNLT(src, Float4(0, 0, 0, 0))) & As<Int4>(Float4(1, 1, 1, 1));
+		Int4 neg = As<Int4>(CmpLT(src, Float4(-0.0f))) & As<Int4>(Float4(-1.0f));   // Bits of -1.0f where src < 0, zero bits elsewhere
+		Int4 pos = As<Int4>(CmpNLE(src, Float4(+0.0f))) & As<Int4>(Float4(1.0f));   // Bits of 1.0f where src is not <= 0; a zero input matches neither mask
 		dst = As<Float4>(neg | pos);
 	}
 
-	void ShaderCore::cmp(Float4 &dst, Float4 &src0, Float4 &src1, Float4 &src2)
+	void ShaderCore::cmp0(Float4 &dst, Float4 &src0, Float4 &src1, Float4 &src2)   // dst = src1 where 0 <= src0, src2 elsewhere
 	{
-		Int4 pos = CmpNLE(Float4(0.0f, 0.0f, 0.0f, 0.0f), src0);
-		Int4 t0 = pos & As<Int4>(src2);
-		Int4 t1 = ~pos & As<Int4>(src1);
-		dst = As<Float4>(t0 | t1);
+		Int4 nonNegative = CmpLE(Float4(0.0f), src0);
+		select(dst, nonNegative, src1, src2);
 	}
 
-	void ShaderCore::setp(Color4f &dst, Color4f &src0, Color4f &src1, Control control)
+	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, Float4 &src1, Float4 &src2)   // Bitwise blend: bits of src1 where src0 is set, bits of src2 elsewhere
+	{
+		// FIXME: LLVM vector select
+		dst = As<Float4>((src0 & As<Int4>(src1)) | (~src0 & As<Int4>(src2)));
+	}
+
+	void ShaderCore::cmp(Vector4f &dst, Vector4f &src0, Vector4f &src1, Control control)
 	{
 		switch(control)
 		{
-		case Op::CONTROL_GT:
+		case Shader::CONTROL_GT:
 			dst.x = As<Float4>(CmpNLE(src0.x, src1.x));
 			dst.y = As<Float4>(CmpNLE(src0.y, src1.y));
 			dst.z = As<Float4>(CmpNLE(src0.z, src1.z));
 			dst.w = As<Float4>(CmpNLE(src0.w, src1.w));
 			break;
-		case Op::CONTROL_EQ:
+		case Shader::CONTROL_EQ:
 			dst.x = As<Float4>(CmpEQ(src0.x, src1.x));
 			dst.y = As<Float4>(CmpEQ(src0.y, src1.y));
 			dst.z = As<Float4>(CmpEQ(src0.z, src1.z));
 			dst.w = As<Float4>(CmpEQ(src0.w, src1.w));
 			break;
-		case Op::CONTROL_GE:
+		case Shader::CONTROL_GE:
 			dst.x = As<Float4>(CmpNLT(src0.x, src1.x));
 			dst.y = As<Float4>(CmpNLT(src0.y, src1.y));
 			dst.z = As<Float4>(CmpNLT(src0.z, src1.z));
 			dst.w = As<Float4>(CmpNLT(src0.w, src1.w));
 			break;
-		case Op::CONTROL_LT:
+		case Shader::CONTROL_LT:
 			dst.x = As<Float4>(CmpLT(src0.x, src1.x));
 			dst.y = As<Float4>(CmpLT(src0.y, src1.y));
 			dst.z = As<Float4>(CmpLT(src0.z, src1.z));
 			dst.w = As<Float4>(CmpLT(src0.w, src1.w));
 			break;
-		case Op::CONTROL_NE:
+		case Shader::CONTROL_NE:
 			dst.x = As<Float4>(CmpNEQ(src0.x, src1.x));
 			dst.y = As<Float4>(CmpNEQ(src0.y, src1.y));
 			dst.z = As<Float4>(CmpNEQ(src0.z, src1.z));
 			dst.w = As<Float4>(CmpNEQ(src0.w, src1.w));
 			break;
-		case Op::CONTROL_LE:
+		case Shader::CONTROL_LE:
 			dst.x = As<Float4>(CmpLE(src0.x, src1.x));
 			dst.y = As<Float4>(CmpLE(src0.y, src1.y));
 			dst.z = As<Float4>(CmpLE(src0.z, src1.z));
@@ -393,4 +1224,82 @@
 			ASSERT(false);
 		}
 	}
+
+	void ShaderCore::icmp(Vector4f &dst, Vector4f &src0, Vector4f &src1, Control control)   // Integer comparison of the raw bits of src0 and src1; dst receives the comparison masks
+	{
+		switch(control)
+		{
+		case Shader::CONTROL_GT:
+			for(int i = 0; i < 4; i++)
+			{
+				dst[i] = As<Float4>(CmpNLE(As<Int4>(src0[i]), As<Int4>(src1[i])));
+			}
+			break;
+		case Shader::CONTROL_EQ:
+			for(int i = 0; i < 4; i++)
+			{
+				dst[i] = As<Float4>(CmpEQ(As<Int4>(src0[i]), As<Int4>(src1[i])));
+			}
+			break;
+		case Shader::CONTROL_GE:
+			for(int i = 0; i < 4; i++)
+			{
+				dst[i] = As<Float4>(CmpNLT(As<Int4>(src0[i]), As<Int4>(src1[i])));
+			}
+			break;
+		case Shader::CONTROL_LT:
+			for(int i = 0; i < 4; i++)
+			{
+				dst[i] = As<Float4>(CmpLT(As<Int4>(src0[i]), As<Int4>(src1[i])));
+			}
+			break;
+		case Shader::CONTROL_NE:
+			for(int i = 0; i < 4; i++)
+			{
+				dst[i] = As<Float4>(CmpNEQ(As<Int4>(src0[i]), As<Int4>(src1[i])));
+			}
+			break;
+		case Shader::CONTROL_LE:
+			for(int i = 0; i < 4; i++)
+			{
+				dst[i] = As<Float4>(CmpLE(As<Int4>(src0[i]), As<Int4>(src1[i])));
+			}
+			break;
+		default:
+			ASSERT(false);   // Any other control value is rejected and leaves dst unwritten
+		}
+	}
+
+	void ShaderCore::all(Float4 &dst, Vector4f &src)   // Bitwise AND of the four components' raw bits
+	{
+		dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
+	}
+
+	void ShaderCore::any(Float4 &dst, Vector4f &src)   // Bitwise OR of the four components' raw bits
+	{
+		dst = As<Float4>(As<Int4>(src.x) | As<Int4>(src.y) | As<Int4>(src.z) | As<Int4>(src.w));
+	}
+
+	void ShaderCore::not(Vector4f &dst, Vector4f &src)   // Inverts every bit of each component. NOTE(review): 'not' is an alternative operator token in ISO C++ - confirm the target compilers accept it as a name
+	{
+		for(int i = 0; i < 4; i++)
+		{
+			dst[i] = As<Float4>(As<Int4>(src[i]) ^ Int4(0xFFFFFFFF));
+		}
+	}
+
+	void ShaderCore::or(Float4 &dst, Float4 &src0, Float4 &src1)   // Bitwise OR of the raw bits. NOTE(review): 'or' is an alternative operator token in ISO C++ - confirm the target compilers accept it as a name
+	{
+		dst = As<Float4>(As<Int4>(src0) | As<Int4>(src1));
+	}
+
+	void ShaderCore::xor(Float4 &dst, Float4 &src0, Float4 &src1)   // Bitwise XOR of the raw bits. NOTE(review): 'xor' is an alternative operator token in ISO C++ - confirm the target compilers accept it as a name
+	{
+		dst = As<Float4>(As<Int4>(src0) ^ As<Int4>(src1));
+	}
+
+	void ShaderCore::and(Float4 &dst, Float4 &src0, Float4 &src1)   // Bitwise AND of the raw bits. NOTE(review): 'and' is an alternative operator token in ISO C++ - confirm the target compilers accept it as a name
+	{
+		dst = As<Float4>(As<Int4>(src0) & As<Int4>(src1));
+	}
 }
diff --git a/src/Shader/ShaderCore.hpp b/src/Shader/ShaderCore.hpp
index 662e50a..6cb42f4 100644
--- a/src/Shader/ShaderCore.hpp
+++ b/src/Shader/ShaderCore.hpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer

 //

-// Copyright(c) 2005-2011 TransGaming Inc.

+// Copyright(c) 2005-2012 TransGaming Inc.

 //

 // All rights reserved. No part of this software may be copied, distributed, transmitted,

 // transcribed, stored in a retrieval system, translated into any human or computer

@@ -12,51 +12,284 @@
 #ifndef sw_ShaderCore_hpp

 #define sw_ShaderCore_hpp

 

+#include "Debug.hpp"

 #include "Shader.hpp"

 #include "Reactor/Reactor.hpp"

 

 namespace sw

 {

+	class Vector4i   // Four Short4 components, addressable by name or by index

+	{

+	public:

+		Vector4i();

+		Vector4i(unsigned short x, unsigned short y, unsigned short z, unsigned short w);

+		Vector4i(const Vector4i &rhs);

+

+		Short4 &operator[](int i);   // Indexed access to a component

+		Vector4i &operator=(const Vector4i &rhs);

+

+		Short4 x;

+		Short4 y;

+		Short4 z;

+		Short4 w;

+	};

+

+	class Vector4f   // Four Float4 components, addressable by name or by index

+	{

+	public:

+		Vector4f();

+		Vector4f(float x, float y, float z, float w);

+		Vector4f(const Vector4f &rhs);

+

+		Float4 &operator[](int i);   // Indexed access to a component

+		Vector4f &operator=(const Vector4f &rhs);

+		

+		Float4 x;

+		Float4 y;

+		Float4 z;

+		Float4 w;

+	};

+

+	Float4 exponential2(RValue<Float4> x, bool pp = false);

+	Float4 logarithm2(RValue<Float4> x, bool abs, bool pp = false);

+	Float4 exponential(RValue<Float4> x, bool pp = false);

+	Float4 logarithm(RValue<Float4> x, bool abs, bool pp = false);

+	Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);

+	Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false);

+	Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);

+	Float4 modulo(RValue<Float4> x, RValue<Float4> y);

+	Float4 sine_pi(RValue<Float4> x, bool pp = false);     // limited to [-pi, pi] range

+	Float4 cosine_pi(RValue<Float4> x, bool pp = false);   // limited to [-pi, pi] range

+	Float4 sine(RValue<Float4> x, bool pp = false);

+	Float4 cosine(RValue<Float4> x, bool pp = false);

+	Float4 tangent(RValue<Float4> x, bool pp = false);

+	Float4 arccos(RValue<Float4> x, bool pp = false);

+	Float4 arcsin(RValue<Float4> x, bool pp = false);

+	Float4 arctan(RValue<Float4> x, bool pp = false);

+	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);

+

+	Float4 dot2(Vector4f &v0, Vector4f &v1);

+	Float4 dot3(Vector4f &v0, Vector4f &v1);

+	Float4 dot4(Vector4f &v0, Vector4f &v1);

+

+	void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);

+	void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);

+	void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);

+	void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);

+	void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);

+	void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);

+	void transpose2x4h(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);

+	void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);

+

+	class Register   // Bundle of four Reference<Float4> handles (x, y, z, w) that reads and writes like a Vector4f

+	{

+	public:

+		Register(Reference<Float4> &x, Reference<Float4> &y, Reference<Float4> &z, Reference<Float4> &w) : x(x), y(y), z(z), w(w)   // Copies the four handles into the members

+		{

+		}

+

+		Reference<Float4> &operator[](int i)   // Component by index; any index outside 1..3 yields x

+		{

+			switch(i)

+			{

+			default:

+			case 0: return x;

+			case 1: return y;

+			case 2: return z;

+			case 3: return w;

+			}

+		}

+

+		Register &operator=(const Register &rhs)   // Assigns component by component through the Reference members

+		{

+			x = rhs.x;

+			y = rhs.y;

+			z = rhs.z;

+			w = rhs.w;

+

+			return *this;

+		}

+

+		Register &operator=(const Vector4f &rhs)   // Stores the four components of a Vector4f

+		{

+			x = rhs.x;

+			y = rhs.y;

+			z = rhs.z;

+			w = rhs.w;

+

+			return *this;

+		}

+

+		operator Vector4f()   // Implicit conversion: copies the four referenced values into a fresh Vector4f

+		{

+			Vector4f v;

+

+			v.x = x;

+			v.y = y;

+			v.z = z;

+			v.w = w;

+

+			return v;

+		}

+

+		Reference<Float4> x;

+		Reference<Float4> y;

+		Reference<Float4> z;

+		Reference<Float4> w;

+	};

+

+	template<int S, bool D = false>   // S: number of registers; D: default for the 'dynamic' constructor argument

+	class RegisterArray   // S registers stored as four per-component arrays (x, y, z, w)

+	{

+	public:

+		RegisterArray(bool dynamic = D) : dynamic(dynamic)

+		{

+			if(dynamic)

+			{

+				x = new Array<Float4>(S);   // One Array object constructed with argument S (scalar new)

+				y = new Array<Float4>(S);

+				z = new Array<Float4>(S);

+				w = new Array<Float4>(S);

+			}

+			else

+			{

+				x = new Array<Float4>[S];   // S default-constructed Array objects (array new)

+				y = new Array<Float4>[S];

+				z = new Array<Float4>[S];

+				w = new Array<Float4>[S];

+			}

+		}

+

+		~RegisterArray()   // Releases each member with the delete form matching how the constructor allocated it

+		{

+			if(dynamic) { delete x; } else { delete[] x; }   // Scalar new must be paired with delete, array new with delete[]

+			if(dynamic) { delete y; } else { delete[] y; }

+			if(dynamic) { delete z; } else { delete[] z; }

+			if(dynamic) { delete w; } else { delete[] w; }

+		}

+

+		Register operator[](int i)   // Register i, addressed with a constant index

+		{

+			if(dynamic)

+			{

+				return Register(x[0][i], y[0][i], z[0][i], w[0][i]);   // Element i of the single Array

+			}

+			else

+			{

+				return Register(x[i][0], y[i][0], z[i][0], w[i][0]);   // Element 0 of the i-th Array

+			}

+		}

+

+		Register operator[](RValue<Int> i)   // Register addressed with a Reactor Int index; only valid in dynamic mode

+		{

+			ASSERT(dynamic);

+

+			return Register(x[0][i], y[0][i], z[0][i], w[0][i]);

+		}

+

+	private:   // NOTE(review): no copy constructor or assignment is declared, so copying would duplicate these owning pointers

+		const bool dynamic;

+		Array<Float4> *x;

+		Array<Float4> *y;

+		Array<Float4> *z;

+		Array<Float4> *w;

+	};

+

 	class ShaderCore

 	{

-		typedef Shader::Instruction::Operation::Control Control;

-		typedef Shader::Instruction::Operation Op;

+		typedef Shader::Control Control;   // Type of the 'control' argument of cmp() and icmp()

 

 	public:

-		void mov(Color4f &dst, Color4f &src, bool floorToInteger = false);

-		void add(Color4f &dst, Color4f &src0, Color4f &src1);

-		void sub(Color4f &dst, Color4f &src0, Color4f &src1);

-		void mad(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2);

-		void mul(Color4f &dst, Color4f &src0, Color4f &src1);

-		void rcp(Color4f &dst, Color4f &src, bool pp = false);

-		void rsq(Color4f &dst, Color4f &src, bool pp = false);

-		void dp3(Color4f &dst, Color4f &src0, Color4f &src1);

-		void dp4(Color4f &dst, Color4f &src0, Color4f &src1);

-		void min(Color4f &dst, Color4f &src0, Color4f &src1);

-		void max(Color4f &dst, Color4f &src0, Color4f &src1);

-		void slt(Color4f &dst, Color4f &src0, Color4f &src1);

-		void sge(Color4f &dst, Color4f &src0, Color4f &src1);

-		void exp(Color4f &dst, Color4f &src, bool pp = false);

-		void log(Color4f &dst, Color4f &src, bool pp = false);

-		void lit(Color4f &dst, Color4f &src);

-		void dst(Color4f &dst, Color4f &src0, Color4f &src1);

-		void lrp(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2);

-		void frc(Color4f &dst, Color4f &src);

-		void pow(Color4f &dst, Color4f &src0, Color4f &src1, bool pp = false);

-		void crs(Color4f &dst, Color4f &src0, Color4f &src1);

-		void sgn(Color4f &dst, Color4f &src);

-		void abs(Color4f &dst, Color4f &src);

-		void nrm(Color4f &dst, Color4f &src, bool pp = false);

-		void sincos(Color4f &dst, Color4f &src, bool pp = false);

-		void expp(Color4f &dst, Color4f &src, unsigned short version);

-		void logp(Color4f &dst, Color4f &src, unsigned short version);

-		void cmp(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2);

-		void dp2add(Color4f &dst, Color4f &src0, Color4f &src1, Color4f &src2);

-		void setp(Color4f &dst, Color4f &src0, Color4f &src1, Control control);

+		void mov(Vector4f &dst, Vector4f &src, bool floorToInteger = false);

+		void f2b(Vector4f &dst, Vector4f &src);

+		void b2f(Vector4f &dst, Vector4f &src);

+		void add(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void sub(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void mad(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void mul(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void rcpx(Vector4f &dst, Vector4f &src, bool pp = false);

+		void div(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void mod(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void rsqx(Vector4f &dst, Vector4f &src, bool pp = false);

+		void sqrt(Vector4f &dst, Vector4f &src, bool pp = false);

+		void rsq(Vector4f &dst, Vector4f &src, bool pp = false);

+		void len2(Float4 &dst, Vector4f &src, bool pp = false);   // len* and dist* write one Float4 instead of a Vector4f

+		void len3(Float4 &dst, Vector4f &src, bool pp = false);

+		void len4(Float4 &dst, Vector4f &src, bool pp = false);

+		void dist1(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp = false);

+		void dist2(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp = false);

+		void dist3(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp = false);

+		void dist4(Float4 &dst, Vector4f &src0, Vector4f &src1, bool pp = false);

+		void dp1(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void dp2(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void dp2add(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void dp3(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void dp4(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void min(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void max(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void slt(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void step(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void exp2x(Vector4f &dst, Vector4f &src, bool pp = false);

+		void exp2(Vector4f &dst, Vector4f &src, bool pp = false);

+		void exp(Vector4f &dst, Vector4f &src, bool pp = false);

+		void log2x(Vector4f &dst, Vector4f &src, bool pp = false);

+		void log2(Vector4f &dst, Vector4f &src, bool pp = false);

+		void log(Vector4f &dst, Vector4f &src, bool pp = false);

+		void lit(Vector4f &dst, Vector4f &src);

+		void att(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void lrp(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void smooth(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void frc(Vector4f &dst, Vector4f &src);

+		void trunc(Vector4f &dst, Vector4f &src);

+		void floor(Vector4f &dst, Vector4f &src);

+		void ceil(Vector4f &dst, Vector4f &src);

+		void powx(Vector4f &dst, Vector4f &src0, Vector4f &src1, bool pp = false);

+		void pow(Vector4f &dst, Vector4f &src0, Vector4f &src1, bool pp = false);

+		void crs(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void forward1(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void forward2(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void forward3(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void forward4(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void reflect1(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void reflect2(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void reflect3(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void reflect4(Vector4f &dst, Vector4f &src0, Vector4f &src1);

+		void refract1(Vector4f &dst, Vector4f &src0, Vector4f &src1, Float4 &src2);   // refract*: src2 is a single Float4 (forward* takes a Vector4f src2)

+		void refract2(Vector4f &dst, Vector4f &src0, Vector4f &src1, Float4 &src2);

+		void refract3(Vector4f &dst, Vector4f &src0, Vector4f &src1, Float4 &src2);

+		void refract4(Vector4f &dst, Vector4f &src0, Vector4f &src1, Float4 &src2);

+		void sgn(Vector4f &dst, Vector4f &src);

+		void abs(Vector4f &dst, Vector4f &src);

+		void nrm2(Vector4f &dst, Vector4f &src, bool pp = false);

+		void nrm3(Vector4f &dst, Vector4f &src, bool pp = false);

+		void nrm4(Vector4f &dst, Vector4f &src, bool pp = false);

+		void sincos(Vector4f &dst, Vector4f &src, bool pp = false);

+		void cos(Vector4f &dst, Vector4f &src, bool pp = false);

+		void sin(Vector4f &dst, Vector4f &src, bool pp = false);

+		void tan(Vector4f &dst, Vector4f &src, bool pp = false);

+		void acos(Vector4f &dst, Vector4f &src, bool pp = false);

+		void asin(Vector4f &dst, Vector4f &src, bool pp = false);

+		void atan(Vector4f &dst, Vector4f &src, bool pp = false);

+		void atan2(Vector4f &dst, Vector4f &src0, Vector4f &src1, bool pp = false);

+		void expp(Vector4f &dst, Vector4f &src, unsigned short version);

+		void logp(Vector4f &dst, Vector4f &src, unsigned short version);

+		void cmp0(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);   // Four-operand form; cmp() itself now takes a Control, as the removed setp() did

+		void cmp(Vector4f &dst, Vector4f &src0, Vector4f &src1, Control control);

+		void icmp(Vector4f &dst, Vector4f &src0, Vector4f &src1, Control control);

+		void select(Vector4f &dst, Vector4f &src0, Vector4f &src1, Vector4f &src2);

+		void extract(Float4 &dst, Vector4f &src0, Float4 &src1);

+		void insert(Vector4f &dst, Vector4f &src, Float4 &element, Float4 &index);

+		void all(Float4 &dst, Vector4f &src);

+		void any(Float4 &dst, Vector4f &src);

+		void not(Vector4f &dst, Vector4f &src);   // NOTE(review): 'not', 'or', 'xor' and 'and' are alternative operator tokens in standard C++; these four names only compile where the compiler does not reserve them - confirm the target toolchains

+		void or(Float4 &dst, Float4 &src0, Float4 &src1);

+		void xor(Float4 &dst, Float4 &src0, Float4 &src1);

+		void and(Float4 &dst, Float4 &src0, Float4 &src1);

 

 	private:

 		void sgn(Float4 &dst, Float4 &src);

-		void cmp(Float4 &dst, Float4 &src0, Float4 &src1, Float4 &src2);

+		void cmp0(Float4 &dst, Float4 &src0, Float4 &src1, Float4 &src2);

+		void select(Float4 &dst, RValue<Int4> src0, Float4 &src1, Float4 &src2);   // src0 is an Int4 value here, unlike the Vector4f src0 of the public select()

 	};

 }

 

diff --git a/src/Shader/VertexPipeline.cpp b/src/Shader/VertexPipeline.cpp
index 112b451..1e39ca7 100644
--- a/src/Shader/VertexPipeline.cpp
+++ b/src/Shader/VertexPipeline.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -24,7 +24,7 @@
 
 namespace sw
 {
-	VertexPipeline::VertexPipeline(const VertexProcessor::State &state) : VertexRoutine(state)
+	VertexPipeline::VertexPipeline(const VertexProcessor::State &state) : VertexRoutine(state, 0)
 	{
 	}
 
@@ -32,9 +32,9 @@
 	{
 	}
 
-	Color4f VertexPipeline::transformBlend(Registers &r, Color4f &src, Pointer<Byte> &matrix, bool homogeneous)
+	Vector4f VertexPipeline::transformBlend(Registers &r, Register &src, Pointer<Byte> &matrix, bool homogeneous)
 	{
-		Color4f dst;
+		Vector4f dst;
 
 		if(state.vertexBlendMatrixCount == 0)
 		{
@@ -51,14 +51,15 @@
 			{
 				for(int i = 0; i < 4; i++)
 				{
+					Float4 B = r.v[BlendIndices].x;
 					UInt indices;
 					
 					switch(i)
 					{
-					case 0: indices = As<UInt>(Float(r.v[BlendIndices].x.x)); break;
-					case 1: indices = As<UInt>(Float(r.v[BlendIndices].x.y)); break;
-					case 2: indices = As<UInt>(Float(r.v[BlendIndices].x.z)); break;
-					case 3: indices = As<UInt>(Float(r.v[BlendIndices].x.w)); break;
+					case 0: indices = As<UInt>(Float(B.x)); break;
+					case 1: indices = As<UInt>(Float(B.y)); break;
+					case 2: indices = As<UInt>(Float(B.z)); break;
+					case 3: indices = As<UInt>(Float(B.w)); break;
 					}
 
 					index0[i] = (indices & UInt(0x000000FF)) << UInt(6);   // FIXME: (indices & 0x000000FF) << 6
@@ -100,13 +101,13 @@
 			{
 				weight1 = Float4(1.0f) - weight0;
 
-				Color4f pos0;
-				Color4f pos1;
+				Vector4f pos0;
+				Vector4f pos1;
 
 				pos0 = transform(src, matrix, index0, homogeneous);
 				pos1 = transform(src, matrix, index1, homogeneous);
 
-				dst.x = pos0.x * weight0 + pos1.x * weight1;   // FIXME: Color4f operators
+				dst.x = pos0.x * weight0 + pos1.x * weight1;   // FIXME: Vector4f operators
 				dst.y = pos0.y * weight0 + pos1.y * weight1;
 				dst.z = pos0.z * weight0 + pos1.z * weight1;
 				dst.w = pos0.w * weight0 + pos1.w * weight1;
@@ -115,9 +116,9 @@
 			{
 				weight2 = Float4(1.0f) - (weight0 + weight1);
 
-				Color4f pos0;
-				Color4f pos1;
-				Color4f pos2;
+				Vector4f pos0;
+				Vector4f pos1;
+				Vector4f pos2;
 
 				pos0 = transform(src, matrix, index0, homogeneous);
 				pos1 = transform(src, matrix, index1, homogeneous);
@@ -132,10 +133,10 @@
 			{
 				weight3 = Float4(1.0f) - (weight0 + weight1 + weight2);
 
-				Color4f pos0;
-				Color4f pos1;
-				Color4f pos2;
-				Color4f pos3;
+				Vector4f pos0;
+				Vector4f pos1;
+				Vector4f pos2;
+				Vector4f pos3;
 
 				pos0 = transform(src, matrix, index0, homogeneous);
 				pos1 = transform(src, matrix, index1, homogeneous);
@@ -154,8 +155,8 @@
 
 	void VertexPipeline::pipeline(Registers &r)
 	{
-		Color4f position;
-		Color4f normal;
+		Vector4f position;
+		Vector4f normal;
 
 		if(!state.preTransformed)
 		{
@@ -166,10 +167,10 @@
 			position = r.v[PositionT];
 		}
 
-		r.ox[Pos] = position.x;
-		r.oy[Pos] = position.y;
-		r.oz[Pos] = position.z;
-		r.ow[Pos] = position.w;
+		r.o[Pos].x = position.x;
+		r.o[Pos].y = position.y;
+		r.o[Pos].z = position.z;
+		r.o[Pos].w = position.w;
 
 		if(state.vertexNormalActive)
 		{
@@ -186,59 +187,59 @@
 			// FIXME: Don't process if not used at all
 			if(state.diffuseActive && state.input[Color0])
 			{
-				Color4f diffuse = r.v[Color0];
+				Vector4f diffuse = r.v[Color0];
 
-				r.ox[D0] = diffuse.x;
-				r.oy[D0] = diffuse.y;
-				r.oz[D0] = diffuse.z;
-				r.ow[D0] = diffuse.w;
+				r.o[D0].x = diffuse.x;
+				r.o[D0].y = diffuse.y;
+				r.o[D0].z = diffuse.z;
+				r.o[D0].w = diffuse.w;
 			}
 			else
 			{
-				r.ox[D0] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-				r.oy[D0] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-				r.oz[D0] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-				r.ow[D0] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+				r.o[D0].x = Float4(1.0f);
+				r.o[D0].y = Float4(1.0f);
+				r.o[D0].z = Float4(1.0f);
+				r.o[D0].w = Float4(1.0f);
 			}
 
 			// FIXME: Don't process if not used at all
 			if(state.specularActive && state.input[Color1])
 			{
-				Color4f specular = r.v[Color1];
+				Vector4f specular = r.v[Color1];
 
-				r.ox[D1] = specular.x;
-				r.oy[D1] = specular.y;
-				r.oz[D1] = specular.z;
-				r.ow[D1] = specular.w;
+				r.o[D1].x = specular.x;
+				r.o[D1].y = specular.y;
+				r.o[D1].z = specular.z;
+				r.o[D1].w = specular.w;
 			}
 			else
 			{
-				r.ox[D1] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-				r.oy[D1] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-				r.oz[D1] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-				r.ow[D1] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+				r.o[D1].x = Float4(0.0f);
+				r.o[D1].y = Float4(0.0f);
+				r.o[D1].z = Float4(0.0f);
+				r.o[D1].w = Float4(1.0f);
 			}
 		}
 		else
 		{
-			Color4f diffuseSum;
+			Vector4f diffuseSum;
 
-			r.ox[D0] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			r.oy[D0] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			r.oz[D0] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			r.ow[D0] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+			r.o[D0].x = Float4(0.0f);
+			r.o[D0].y = Float4(0.0f);
+			r.o[D0].z = Float4(0.0f);
+			r.o[D0].w = Float4(0.0f);
 
-			r.ox[D1] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			r.oy[D1] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			r.oz[D1] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			r.ow[D1] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+			r.o[D1].x = Float4(0.0f);
+			r.o[D1].y = Float4(0.0f);
+			r.o[D1].z = Float4(0.0f);
+			r.o[D1].w = Float4(0.0f);
 
-			diffuseSum.x = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			diffuseSum.y = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			diffuseSum.z = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-			diffuseSum.w = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+			diffuseSum.x = Float4(0.0f);
+			diffuseSum.y = Float4(0.0f);
+			diffuseSum.z = Float4(0.0f);
+			diffuseSum.w = Float4(0.0f);
 
-			Color4f vertexPosition = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);
+			Vector4f vertexPosition = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);
 
 			for(int i = 0; i < 8; i++)
 			{
@@ -247,7 +248,7 @@
 					continue;
 				}
 
-				Color4f L;    // Light vector
+				Vector4f L;    // Light vector
 				Float4 att;   // Attenuation
 
 				// Attenuation
@@ -280,9 +281,9 @@
 				{
 					Float4 lightAmbient = *Pointer<Float4>(r.data + OFFSET(DrawData,ff.lightAmbient[i]));   // FIXME: Unpack
 
-					r.ox[D0] = r.ox[D0] + lightAmbient.x * att;
-					r.oy[D0] = r.oy[D0] + lightAmbient.y * att;
-					r.oz[D0] = r.oz[D0] + lightAmbient.z * att;
+					r.o[D0].x = r.o[D0].x + lightAmbient.x * att;
+					r.o[D0].y = r.o[D0].y + lightAmbient.y * att;
+					r.o[D0].z = r.o[D0].z + lightAmbient.z * att;
 				}
 
 				// Diffuse
@@ -291,10 +292,10 @@
 					Float4 dot;
 
 					dot = dot3(L, normal);
-					dot = Max(dot, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+					dot = Max(dot, Float4(0.0f));
 					dot *= att;
 
-					Color4f diff;
+					Vector4f diff;
 
 					if(state.vertexDiffuseMaterialSourceActive == Context::MATERIAL)
 					{
@@ -323,15 +324,15 @@
 				// Specular
 				if(state.vertexSpecularActive)
 				{
-					Color4f S;
-					Color4f C;   // Camera vector
+					Vector4f S;
+					Vector4f C;   // Camera vector
 					Float4 pow;
 
 					pow = *Pointer<Float>(r.data + OFFSET(DrawData,ff.materialShininess));
 
-					S.x = Float4(0.0f, 0.0f, 0.0f, 0.0f) - vertexPosition.x;
-					S.y = Float4(0.0f, 0.0f, 0.0f, 0.0f) - vertexPosition.y;
-					S.z = Float4(0.0f, 0.0f, 0.0f, 0.0f) - vertexPosition.z;
+					S.x = Float4(0.0f) - vertexPosition.x;
+					S.y = Float4(0.0f) - vertexPosition.y;
+					S.z = Float4(0.0f) - vertexPosition.z;
 					C = normalize(S);
 
 					S.x = L.x + C.x;
@@ -344,7 +345,7 @@
 					Float4 P = power(dot, pow);
 					P *= att;
 
-					Color4f spec;
+					Vector4f spec;
 
 					if(state.vertexSpecularMaterialSourceActive == Context::MATERIAL)
 					{
@@ -374,93 +375,93 @@
 					spec.y *= P;
 					spec.z *= P;
 
-					spec.x = Max(spec.x, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					spec.y = Max(spec.y, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					spec.z = Max(spec.z, Float4(0.0f, 0.0f, 0.0f, 0.0f));
+					spec.x = Max(spec.x, Float4(0.0f));
+					spec.y = Max(spec.y, Float4(0.0f));
+					spec.z = Max(spec.z, Float4(0.0f));
 
-					r.ox[D1] = r.ox[D1] + spec.x;
-					r.oy[D1] = r.oy[D1] + spec.y;
-					r.oz[D1] = r.oz[D1] + spec.z;
+					r.o[D1].x = r.o[D1].x + spec.x;
+					r.o[D1].y = r.o[D1].y + spec.y;
+					r.o[D1].z = r.o[D1].z + spec.z;
 				}
 			}
 
 			Float4 globalAmbient = *Pointer<Float4>(r.data + OFFSET(DrawData,ff.globalAmbient));   // FIXME: Unpack
 
-			r.ox[D0] = r.ox[D0] + globalAmbient.x;
-			r.oy[D0] = r.oy[D0] + globalAmbient.y;
-			r.oz[D0] = r.oz[D0] + globalAmbient.z;
+			r.o[D0].x = r.o[D0].x + globalAmbient.x;
+			r.o[D0].y = r.o[D0].y + globalAmbient.y;
+			r.o[D0].z = r.o[D0].z + globalAmbient.z;
 
 			if(state.vertexAmbientMaterialSourceActive == Context::MATERIAL)
 			{
 				Float4 materialAmbient = *Pointer<Float4>(r.data + OFFSET(DrawData,ff.materialAmbient));   // FIXME: Unpack
 
-				r.ox[D0] = r.ox[D0] * materialAmbient.x;
-				r.oy[D0] = r.oy[D0] * materialAmbient.y;
-				r.oz[D0] = r.oz[D0] * materialAmbient.z;
+				r.o[D0].x = r.o[D0].x * materialAmbient.x;
+				r.o[D0].y = r.o[D0].y * materialAmbient.y;
+				r.o[D0].z = r.o[D0].z * materialAmbient.z;
 			}
 			else if(state.vertexAmbientMaterialSourceActive == Context::COLOR1)
 			{
-				Color4f materialDiffuse = r.v[Color0];
+				Vector4f materialDiffuse = r.v[Color0];
 
-				r.ox[D0] = r.ox[D0] * materialDiffuse.x;
-				r.oy[D0] = r.oy[D0] * materialDiffuse.y;
-				r.oz[D0] = r.oz[D0] * materialDiffuse.z;
+				r.o[D0].x = r.o[D0].x * materialDiffuse.x;
+				r.o[D0].y = r.o[D0].y * materialDiffuse.y;
+				r.o[D0].z = r.o[D0].z * materialDiffuse.z;
 			}
 			else if(state.vertexAmbientMaterialSourceActive == Context::COLOR2)
 			{
-				Color4f materialSpecular = r.v[Color1];
+				Vector4f materialSpecular = r.v[Color1];
 
-				r.ox[D0] = r.ox[D0] * materialSpecular.x;
-				r.oy[D0] = r.oy[D0] * materialSpecular.y;
-				r.oz[D0] = r.oz[D0] * materialSpecular.z;
+				r.o[D0].x = r.o[D0].x * materialSpecular.x;
+				r.o[D0].y = r.o[D0].y * materialSpecular.y;
+				r.o[D0].z = r.o[D0].z * materialSpecular.z;
 			}
 			else ASSERT(false);
 
-			r.ox[D0] = r.ox[D0] + diffuseSum.x;
-			r.oy[D0] = r.oy[D0] + diffuseSum.y;
-			r.oz[D0] = r.oz[D0] + diffuseSum.z;
+			r.o[D0].x = r.o[D0].x + diffuseSum.x;
+			r.o[D0].y = r.o[D0].y + diffuseSum.y;
+			r.o[D0].z = r.o[D0].z + diffuseSum.z;
 
 			// Emissive
 			if(state.vertexEmissiveMaterialSourceActive == Context::MATERIAL)
 			{
 				Float4 materialEmission = *Pointer<Float4>(r.data + OFFSET(DrawData,ff.materialEmission));   // FIXME: Unpack
 
-				r.ox[D0] = r.ox[D0] + materialEmission.x;
-				r.oy[D0] = r.oy[D0] + materialEmission.y;
-				r.oz[D0] = r.oz[D0] + materialEmission.z;
+				r.o[D0].x = r.o[D0].x + materialEmission.x;
+				r.o[D0].y = r.o[D0].y + materialEmission.y;
+				r.o[D0].z = r.o[D0].z + materialEmission.z;
 			}
 			else if(state.vertexEmissiveMaterialSourceActive == Context::COLOR1)
 			{
-				Color4f materialSpecular = r.v[Color0];
+				Vector4f materialSpecular = r.v[Color0];
 
-				r.ox[D0] = r.ox[D0] + materialSpecular.x;
-				r.oy[D0] = r.oy[D0] + materialSpecular.y;
-				r.oz[D0] = r.oz[D0] + materialSpecular.z;
+				r.o[D0].x = r.o[D0].x + materialSpecular.x;
+				r.o[D0].y = r.o[D0].y + materialSpecular.y;
+				r.o[D0].z = r.o[D0].z + materialSpecular.z;
 			}
 			else if(state.vertexEmissiveMaterialSourceActive == Context::COLOR2)
 			{
-				Color4f materialSpecular = r.v[Color1];
+				Vector4f materialSpecular = r.v[Color1];
 
-				r.ox[D0] = r.ox[D0] + materialSpecular.x;
-				r.oy[D0] = r.oy[D0] + materialSpecular.y;
-				r.oz[D0] = r.oz[D0] + materialSpecular.z;
+				r.o[D0].x = r.o[D0].x + materialSpecular.x;
+				r.o[D0].y = r.o[D0].y + materialSpecular.y;
+				r.o[D0].z = r.o[D0].z + materialSpecular.z;
 			}
 			else ASSERT(false);
 
 			// Diffuse alpha component
 			if(state.vertexDiffuseMaterialSourceActive == Context::MATERIAL)
 			{
-				r.ow[D0] = Float4(*Pointer<Float4>(r.data + OFFSET(DrawData,ff.materialDiffuse[0]))).wwww;   // FIXME: Unpack
+				r.o[D0].w = Float4(*Pointer<Float4>(r.data + OFFSET(DrawData,ff.materialDiffuse[0]))).wwww;   // FIXME: Unpack
 			}
 			else if(state.vertexDiffuseMaterialSourceActive == Context::COLOR1)
 			{
-				Color4f alpha = r.v[Color0];
-				r.ow[D0] = alpha.w;
+				Vector4f alpha = r.v[Color0];
+				r.o[D0].w = alpha.w;
 			}
 			else if(state.vertexDiffuseMaterialSourceActive == Context::COLOR2)
 			{
-				Color4f alpha = r.v[Color1];
-				r.ow[D0] = alpha.w;
+				Vector4f alpha = r.v[Color1];
+				r.o[D0].w = alpha.w;
 			}
 			else ASSERT(false);
 
@@ -469,17 +470,17 @@
 				// Specular alpha component
 				if(state.vertexSpecularMaterialSourceActive == Context::MATERIAL)
 				{
-					r.ow[D1] = Float4(*Pointer<Float4>(r.data + OFFSET(DrawData,ff.materialSpecular[3]))).wwww;   // FIXME: Unpack
+					r.o[D1].w = Float4(*Pointer<Float4>(r.data + OFFSET(DrawData,ff.materialSpecular[3]))).wwww;   // FIXME: Unpack
 				}
 				else if(state.vertexSpecularMaterialSourceActive == Context::COLOR1)
 				{
-					Color4f alpha = r.v[Color0];
-					r.ow[D1] = alpha.w;
+					Vector4f alpha = r.v[Color0];
+					r.o[D1].w = alpha.w;
 				}
 				else if(state.vertexSpecularMaterialSourceActive == Context::COLOR2)
 				{
-					Color4f alpha = r.v[Color1];
-					r.ow[D1] = alpha.w;
+					Vector4f alpha = r.v[Color1];
+					r.o[D1].w = alpha.w;
 				}
 				else ASSERT(false);
 			}
@@ -492,11 +493,11 @@
 			case Context::FOG_NONE:
 				if(state.specularActive)
 				{
-					r.ox[Fog] = r.ow[D1];
+					r.o[Fog].x = r.o[D1].w;
 				}
 				else
 				{
-					r.ox[Fog] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+					r.o[Fog].x = Float4(0.0f);
 				}
 				break;
 			case Context::FOG_LINEAR:
@@ -504,21 +505,21 @@
 			case Context::FOG_EXP2:
 				if(!state.rangeFogActive)
 				{
-					r.ox[Fog] = r.oz[Pos];
+					r.o[Fog].x = r.o[Pos].z;
 				}
 				else
 				{
-					Color4f pos;
+					Vector4f pos;
 
-					pos.x = r.ox[Pos];
-					pos.y = r.oy[Pos];
-					pos.z = r.oz[Pos];
-					pos.w = r.ow[Pos];
+					pos.x = r.o[Pos].x;
+					pos.y = r.o[Pos].y;
+					pos.z = r.o[Pos].z;
+					pos.w = r.o[Pos].w;
 
-					r.ox[Fog] = Sqrt(dot3(pos, pos));   // FIXME: oFog = length(o[Pos]);
+					r.o[Fog].x = Sqrt(dot3(pos, pos));   // FIXME: oFog = length(o[Pos]);
 				}
 
-				r.ox[Fog] = r.ox[Fog] * *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale)) + *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
+				r.o[Fog].x = r.o[Fog].x * *Pointer<Float4>(r.data + OFFSET(DrawData,fog.scale)) + *Pointer<Float4>(r.data + OFFSET(DrawData,fog.offset));
 				break;
 			default:
 				ASSERT(false);
@@ -533,7 +534,7 @@
 		processPointSize(r);
 	}
 
-	void VertexPipeline::processTextureCoordinate(Registers &r, int stage, Color4f &normal, Color4f &position)
+	void VertexPipeline::processTextureCoordinate(Registers &r, int stage, Vector4f &normal, Vector4f &position)
 	{
 		if(state.output[T0 + stage].write)
 		{
@@ -543,28 +544,28 @@
 			{
 			case Context::TEXGEN_PASSTHRU:
 				{
-					Color4f v = r.v[TexCoord0 + i];
+					Vector4f v = r.v[TexCoord0 + i];
 
-					r.ox[T0 + stage] = v.x;
-					r.oy[T0 + stage] = v.y;
-					r.oz[T0 + stage] = v.z;
-					r.ow[T0 + stage] = v.w;
+					r.o[T0 + stage].x = v.x;
+					r.o[T0 + stage].y = v.y;
+					r.o[T0 + stage].z = v.z;
+					r.o[T0 + stage].w = v.w;
 
-					if(state.input[TexCoord0 + i].type == STREAMTYPE_FLOAT)
+					if(state.input[TexCoord0 + i])
 					{
 						switch(state.input[TexCoord0 + i].count)
 						{
 						case 1:
-							r.oy[T0 + stage] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-							r.oz[T0 + stage] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-							r.ow[T0 + stage] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+							r.o[T0 + stage].y = Float4(1.0f);
+							r.o[T0 + stage].z = Float4(0.0f);
+							r.o[T0 + stage].w = Float4(0.0f);
 							break;
 						case 2:
-							r.oz[T0 + stage] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-							r.ow[T0 + stage] = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+							r.o[T0 + stage].z = Float4(1.0f);
+							r.o[T0 + stage].w = Float4(0.0f);
 							break;
 						case 3:
-							r.ow[T0 + stage] = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+							r.o[T0 + stage].w = Float4(1.0f);
 							break;
 						case 4:
 							break;
@@ -572,12 +573,11 @@
 							ASSERT(false);
 						}
 					}
-					else ASSERT(!state.input[TexCoord0 + i]);   // Point sprite; coordinates provided by setup
 				}
 				break;
 			case Context::TEXGEN_NORMAL:
 				{
-					Color4f Nc;   // Normal vector in camera space
+					Vector4f Nc;   // Normal vector in camera space
 
 					if(state.vertexNormalActive)
 					{
@@ -585,51 +585,51 @@
 					}
 					else
 					{
-						Nc.x = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-						Nc.y = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-						Nc.z = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+						Nc.x = Float4(0.0f);
+						Nc.y = Float4(0.0f);
+						Nc.z = Float4(0.0f);
 					}
 
-					Nc.w = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					Nc.w = Float4(1.0f);
 					
-					r.ox[T0 + stage] = Nc.x;
-					r.oy[T0 + stage] = Nc.y;
-					r.oz[T0 + stage] = Nc.z;
-					r.ow[T0 + stage] = Nc.w;
+					r.o[T0 + stage].x = Nc.x;
+					r.o[T0 + stage].y = Nc.y;
+					r.o[T0 + stage].z = Nc.z;
+					r.o[T0 + stage].w = Nc.w;
 				}
 				break;
 			case Context::TEXGEN_POSITION:
 				{
-					Color4f Pn = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);   // Position in camera space
+					Vector4f Pn = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);   // Position in camera space
 
-					Pn.w = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					Pn.w = Float4(1.0f);
 					
-					r.ox[T0 + stage] = Pn.x;
-					r.oy[T0 + stage] = Pn.y;
-					r.oz[T0 + stage] = Pn.z;
-					r.ow[T0 + stage] = Pn.w;
+					r.o[T0 + stage].x = Pn.x;
+					r.o[T0 + stage].y = Pn.y;
+					r.o[T0 + stage].z = Pn.z;
+					r.o[T0 + stage].w = Pn.w;
 				}
 				break;
 			case Context::TEXGEN_REFLECTION:
 				{
-					Color4f R;   // Reflection vector
+					Vector4f R;   // Reflection vector
 
 					if(state.vertexNormalActive)
 					{
-						Color4f Nc;   // Normal vector in camera space
+						Vector4f Nc;   // Normal vector in camera space
 
 						Nc = normal;
 
 						if(state.localViewerActive)
 						{
-							Color4f Ec;   // Eye vector in camera space
-							Color4f N2;
+							Vector4f Ec;   // Eye vector in camera space
+							Vector4f N2;
 
 							Ec = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);
 							Ec = normalize(Ec);
 
 							// R = E - 2 * N * (E . N)
-							Float4 dot = Float4(2.0f, 2.0f, 2.0f, 2.0f) * dot3(Ec, Nc);
+							Float4 dot = Float4(2.0f) * dot3(Ec, Nc);
 
 							R.x = Ec.x - Nc.x * dot;
 							R.y = Ec.y - Nc.y * dot;
@@ -641,46 +641,46 @@
 							// v = -2 * Nz * Ny
 							// w = 1 - 2 * Nz * Nz
 
-							R.x = -Float4(2.0f, 2.0f, 2.0f, 2.0f) * Nc.z * Nc.x;
-							R.y = -Float4(2.0f, 2.0f, 2.0f, 2.0f) * Nc.z * Nc.y;
-							R.z = Float4(1.0f, 1.0f, 1.0f, 1.0f) - Float4(2.0f, 2.0f, 2.0f, 2.0f) * Nc.z * Nc.z;
+							R.x = -Float4(2.0f) * Nc.z * Nc.x;
+							R.y = -Float4(2.0f) * Nc.z * Nc.y;
+							R.z = Float4(1.0f) - Float4(2.0f) * Nc.z * Nc.z;
 						}
 					}
 					else
 					{
-						R.x = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-						R.y = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-						R.z = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+						R.x = Float4(0.0f);
+						R.y = Float4(0.0f);
+						R.z = Float4(0.0f);
 					}
 
-					R.w = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					R.w = Float4(1.0f);
 
-					r.ox[T0 + stage] = R.x;
-					r.oy[T0 + stage] = R.y;
-					r.oz[T0 + stage] = R.z;
-					r.ow[T0 + stage] = R.w;
+					r.o[T0 + stage].x = R.x;
+					r.o[T0 + stage].y = R.y;
+					r.o[T0 + stage].z = R.z;
+					r.o[T0 + stage].w = R.w;
 				}
 				break;
 			case Context::TEXGEN_SPHEREMAP:
 				{
-					Color4f R;   // Reflection vector
+					Vector4f R;   // Reflection vector
 
 					if(state.vertexNormalActive)
 					{
-						Color4f Nc;   // Normal vector in camera space
+						Vector4f Nc;   // Normal vector in camera space
 
 						Nc = normal;
 
 						if(state.localViewerActive)
 						{
-							Color4f Ec;   // Eye vector in camera space
-							Color4f N2;
+							Vector4f Ec;   // Eye vector in camera space
+							Vector4f N2;
 
 							Ec = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);
 							Ec = normalize(Ec);
 
 							// R = E - 2 * N * (E . N)
-							Float4 dot = Float4(2.0f, 2.0f, 2.0f, 2.0f) * dot3(Ec, Nc);
+							Float4 dot = Float4(2.0f) * dot3(Ec, Nc);
 							
 							R.x = Ec.x - Nc.x * dot;
 							R.y = Ec.y - Nc.y * dot;
@@ -692,48 +692,48 @@
 							// v = -2 * Nz * Ny
 							// w = 1 - 2 * Nz * Nz
 
-							R.x = -Float4(2.0f, 2.0f, 2.0f, 2.0f) * Nc.z * Nc.x;
-							R.y = -Float4(2.0f, 2.0f, 2.0f, 2.0f) * Nc.z * Nc.y;
-							R.z = Float4(1.0f, 1.0f, 1.0f, 1.0f) - Float4(2.0f, 2.0f, 2.0f, 2.0f) * Nc.z * Nc.z;
+							R.x = -Float4(2.0f) * Nc.z * Nc.x;
+							R.y = -Float4(2.0f) * Nc.z * Nc.y;
+							R.z = Float4(1.0f) - Float4(2.0f) * Nc.z * Nc.z;
 						}
 					}
 					else
 					{
-						R.x = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-						R.y = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-						R.z = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+						R.x = Float4(0.0f);
+						R.y = Float4(0.0f);
+						R.z = Float4(0.0f);
 					}
 
-					R.z -= Float4(1.0f, 1.0f, 1.0f, 1.0f);
+					R.z -= Float4(1.0f);
 					R = normalize(R);
-					R.x = Float4(0.5f, 0.5f, 0.5f, 0.5f) * R.x + Float4(0.5f, 0.5f, 0.5f, 0.5f);
-					R.y = Float4(0.5f, 0.5f, 0.5f, 0.5f) * R.y + Float4(0.5f, 0.5f, 0.5f, 0.5f);
+					R.x = Float4(0.5f) * R.x + Float4(0.5f);
+					R.y = Float4(0.5f) * R.y + Float4(0.5f);
 
-					R.z = Float4(1.0f, 1.0f, 1.0f, 1.0f);
-					R.w = Float4(0.0f, 0.0f, 0.0f, 0.0f);
+					R.z = Float4(1.0f);
+					R.w = Float4(0.0f);
 
-					r.ox[T0 + stage] = R.x;
-					r.oy[T0 + stage] = R.y;
-					r.oz[T0 + stage] = R.z;
-					r.ow[T0 + stage] = R.w;
+					r.o[T0 + stage].x = R.x;
+					r.o[T0 + stage].y = R.y;
+					r.o[T0 + stage].z = R.z;
+					r.o[T0 + stage].w = R.w;
 				}
 				break;
 			default:
 				ASSERT(false);
 			}
 
-			Color4f texTrans0;
-			Color4f texTrans1;
-			Color4f texTrans2;
-			Color4f texTrans3;
+			Vector4f texTrans0;
+			Vector4f texTrans1;
+			Vector4f texTrans2;
+			Vector4f texTrans3;
 
-			Color4f T;
-			Color4f t;
+			Vector4f T;
+			Vector4f t;
 
-			T.x = r.ox[T0 + stage];
-			T.y = r.oy[T0 + stage];
-			T.z = r.oz[T0 + stage];
-			T.w = r.ow[T0 + stage];
+			T.x = r.o[T0 + stage].x;
+			T.y = r.o[T0 + stage].y;
+			T.z = r.o[T0 + stage].z;
+			T.w = r.o[T0 + stage].w;
 
 			switch(state.textureState[stage].textureTransformCountActive)
 			{
@@ -766,10 +766,10 @@
 				texTrans0.w = texTrans0.w.wwww;
 				t.x = dot4(T, texTrans0);
 
-				r.ox[T0 + stage] = t.x;
-				r.oy[T0 + stage] = t.y;
-				r.oz[T0 + stage] = t.z;
-				r.ow[T0 + stage] = t.w;
+				r.o[T0 + stage].x = t.x;
+				r.o[T0 + stage].y = t.y;
+				r.o[T0 + stage].z = t.z;
+				r.o[T0 + stage].w = t.w;
 			case 0:
 				break;
 			default:
@@ -787,16 +787,16 @@
 
 		if(state.input[PSize])
 		{
-			r.oy[Pts] = r.v[PSize].x;
+			r.o[Pts].y = r.v[PSize].x;
 		}
 		else
 		{
-			r.oy[Pts] = *Pointer<Float4>(r.data + OFFSET(DrawData,point.pointSize));
+			r.o[Pts].y = *Pointer<Float4>(r.data + OFFSET(DrawData,point.pointSize));
 		}
 
 		if(state.pointScaleActive && !state.preTransformed)
 		{
-			Color4f p = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);
+			Vector4f p = transformBlend(r, r.v[Position], Pointer<Byte>(r.data + OFFSET(DrawData,ff.cameraTransformT)), true);
 
 			Float4 d = Sqrt(dot3(p, p));   // FIXME: length(p);
 
@@ -806,18 +806,13 @@
 
 			A = RcpSqrt_pp(A + d * (B + d * C));
 
-			r.oy[Pts] = r.oy[Pts] * Float4(*Pointer<Float>(r.data + OFFSET(DrawData,viewportHeight))) * A;   // FIXME: Unpack
+			r.o[Pts].y = r.o[Pts].y * Float4(*Pointer<Float>(r.data + OFFSET(DrawData,viewportHeight))) * A;   // FIXME: Unpack
 		}
 	}
 
-	Color4f VertexPipeline::transform(Color4f &src, Pointer<Byte> &matrix, bool homogeneous)
+	Vector4f VertexPipeline::transform(Register &src, Pointer<Byte> &matrix, bool homogeneous)
 	{
-		Color4f dst;
-
-		Color4f row0;
-		Color4f row1;
-		Color4f row2;
-		Color4f row3;
+		Vector4f dst;
 
 		if(homogeneous)
 		{
@@ -862,14 +857,9 @@
 		return dst;
 	}
 
-	Color4f VertexPipeline::transform(Color4f &src, Pointer<Byte> &matrix, UInt index[4], bool homogeneous)
+	Vector4f VertexPipeline::transform(Register &src, Pointer<Byte> &matrix, UInt index[4], bool homogeneous)
 	{
-		Color4f dst;
-		
-		Color4f row0;
-		Color4f row1;
-		Color4f row2;
-		Color4f row3;
+		Vector4f dst;
 
 		if(homogeneous)
 		{
@@ -914,9 +904,9 @@
 		return dst;
 	}
 
-	Color4f VertexPipeline::normalize(Color4f &src)
+	Vector4f VertexPipeline::normalize(Vector4f &src)
 	{
-		Color4f dst;
+		Vector4f dst;
 
 		Float4 rcpLength = RcpSqrt_pp(dot3(src, src));
 		
@@ -933,11 +923,11 @@
 				
 		dst = dst * dst;
 		dst = dst * dst;
-		dst = Float4(As<Int4>(dst) - As<Int4>(Float4(1.0f, 1.0f, 1.0f, 1.0f)));
+		dst = Float4(As<Int4>(dst) - As<Int4>(Float4(1.0f)));
 				
 		dst *= src1;
 
-		dst = As<Float4>(Int4(dst) + As<Int4>(Float4(1.0f, 1.0f, 1.0f, 1.0f)));
+		dst = As<Float4>(Int4(dst) + As<Int4>(Float4(1.0f)));
 		dst = RcpSqrt_pp(dst);
 		dst = RcpSqrt_pp(dst);
 
diff --git a/src/Shader/VertexPipeline.hpp b/src/Shader/VertexPipeline.hpp
index efb8592..bfc3093 100644
--- a/src/Shader/VertexPipeline.hpp
+++ b/src/Shader/VertexPipeline.hpp
@@ -1,42 +1,42 @@
-// SwiftShader Software Renderer
-//
-// Copyright(c) 2005-2011 TransGaming Inc.
-//
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
-//
-
-#ifndef sw_VertexPipeline_hpp
-#define sw_VertexPipeline_hpp
-
-#include "VertexRoutine.hpp"
-
-#include "Context.hpp"
-#include "VertexProcessor.hpp"
-
-namespace sw
-{
-	class VertexPipeline : public VertexRoutine
-	{
-	public:
-		VertexPipeline(const VertexProcessor::State &state);
-
-		virtual ~VertexPipeline();
-
-	private:
-		void pipeline(Registers &r);
-		void processTextureCoordinate(Registers &r, int stage, Color4f &normal, Color4f &position);
-		void processPointSize(Registers &r);
-
-		Color4f transformBlend(Registers &r, Color4f &src, Pointer<Byte> &matrix, bool homogenous);
-		Color4f transform(Color4f &src, Pointer<Byte> &matrix, bool homogenous);
-		Color4f transform(Color4f &src, Pointer<Byte> &matrix, UInt index[4], bool homogenous);
-		Color4f normalize(Color4f &src);
-		Float4 power(Float4 &src0, Float4 &src1);
-	};
-};
-
-#endif   // sw_VertexPipeline_hpp
+// SwiftShader Software Renderer

+//

+// Copyright(c) 2005-2012 TransGaming Inc.

+//

+// All rights reserved. No part of this software may be copied, distributed, transmitted,

+// transcribed, stored in a retrieval system, translated into any human or computer

+// language by any means, or disclosed to third parties without the explicit written

+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express

+// or implied, including but not limited to any patent rights, are granted to you.

+//

+

+#ifndef sw_VertexPipeline_hpp

+#define sw_VertexPipeline_hpp

+

+#include "VertexRoutine.hpp"

+

+#include "Context.hpp"

+#include "VertexProcessor.hpp"

+

+namespace sw

+{

+	class VertexPipeline : public VertexRoutine   // Vertex routine specialization; NOTE(review): presumably the fixed-function path (the .cpp reads DrawData::ff.* state) - confirm

+	{

+	public:

+		VertexPipeline(const VertexProcessor::State &state);   // Built from the vertex processor state only; takes no shader argument

+

+		virtual ~VertexPipeline();

+

+	private:

+		void pipeline(Registers &r);   // NOTE(review): presumably the main code-generation entry point - body not visible here, confirm

+		void processTextureCoordinate(Registers &r, int stage, Vector4f &normal, Vector4f &position);   // Per-stage texture coordinate processing; NOTE(review): the .cpp appears to write r.o[T0 + stage] - confirm

+		void processPointSize(Registers &r);   // NOTE(review): the .cpp appears to store the point size in r.o[Pts].y - confirm

+

+		Vector4f transformBlend(Registers &r, Register &src, Pointer<Byte> &matrix, bool homogenous);   // The .cpp calls this with r.v[Position] and the ff.cameraTransformT matrix pointer

+		Vector4f transform(Register &src, Pointer<Byte> &matrix, bool homogenous);   // Returns the transformed vector; the definition spells the flag 'homogeneous' and branches on it

+		Vector4f transform(Register &src, Pointer<Byte> &matrix, UInt index[4], bool homogenous);   // Overload taking four UInt indices; NOTE(review): presumably one matrix index per vertex lane - confirm

+		Vector4f normalize(Vector4f &src);   // The definition derives a reciprocal length from RcpSqrt_pp(dot3(src, src))

+		Float4 power(Float4 &src0, Float4 &src1);   // NOTE(review): presumably src0 raised to src1 - definition not fully visible, confirm

+	};

+};

+

+#endif   // sw_VertexPipeline_hpp

diff --git a/src/Shader/VertexProgram.cpp b/src/Shader/VertexProgram.cpp
index 41153bf..a292a8e 100644
--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -22,12 +22,13 @@
 
 namespace sw
 {
-	VertexProgram::VertexProgram(const VertexProcessor::State &state, const VertexShader *vertexShader) : VertexRoutine(state), vertexShader(vertexShader)
+	VertexProgram::VertexProgram(const VertexProcessor::State &state, const VertexShader *shader) : VertexRoutine(state, shader)
 	{
-		returns = false;
 		ifDepth = 0;
 		loopRepDepth = 0;
 		breakDepth = 0;
+		currentLabel = -1;
+		whileTest = false;
 
 		for(int i = 0; i < 2048; i++)
 		{
@@ -52,7 +53,7 @@
 
 		if(!state.preTransformed)
 		{
-			shader(r);
+			program(r);
 		}
 		else
 		{
@@ -60,296 +61,320 @@
 		}
 	}
 
-	Color4f VertexProgram::readConstant(Registers &r, const Src &src, int offset)
+	void VertexProgram::program(Registers &r)
 	{
-		Color4f c;
+	//	shader->print("VertexShader-%0.8X.txt", state.shaderID);
 
-		int i = src.index + offset;
-		bool relative = src.relative;
-
-		if(!relative)
-		{
-			c.r = c.g = c.b = c.a = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c[i]));
-
-			c.r = c.r.xxxx;
-			c.g = c.g.yyyy;
-			c.b = c.b.zzzz;
-			c.a = c.a.wwww;
-
-			if(localShaderConstants)   // Constant may be known at compile time
-			{
-				for(int j = 0; j < vertexShader->getLength(); j++)
-				{
-					const ShaderInstruction &instruction = *vertexShader->getInstruction(j);
-
-					if(instruction.getOpcode() == ShaderOperation::OPCODE_DEF)
-					{
-						if(instruction.getDestinationParameter().index == i)
-						{
-							c.r = Float4(instruction.getSourceParameter(0).value);
-							c.g = Float4(instruction.getSourceParameter(1).value);
-							c.b = Float4(instruction.getSourceParameter(2).value);
-							c.a = Float4(instruction.getSourceParameter(3).value);
-
-							break;
-						}
-					}
-				}
-			}
-		}
-		else if(src.relativeType == Src::PARAMETER_LOOP)
-		{
-			Int loopCounter = r.aL[r.loopDepth];
-
-			c.r = c.g = c.b = c.a = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c[i]) + loopCounter * 16);
-
-			c.r = c.r.xxxx;
-			c.g = c.g.yyyy;
-			c.b = c.b.zzzz;
-			c.a = c.a.wwww;
-		}
-		else
-		{
-			Int index0;
-			Int index1;
-			Int index2;
-			Int index3;
-
-			Float4 a0_;
-
-			switch(src.relativeSwizzle & 0x03)
-			{
-			case 0: a0_ = r.a0.x; break;
-			case 1: a0_ = r.a0.y; break;
-			case 2: a0_ = r.a0.z; break;
-			case 3: a0_ = r.a0.w; break;
-			}
-
-			index0 = i + RoundInt(Float(a0_.x));
-			index1 = i + RoundInt(Float(a0_.y));
-			index2 = i + RoundInt(Float(a0_.z));
-			index3 = i + RoundInt(Float(a0_.w));
-
-			// Clamp to constant register range, c[256] = {0, 0, 0, 0}
-			index0 = IfThenElse(UInt(index0) > UInt(256), Int(256), index0);
-			index1 = IfThenElse(UInt(index1) > UInt(256), Int(256), index1);
-			index2 = IfThenElse(UInt(index2) > UInt(256), Int(256), index2);
-			index3 = IfThenElse(UInt(index3) > UInt(256), Int(256), index3);
-
-			c.x = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index0 * 16, 16);
-			c.y = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index1 * 16, 16);
-			c.z = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index2 * 16, 16);
-			c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index3 * 16, 16);
-
-			transpose4x4(c.x, c.y, c.z, c.w);
-		}
-
-		return c;
-	}
-
-	void VertexProgram::shader(Registers &r)
-	{
-	//	vertexShader->print("VertexShader-%0.16llX.txt", state.shaderHash);
-
-		unsigned short version = vertexShader->getVersion();
+		unsigned short version = shader->getVersion();
 
 		r.enableIndex = 0;
 		r.stackIndex = 0;
-	
-		for(int i = 0; i < vertexShader->getLength(); i++)
+
+		// Create all call site return blocks up front
+		for(int i = 0; i < shader->getLength(); i++)
 		{
-			const ShaderInstruction *instruction = vertexShader->getInstruction(i);
-			Op::Opcode opcode = instruction->getOpcode();
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
 
-		//	#ifndef NDEBUG   // FIXME: Centralize debug output control
-		//		vertexShader->printInstruction(i, "debug.txt");
-		//	#endif
+			if(opcode == Shader::OPCODE_CALL || opcode == Shader::OPCODE_CALLNZ)
+			{
+				const Dst &dst = instruction->dst;
 
-			if(opcode == Op::OPCODE_DCL || opcode == Op::OPCODE_DEF || opcode == Op::OPCODE_DEFI || opcode == Op::OPCODE_DEFB)
+				ASSERT(callRetBlock[dst.label].size() == dst.callSite);
+				callRetBlock[dst.label].push_back(Nucleus::createBasicBlock());
+			}
+		}
+	
+		for(int i = 0; i < shader->getLength(); i++)
+		{
+			const Shader::Instruction *instruction = shader->getInstruction(i);
+			Shader::Opcode opcode = instruction->opcode;
+
+			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
 			{
 				continue;
 			}
 
-			Dst dest = instruction->getDestinationParameter();
-			Src src0 = instruction->getSourceParameter(0);
-			Src src1 = instruction->getSourceParameter(1);
-			Src src2 = instruction->getSourceParameter(2);
-			Src src3 = instruction->getSourceParameter(3);
+			Dst dst = instruction->dst;
+			Src src0 = instruction->src[0];
+			Src src1 = instruction->src[1];
+			Src src2 = instruction->src[2];
 
-			bool predicate = instruction->isPredicate();
-			int size = vertexShader->size(opcode);
-			Usage usage = instruction->getUsage();
-			unsigned char usageIndex = instruction->getUsageIndex();
-			Control control = instruction->getControl();
-			bool integer = dest.type == Dst::PARAMETER_ADDR;
-			bool pp = dest.partialPrecision;
+			bool predicate = instruction->predicate;
+			int size = shader->size(opcode);
+			Usage usage = instruction->usage;
+			unsigned char usageIndex = instruction->usageIndex;
+			Control control = instruction->control;
+			bool integer = dst.type == Shader::PARAMETER_ADDR;
+			bool pp = dst.partialPrecision;
 
-			Color4f d;
-			Color4f s0;
-			Color4f s1;
-			Color4f s2;
-			Color4f s3;
+			Vector4f d;
+			Vector4f s0;
+			Vector4f s1;
+			Vector4f s2;
 
-			if(src0.type != Src::PARAMETER_VOID) s0 = reg(r, src0);
-			if(src1.type != Src::PARAMETER_VOID) s1 = reg(r, src1);
-			if(src2.type != Src::PARAMETER_VOID) s2 = reg(r, src2);
-			if(src3.type != Src::PARAMETER_VOID) s3 = reg(r, src3);
+			if(src0.type != Shader::PARAMETER_VOID) s0 = reg(r, src0);
+			if(src1.type != Shader::PARAMETER_VOID) s1 = reg(r, src1);
+			if(src2.type != Shader::PARAMETER_VOID) s2 = reg(r, src2);
 
 			switch(opcode)
 			{
-			case Op::OPCODE_VS_1_0:										break;
-			case Op::OPCODE_VS_1_1:										break;
-			case Op::OPCODE_VS_2_0:										break;
-			case Op::OPCODE_VS_2_x:										break;
-			case Op::OPCODE_VS_2_sw:									break;
-			case Op::OPCODE_VS_3_0:										break;
-			case Op::OPCODE_VS_3_sw:									break;
-			case Op::OPCODE_DCL:										break;
-			case Op::OPCODE_DEF:										break;
-			case Op::OPCODE_DEFI:										break;
-			case Op::OPCODE_DEFB:										break;
-			case Op::OPCODE_NOP:										break;
-			case Op::OPCODE_ABS:		abs(d, s0);						break;
-			case Op::OPCODE_ADD:		add(d, s0, s1);					break;
-			case Op::OPCODE_CRS:		crs(d, s0, s1);					break;
-			case Op::OPCODE_DP3:		dp3(d, s0, s1);					break;
-			case Op::OPCODE_DP4:		dp4(d, s0, s1);					break;
-			case Op::OPCODE_DST:		dst(d, s0, s1);					break;
-			case Op::OPCODE_EXP:		exp(d, s0, pp);					break;
-			case Op::OPCODE_EXPP:		expp(d, s0, version);			break;
-			case Op::OPCODE_FRC:		frc(d, s0);						break;
-			case Op::OPCODE_LIT:		lit(d, s0);						break;
-			case Op::OPCODE_LOG:		log(d, s0, pp);					break;
-			case Op::OPCODE_LOGP:		logp(d, s0, version);			break;
-			case Op::OPCODE_LRP:		lrp(d, s0, s1, s2);				break;
-			case Op::OPCODE_M3X2:		M3X2(r, d, s0, src1);			break;
-			case Op::OPCODE_M3X3:		M3X3(r, d, s0, src1);			break;
-			case Op::OPCODE_M3X4:		M3X4(r, d, s0, src1);			break;
-			case Op::OPCODE_M4X3:		M4X3(r, d, s0, src1);			break;
-			case Op::OPCODE_M4X4:		M4X4(r, d, s0, src1);			break;
-			case Op::OPCODE_MAD:		mad(d, s0, s1, s2);				break;
-			case Op::OPCODE_MAX:		max(d, s0, s1);					break;
-			case Op::OPCODE_MIN:		min(d, s0, s1);					break;
-			case Op::OPCODE_MOV:		mov(d, s0, integer);			break;
-			case Op::OPCODE_MOVA:		mov(d, s0);						break;
-			case Op::OPCODE_MUL:		mul(d, s0, s1);					break;
-			case Op::OPCODE_NRM:		nrm(d, s0, pp);					break;
-			case Op::OPCODE_POW:		pow(d, s0, s1, pp);				break;
-			case Op::OPCODE_RCP:		rcp(d, s0, pp);					break;
-			case Op::OPCODE_RSQ:		rsq(d, s0, pp);					break;
-			case Op::OPCODE_SGE:		sge(d, s0, s1);					break;
-			case Op::OPCODE_SGN:		sgn(d, s0);						break;
-			case Op::OPCODE_SINCOS:		sincos(d, s0, pp);				break;
-			case Op::OPCODE_SLT:		slt(d, s0, s1);					break;
-			case Op::OPCODE_SUB:		sub(d, s0, s1);					break;
-			case Op::OPCODE_BREAK:		BREAK(r);						break;
-			case Op::OPCODE_BREAKC:		BREAKC(r, s0, s1, control);		break;
-			case Op::OPCODE_BREAKP:		BREAKP(r, src0);				break;
-			case Op::OPCODE_CALL:		CALL(r, dest.index);			break;
-			case Op::OPCODE_CALLNZ:		CALLNZ(r, dest.index, src0);	break;
-			case Op::OPCODE_ELSE:		ELSE(r);						break;
-			case Op::OPCODE_ENDIF:		ENDIF(r);						break;
-			case Op::OPCODE_ENDLOOP:	ENDLOOP(r);						break;
-			case Op::OPCODE_ENDREP:		ENDREP(r);						break;
-			case Op::OPCODE_IF:			IF(r, src0);					break;
-			case Op::OPCODE_IFC:		IFC(r, s0, s1, control);		break;
-			case Op::OPCODE_LABEL:		LABEL(dest.index);				break;
-			case Op::OPCODE_LOOP:		LOOP(r, src1);					break;
-			case Op::OPCODE_REP:		REP(r, src0);					break;
-			case Op::OPCODE_RET:		RET(r);							break;
-			case Op::OPCODE_SETP:		setp(d, s0, s1, control);		break;
-			case Op::OPCODE_TEXLDL:		TEXLDL(r, d, s0, src1);			break;
-			case Op::OPCODE_END:										break;
+			case Shader::OPCODE_VS_1_0:										break;
+			case Shader::OPCODE_VS_1_1:										break;
+			case Shader::OPCODE_VS_2_0:										break;
+			case Shader::OPCODE_VS_2_x:										break;
+			case Shader::OPCODE_VS_2_sw:									break;
+			case Shader::OPCODE_VS_3_0:										break;
+			case Shader::OPCODE_VS_3_sw:									break;
+			case Shader::OPCODE_DCL:										break;
+			case Shader::OPCODE_DEF:										break;
+			case Shader::OPCODE_DEFI:										break;
+			case Shader::OPCODE_DEFB:										break;
+			case Shader::OPCODE_NOP:										break;
+			case Shader::OPCODE_ABS:		abs(d, s0);						break;
+			case Shader::OPCODE_ADD:		add(d, s0, s1);					break;
+			case Shader::OPCODE_CRS:		crs(d, s0, s1);					break;
+			case Shader::OPCODE_FORWARD1:	forward1(d, s0, s1, s2);		break;
+			case Shader::OPCODE_FORWARD2:	forward2(d, s0, s1, s2);		break;
+			case Shader::OPCODE_FORWARD3:	forward3(d, s0, s1, s2);		break;
+			case Shader::OPCODE_FORWARD4:	forward4(d, s0, s1, s2);		break;
+			case Shader::OPCODE_REFLECT1:	reflect1(d, s0, s1);			break;
+			case Shader::OPCODE_REFLECT2:	reflect2(d, s0, s1);			break;
+			case Shader::OPCODE_REFLECT3:	reflect3(d, s0, s1);			break;
+			case Shader::OPCODE_REFLECT4:	reflect4(d, s0, s1);			break;
+			case Shader::OPCODE_REFRACT1:	refract1(d, s0, s1, s2.x);		break;
+			case Shader::OPCODE_REFRACT2:	refract2(d, s0, s1, s2.x);		break;
+			case Shader::OPCODE_REFRACT3:	refract3(d, s0, s1, s2.x);		break;
+			case Shader::OPCODE_REFRACT4:	refract4(d, s0, s1, s2.x);		break;
+			case Shader::OPCODE_DP1:		dp1(d, s0, s1);					break;
+			case Shader::OPCODE_DP2:		dp2(d, s0, s1);					break;
+			case Shader::OPCODE_DP3:		dp3(d, s0, s1);					break;
+			case Shader::OPCODE_DP4:		dp4(d, s0, s1);					break;
+			case Shader::OPCODE_ATT:		att(d, s0, s1);					break;
+			case Shader::OPCODE_EXP2X:		exp2x(d, s0, pp);				break;
+			case Shader::OPCODE_EXP2:		exp2(d, s0, pp);				break;
+			case Shader::OPCODE_EXPP:		expp(d, s0, version);			break;
+			case Shader::OPCODE_EXP:		exp(d, s0, pp);					break;
+			case Shader::OPCODE_FRC:		frc(d, s0);						break;
+			case Shader::OPCODE_TRUNC:      trunc(d, s0);                   break;
+			case Shader::OPCODE_FLOOR:      floor(d, s0);                   break;
+			case Shader::OPCODE_CEIL:       ceil(d, s0);                    break;
+			case Shader::OPCODE_LIT:		lit(d, s0);						break;
+			case Shader::OPCODE_LOG2X:		log2x(d, s0, pp);				break;
+			case Shader::OPCODE_LOG2:		log2(d, s0, pp);				break;
+			case Shader::OPCODE_LOGP:		logp(d, s0, version);			break;
+			case Shader::OPCODE_LOG:		log(d, s0, pp);					break;
+			case Shader::OPCODE_LRP:		lrp(d, s0, s1, s2);				break;
+			case Shader::OPCODE_STEP:		step(d, s0, s1);				break;
+			case Shader::OPCODE_SMOOTH:		smooth(d, s0, s1, s2);			break;
+			case Shader::OPCODE_M3X2:		M3X2(r, d, s0, src1);			break;
+			case Shader::OPCODE_M3X3:		M3X3(r, d, s0, src1);			break;
+			case Shader::OPCODE_M3X4:		M3X4(r, d, s0, src1);			break;
+			case Shader::OPCODE_M4X3:		M4X3(r, d, s0, src1);			break;
+			case Shader::OPCODE_M4X4:		M4X4(r, d, s0, src1);			break;
+			case Shader::OPCODE_MAD:		mad(d, s0, s1, s2);				break;
+			case Shader::OPCODE_MAX:		max(d, s0, s1);					break;
+			case Shader::OPCODE_MIN:		min(d, s0, s1);					break;
+			case Shader::OPCODE_MOV:		mov(d, s0, integer);			break;
+			case Shader::OPCODE_MOVA:		mov(d, s0);						break;
+			case Shader::OPCODE_F2B:		f2b(d, s0);						break;
+			case Shader::OPCODE_B2F:		b2f(d, s0);						break;
+			case Shader::OPCODE_MUL:		mul(d, s0, s1);					break;
+			case Shader::OPCODE_NRM2:		nrm2(d, s0, pp);				break;
+			case Shader::OPCODE_NRM3:		nrm3(d, s0, pp);				break;
+			case Shader::OPCODE_NRM4:		nrm4(d, s0, pp);				break;
+			case Shader::OPCODE_POWX:		powx(d, s0, s1, pp);			break;
+			case Shader::OPCODE_POW:		pow(d, s0, s1, pp);				break;
+			case Shader::OPCODE_RCPX:		rcpx(d, s0, pp);				break;
+			case Shader::OPCODE_DIV:		div(d, s0, s1);					break;
+			case Shader::OPCODE_MOD:		mod(d, s0, s1);					break;
+			case Shader::OPCODE_RSQX:		rsqx(d, s0, pp);				break;
+			case Shader::OPCODE_SQRT:		sqrt(d, s0, pp);				break;
+			case Shader::OPCODE_RSQ:		rsq(d, s0, pp);					break;
+			case Shader::OPCODE_LEN2:		len2(d.x, s0, pp);				break;
+			case Shader::OPCODE_LEN3:		len3(d.x, s0, pp);				break;
+			case Shader::OPCODE_LEN4:		len4(d.x, s0, pp);				break;
+			case Shader::OPCODE_DIST1:		dist1(d.x, s0, s1, pp);			break;
+			case Shader::OPCODE_DIST2:		dist2(d.x, s0, s1, pp);			break;
+			case Shader::OPCODE_DIST3:		dist3(d.x, s0, s1, pp);			break;
+			case Shader::OPCODE_DIST4:		dist4(d.x, s0, s1, pp);			break;
+			case Shader::OPCODE_SGE:		step(d, s1, s0);				break;
+			case Shader::OPCODE_SGN:		sgn(d, s0);						break;
+			case Shader::OPCODE_SINCOS:		sincos(d, s0, pp);				break;
+			case Shader::OPCODE_COS:		cos(d, s0, pp);					break;
+			case Shader::OPCODE_SIN:		sin(d, s0, pp);					break;
+			case Shader::OPCODE_TAN:		tan(d, s0);						break;
+			case Shader::OPCODE_ACOS:		acos(d, s0);					break;
+			case Shader::OPCODE_ASIN:		asin(d, s0);					break;
+			case Shader::OPCODE_ATAN:		atan(d, s0);					break;
+			case Shader::OPCODE_ATAN2:		atan2(d, s0, s1);				break;
+			case Shader::OPCODE_SLT:		slt(d, s0, s1);					break;
+			case Shader::OPCODE_SUB:		sub(d, s0, s1);					break;
+			case Shader::OPCODE_BREAK:		BREAK(r);						break;
+			case Shader::OPCODE_BREAKC:		BREAKC(r, s0, s1, control);		break;
+			case Shader::OPCODE_BREAKP:		BREAKP(r, src0);				break;
+			case Shader::OPCODE_CONTINUE:	CONTINUE(r);					break;
+			case Shader::OPCODE_TEST:		TEST();							break;
+			case Shader::OPCODE_CALL:		CALL(r, dst.label, dst.callSite);         break;
+			case Shader::OPCODE_CALLNZ:		CALLNZ(r, dst.label, dst.callSite, src0); break;
+			case Shader::OPCODE_ELSE:		ELSE(r);						break;
+			case Shader::OPCODE_ENDIF:		ENDIF(r);						break;
+			case Shader::OPCODE_ENDLOOP:	ENDLOOP(r);						break;
+			case Shader::OPCODE_ENDREP:		ENDREP(r);						break;
+			case Shader::OPCODE_ENDWHILE:	ENDWHILE(r);					break;
+			case Shader::OPCODE_IF:			IF(r, src0);					break;
+			case Shader::OPCODE_IFC:		IFC(r, s0, s1, control);		break;
+			case Shader::OPCODE_LABEL:		LABEL(dst.index);				break;
+			case Shader::OPCODE_LOOP:		LOOP(r, src1);					break;
+			case Shader::OPCODE_REP:		REP(r, src0);					break;
+			case Shader::OPCODE_WHILE:		WHILE(r, src0);					break;
+			case Shader::OPCODE_RET:		RET(r);							break;
+			case Shader::OPCODE_LEAVE:		LEAVE(r);						break;
+			case Shader::OPCODE_CMP:		cmp(d, s0, s1, control);		break;
+			case Shader::OPCODE_ICMP:		icmp(d, s0, s1, control);		break;
+			case Shader::OPCODE_SELECT:		select(d, s0, s1, s2);			break;
+			case Shader::OPCODE_EXTRACT:	extract(d.x, s0, s1.x);			break;
+			case Shader::OPCODE_INSERT:		insert(d, s0, s1.x, s2.x);		break;
+			case Shader::OPCODE_ALL:		all(d.x, s0);					break;
+			case Shader::OPCODE_ANY:		any(d.x, s0);					break;
+			case Shader::OPCODE_NOT:		not(d, s0);						break;
+			case Shader::OPCODE_OR:			or(d.x, s0.x, s1.x);			break;
+			case Shader::OPCODE_XOR:		xor(d.x, s0.x, s1.x);			break;
+			case Shader::OPCODE_AND:		and(d.x, s0.x, s1.x);			break;
+			case Shader::OPCODE_TEXLDL:		TEXLDL(r, d, s0, src1);			break;
+			case Shader::OPCODE_TEX:		TEX(r, d, s0, src1);			break;
+			case Shader::OPCODE_END:										break;
 			default:
 				ASSERT(false);
 			}
 
-			if(dest.type != Dst::PARAMETER_VOID && dest.type != Dst::PARAMETER_LABEL)
+			if(dst.type != Shader::PARAMETER_VOID && dst.type != Shader::PARAMETER_LABEL && opcode != Shader::OPCODE_NOP)
 			{
-				if(dest.saturate)
+				if(dst.integer)
 				{
-					if(dest.x) d.r = Max(d.r, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					if(dest.y) d.g = Max(d.g, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					if(dest.z) d.b = Max(d.b, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					if(dest.w) d.a = Max(d.a, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-
-					if(dest.x) d.r = Min(d.r, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-					if(dest.y) d.g = Min(d.g, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-					if(dest.z) d.b = Min(d.b, Float4(1.0f, 1.0f, 1.0f, 1.0f));
-					if(dest.w) d.a = Min(d.a, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+					switch(opcode)
+					{
+					case Shader::OPCODE_DIV:
+						if(dst.x) d.x = Trunc(d.x);
+						if(dst.y) d.y = Trunc(d.y);
+						if(dst.z) d.z = Trunc(d.z);
+						if(dst.w) d.w = Trunc(d.w);
+						break;
+					default:
+						break;   // No truncation to integer required when arguments are integer
+					}
 				}
 
-				if(vertexShader->containsDynamicBranching())
+				if(dst.saturate)
 				{
-					Color4f pDst;   // FIXME: Rename
+					if(dst.x) d.x = Max(d.x, Float4(0.0f));
+					if(dst.y) d.y = Max(d.y, Float4(0.0f));
+					if(dst.z) d.z = Max(d.z, Float4(0.0f));
+					if(dst.w) d.w = Max(d.w, Float4(0.0f));
 
-					switch(dest.type)
+					if(dst.x) d.x = Min(d.x, Float4(1.0f));
+					if(dst.y) d.y = Min(d.y, Float4(1.0f));
+					if(dst.z) d.z = Min(d.z, Float4(1.0f));
+					if(dst.w) d.w = Min(d.w, Float4(1.0f));
+				}
+
+				if(shader->containsDynamicBranching())
+				{
+					Vector4f pDst;   // FIXME: Rename
+
+					switch(dst.type)
 					{
-					case Dst::PARAMETER_VOID:																		break;
-					case Dst::PARAMETER_TEMP:		pDst = r.r[dest.index];											break;
-					case Dst::PARAMETER_ADDR:		pDst = r.a0;													break;
-					case Dst::PARAMETER_RASTOUT:
-						switch(dest.index)
+					case Shader::PARAMETER_VOID:																		break;
+					case Shader::PARAMETER_TEMP:
+						if(dst.rel.type == Shader::PARAMETER_VOID)
+						{
+							if(dst.x) pDst.x = r.r[dst.index].x;
+							if(dst.y) pDst.y = r.r[dst.index].y;
+							if(dst.z) pDst.z = r.r[dst.index].z;
+							if(dst.w) pDst.w = r.r[dst.index].w;
+						}
+						else
+						{
+							Int a = relativeAddress(r, dst);
+
+							if(dst.x) pDst.x = r.r[dst.index + a].x;
+							if(dst.y) pDst.y = r.r[dst.index + a].y;
+							if(dst.z) pDst.z = r.r[dst.index + a].z;
+							if(dst.w) pDst.w = r.r[dst.index + a].w;
+						}
+						break;
+					case Shader::PARAMETER_ADDR:		pDst = r.a0;													break;
+					case Shader::PARAMETER_RASTOUT:
+						switch(dst.index)
 						{
 						case 0:
-							if(dest.x) pDst.x = r.ox[Pos];
-							if(dest.y) pDst.y = r.oy[Pos];
-							if(dest.z) pDst.z = r.oz[Pos];
-							if(dest.w) pDst.w = r.ow[Pos];
+							if(dst.x) pDst.x = r.o[Pos].x;
+							if(dst.y) pDst.y = r.o[Pos].y;
+							if(dst.z) pDst.z = r.o[Pos].z;
+							if(dst.w) pDst.w = r.o[Pos].w;
 							break;
 						case 1:
-							pDst.x = r.ox[Fog];
+							pDst.x = r.o[Fog].x;
 							break;
 						case 2:
-							pDst.x = r.oy[Pts];
+							pDst.x = r.o[Pts].y;
 							break;
 						default:
 							ASSERT(false);
 						}
 						break;
-					case Dst::PARAMETER_ATTROUT:
-						if(dest.x) pDst.x = r.ox[D0 + dest.index];
-						if(dest.y) pDst.y = r.oy[D0 + dest.index];
-						if(dest.z) pDst.z = r.oz[D0 + dest.index];
-						if(dest.w) pDst.w = r.ow[D0 + dest.index];
+					case Shader::PARAMETER_ATTROUT:
+						if(dst.x) pDst.x = r.o[D0 + dst.index].x;
+						if(dst.y) pDst.y = r.o[D0 + dst.index].y;
+						if(dst.z) pDst.z = r.o[D0 + dst.index].z;
+						if(dst.w) pDst.w = r.o[D0 + dst.index].w;
 						break;
-					case Dst::PARAMETER_TEXCRDOUT:
-				//	case Dst::PARAMETER_OUTPUT:
+					case Shader::PARAMETER_TEXCRDOUT:
+				//	case Shader::PARAMETER_OUTPUT:
 						if(version < 0x0300)
 						{
-							if(dest.x) pDst.x = r.ox[T0 + dest.index];
-							if(dest.y) pDst.y = r.oy[T0 + dest.index];
-							if(dest.z) pDst.z = r.oz[T0 + dest.index];
-							if(dest.w) pDst.w = r.ow[T0 + dest.index];
+							if(dst.x) pDst.x = r.o[T0 + dst.index].x;
+							if(dst.y) pDst.y = r.o[T0 + dst.index].y;
+							if(dst.z) pDst.z = r.o[T0 + dst.index].z;
+							if(dst.w) pDst.w = r.o[T0 + dst.index].w;
 						}
 						else
 						{
-							if(!dest.relative)
+							if(dst.rel.type == Shader::PARAMETER_VOID)   // Not relative
 							{
-								if(dest.x) pDst.x = r.ox[dest.index];
-								if(dest.y) pDst.y = r.oy[dest.index];
-								if(dest.z) pDst.z = r.oz[dest.index];
-								if(dest.w) pDst.w = r.ow[dest.index];
+								if(dst.x) pDst.x = r.o[dst.index].x;
+								if(dst.y) pDst.y = r.o[dst.index].y;
+								if(dst.z) pDst.z = r.o[dst.index].z;
+								if(dst.w) pDst.w = r.o[dst.index].w;
 							}
-							else
+							else if(dst.rel.type == Shader::PARAMETER_LOOP)
 							{
 								Int aL = r.aL[r.loopDepth];
 
-								if(dest.x) pDst.x = r.ox[dest.index + aL];
-								if(dest.y) pDst.y = r.oy[dest.index + aL];
-								if(dest.z) pDst.z = r.oz[dest.index + aL];
-								if(dest.w) pDst.w = r.ow[dest.index + aL];
+								if(dst.x) pDst.x = r.o[dst.index + aL].x;
+								if(dst.y) pDst.y = r.o[dst.index + aL].y;
+								if(dst.z) pDst.z = r.o[dst.index + aL].z;
+								if(dst.w) pDst.w = r.o[dst.index + aL].w;
+							}
+							else
+							{
+								Int a = relativeAddress(r, dst);
+
+								if(dst.x) pDst.x = r.o[dst.index + a].x;
+								if(dst.y) pDst.y = r.o[dst.index + a].y;
+								if(dst.z) pDst.z = r.o[dst.index + a].z;
+								if(dst.w) pDst.w = r.o[dst.index + a].w;
 							}
 						}
 						break;
-					case Dst::PARAMETER_LABEL:																		break;
-					case Dst::PARAMETER_PREDICATE:	pDst = r.p0;													break;
-					case Dst::PARAMETER_INPUT:																		break;
+					case Shader::PARAMETER_LABEL:																		break;
+					case Shader::PARAMETER_PREDICATE:	pDst = r.p0;													break;
+					case Shader::PARAMETER_INPUT:																		break;
 					default:
 						ASSERT(false);
 					}
 
-					Int4 enable = r.enableStack[r.enableIndex] & r.enableBreak;
+					Int4 enable = enableMask(r, instruction);
 
 					Int4 xEnable = enable;
 					Int4 yEnable = enable;
@@ -358,119 +383,140 @@
 
 					if(predicate)
 					{
-						unsigned char pSwizzle = instruction->getPredicateSwizzle();
+						unsigned char pSwizzle = instruction->predicateSwizzle;
 
 						Float4 xPredicate = r.p0[(pSwizzle >> 0) & 0x03];
 						Float4 yPredicate = r.p0[(pSwizzle >> 2) & 0x03];
 						Float4 zPredicate = r.p0[(pSwizzle >> 4) & 0x03];
 						Float4 wPredicate = r.p0[(pSwizzle >> 6) & 0x03];
 
-						if(!instruction->isPredicateNot())
+						if(!instruction->predicateNot)
 						{
-							if(dest.x) xEnable = xEnable & As<Int4>(xPredicate);
-							if(dest.y) yEnable = yEnable & As<Int4>(yPredicate);
-							if(dest.z) zEnable = zEnable & As<Int4>(zPredicate);
-							if(dest.w) wEnable = wEnable & As<Int4>(wPredicate);
+							if(dst.x) xEnable = xEnable & As<Int4>(xPredicate);
+							if(dst.y) yEnable = yEnable & As<Int4>(yPredicate);
+							if(dst.z) zEnable = zEnable & As<Int4>(zPredicate);
+							if(dst.w) wEnable = wEnable & As<Int4>(wPredicate);
 						}
 						else
 						{
-							if(dest.x) xEnable = xEnable & ~As<Int4>(xPredicate);
-							if(dest.y) yEnable = yEnable & ~As<Int4>(yPredicate);
-							if(dest.z) zEnable = zEnable & ~As<Int4>(zPredicate);
-							if(dest.w) wEnable = wEnable & ~As<Int4>(wPredicate);
+							if(dst.x) xEnable = xEnable & ~As<Int4>(xPredicate);
+							if(dst.y) yEnable = yEnable & ~As<Int4>(yPredicate);
+							if(dst.z) zEnable = zEnable & ~As<Int4>(zPredicate);
+							if(dst.w) wEnable = wEnable & ~As<Int4>(wPredicate);
 						}
 					}
 
-					if(dest.x) d.x = As<Float4>(As<Int4>(d.x) & xEnable);
-					if(dest.y) d.y = As<Float4>(As<Int4>(d.y) & yEnable);
-					if(dest.z) d.z = As<Float4>(As<Int4>(d.z) & zEnable);
-					if(dest.w) d.w = As<Float4>(As<Int4>(d.w) & wEnable);
+					if(dst.x) d.x = As<Float4>(As<Int4>(d.x) & xEnable);
+					if(dst.y) d.y = As<Float4>(As<Int4>(d.y) & yEnable);
+					if(dst.z) d.z = As<Float4>(As<Int4>(d.z) & zEnable);
+					if(dst.w) d.w = As<Float4>(As<Int4>(d.w) & wEnable);
 
-					if(dest.x) d.x = As<Float4>(As<Int4>(d.x) | (As<Int4>(pDst.x) & ~xEnable));
-					if(dest.y) d.y = As<Float4>(As<Int4>(d.y) | (As<Int4>(pDst.y) & ~yEnable));
-					if(dest.z) d.z = As<Float4>(As<Int4>(d.z) | (As<Int4>(pDst.z) & ~zEnable));
-					if(dest.w) d.w = As<Float4>(As<Int4>(d.w) | (As<Int4>(pDst.w) & ~wEnable));
+					if(dst.x) d.x = As<Float4>(As<Int4>(d.x) | (As<Int4>(pDst.x) & ~xEnable));
+					if(dst.y) d.y = As<Float4>(As<Int4>(d.y) | (As<Int4>(pDst.y) & ~yEnable));
+					if(dst.z) d.z = As<Float4>(As<Int4>(d.z) | (As<Int4>(pDst.z) & ~zEnable));
+					if(dst.w) d.w = As<Float4>(As<Int4>(d.w) | (As<Int4>(pDst.w) & ~wEnable));
 				}
 
-				switch(dest.type)
+				switch(dst.type)
 				{
-				case Dst::PARAMETER_VOID:
+				case Shader::PARAMETER_VOID:
 					break;
-				case Dst::PARAMETER_TEMP:
-					if(dest.x) r.r[dest.index].x = d.x;
-					if(dest.y) r.r[dest.index].y = d.y;
-					if(dest.z) r.r[dest.index].z = d.z;
-					if(dest.w) r.r[dest.index].w = d.w;
+				case Shader::PARAMETER_TEMP:
+					if(dst.rel.type == Shader::PARAMETER_VOID)
+					{
+						if(dst.x) r.r[dst.index].x = d.x;
+						if(dst.y) r.r[dst.index].y = d.y;
+						if(dst.z) r.r[dst.index].z = d.z;
+						if(dst.w) r.r[dst.index].w = d.w;
+					}
+					else
+					{
+						Int a = relativeAddress(r, dst);
+
+						if(dst.x) r.r[dst.index + a].x = d.x;
+						if(dst.y) r.r[dst.index + a].y = d.y;
+						if(dst.z) r.r[dst.index + a].z = d.z;
+						if(dst.w) r.r[dst.index + a].w = d.w;
+					}
 					break;
-				case Dst::PARAMETER_ADDR:
-					if(dest.x) r.a0.x = d.x;
-					if(dest.y) r.a0.y = d.y;
-					if(dest.z) r.a0.z = d.z;
-					if(dest.w) r.a0.w = d.w;
+				case Shader::PARAMETER_ADDR:
+					if(dst.x) r.a0.x = d.x;
+					if(dst.y) r.a0.y = d.y;
+					if(dst.z) r.a0.z = d.z;
+					if(dst.w) r.a0.w = d.w;
 					break;
-				case Dst::PARAMETER_RASTOUT:
-					switch(dest.index)
+				case Shader::PARAMETER_RASTOUT:
+					switch(dst.index)
 					{
 					case 0:
-						if(dest.x) r.ox[Pos] = d.x;
-						if(dest.y) r.oy[Pos] = d.y;
-						if(dest.z) r.oz[Pos] = d.z;
-						if(dest.w) r.ow[Pos] = d.w;
+						if(dst.x) r.o[Pos].x = d.x;
+						if(dst.y) r.o[Pos].y = d.y;
+						if(dst.z) r.o[Pos].z = d.z;
+						if(dst.w) r.o[Pos].w = d.w;
 						break;
 					case 1:
-						r.ox[Fog] = d.x;
+						r.o[Fog].x = d.x;
 						break;
 					case 2:		
-						r.oy[Pts] = d.x;
+						r.o[Pts].y = d.x;
 						break;
 					default:	ASSERT(false);
 					}
 					break;
-				case Dst::PARAMETER_ATTROUT:	
-					if(dest.x) r.ox[D0 + dest.index] = d.x;
-					if(dest.y) r.oy[D0 + dest.index] = d.y;
-					if(dest.z) r.oz[D0 + dest.index] = d.z;
-					if(dest.w) r.ow[D0 + dest.index] = d.w;
+				case Shader::PARAMETER_ATTROUT:	
+					if(dst.x) r.o[D0 + dst.index].x = d.x;
+					if(dst.y) r.o[D0 + dst.index].y = d.y;
+					if(dst.z) r.o[D0 + dst.index].z = d.z;
+					if(dst.w) r.o[D0 + dst.index].w = d.w;
 					break;
-				case Dst::PARAMETER_TEXCRDOUT:
-			//	case Dst::PARAMETER_OUTPUT:
+				case Shader::PARAMETER_TEXCRDOUT:
+			//	case Shader::PARAMETER_OUTPUT:
 					if(version < 0x0300)
 					{
-						if(dest.x) r.ox[T0 + dest.index] = d.x;
-						if(dest.y) r.oy[T0 + dest.index] = d.y;
-						if(dest.z) r.oz[T0 + dest.index] = d.z;
-						if(dest.w) r.ow[T0 + dest.index] = d.w;
+						if(dst.x) r.o[T0 + dst.index].x = d.x;
+						if(dst.y) r.o[T0 + dst.index].y = d.y;
+						if(dst.z) r.o[T0 + dst.index].z = d.z;
+						if(dst.w) r.o[T0 + dst.index].w = d.w;
 					}
 					else
 					{
-						if(!dest.relative)
+						if(dst.rel.type == Shader::PARAMETER_VOID)   // Not relative
 						{
-							if(dest.x) r.ox[dest.index] = d.x;
-							if(dest.y) r.oy[dest.index] = d.y;
-							if(dest.z) r.oz[dest.index] = d.z;
-							if(dest.w) r.ow[dest.index] = d.w;
+							if(dst.x) r.o[dst.index].x = d.x;
+							if(dst.y) r.o[dst.index].y = d.y;
+							if(dst.z) r.o[dst.index].z = d.z;
+							if(dst.w) r.o[dst.index].w = d.w;
 						}
-						else
+						else if(dst.rel.type == Shader::PARAMETER_LOOP)
 						{
 							Int aL = r.aL[r.loopDepth];
 
-							if(dest.x) r.ox[dest.index + aL] = d.x;
-							if(dest.y) r.oy[dest.index + aL] = d.y;
-							if(dest.z) r.oz[dest.index + aL] = d.z;
-							if(dest.w) r.ow[dest.index + aL] = d.w;
+							if(dst.x) r.o[dst.index + aL].x = d.x;
+							if(dst.y) r.o[dst.index + aL].y = d.y;
+							if(dst.z) r.o[dst.index + aL].z = d.z;
+							if(dst.w) r.o[dst.index + aL].w = d.w;
+						}
+						else
+						{
+							Int a = relativeAddress(r, dst);
+
+							if(dst.x) r.o[dst.index + a].x = d.x;
+							if(dst.y) r.o[dst.index + a].y = d.y;
+							if(dst.z) r.o[dst.index + a].z = d.z;
+							if(dst.w) r.o[dst.index + a].w = d.w;
 						}
 					}
 					break;
-				case Dst::PARAMETER_LABEL:																		break;
-				case Dst::PARAMETER_PREDICATE:	r.p0 = d;														break;
-				case Dst::PARAMETER_INPUT:																		break;
+				case Shader::PARAMETER_LABEL:																		break;
+				case Shader::PARAMETER_PREDICATE:	r.p0 = d;														break;
+				case Shader::PARAMETER_INPUT:																		break;
 				default:
 					ASSERT(false);
 				}
 			}
 		}
 
-		if(returns)
+		if(currentLabel != -1)
 		{
 			Nucleus::setInsertBlock(returnBlock);
 		}
@@ -478,40 +524,40 @@
 
 	void VertexProgram::passThrough(Registers &r)
 	{
-		if(vertexShader)
+		if(shader)
 		{
 			for(int i = 0; i < 12; i++)
 			{
-				unsigned char usage = vertexShader->output[i][0].usage;
-				unsigned char index = vertexShader->output[i][0].index;
+				unsigned char usage = shader->output[i][0].usage;
+				unsigned char index = shader->output[i][0].index;
 
 				switch(usage)
 				{
 				case 0xFF:
 					continue;
-				case ShaderOperation::USAGE_PSIZE:
-					r.oy[i] = r.v[i].x;
+				case Shader::USAGE_PSIZE:
+					r.o[i].y = r.v[i].x;
 					break;
-				case ShaderOperation::USAGE_TEXCOORD:
-					r.ox[i] = r.v[i].x;
-					r.oy[i] = r.v[i].y;
-					r.oz[i] = r.v[i].z;
-					r.ow[i] = r.v[i].w;
+				case Shader::USAGE_TEXCOORD:
+					r.o[i].x = r.v[i].x;
+					r.o[i].y = r.v[i].y;
+					r.o[i].z = r.v[i].z;
+					r.o[i].w = r.v[i].w;
 					break;
-				case ShaderOperation::USAGE_POSITION:
-					r.ox[i] = r.v[i].x;
-					r.oy[i] = r.v[i].y;
-					r.oz[i] = r.v[i].z;
-					r.ow[i] = r.v[i].w;
+				case Shader::USAGE_POSITION:
+					r.o[i].x = r.v[i].x;
+					r.o[i].y = r.v[i].y;
+					r.o[i].z = r.v[i].z;
+					r.o[i].w = r.v[i].w;
 					break;
-				case ShaderOperation::USAGE_COLOR:
-					r.ox[i] = r.v[i].x;
-					r.oy[i] = r.v[i].y;
-					r.oz[i] = r.v[i].z;
-					r.ow[i] = r.v[i].w;
+				case Shader::USAGE_COLOR:
+					r.o[i].x = r.v[i].x;
+					r.o[i].y = r.v[i].y;
+					r.o[i].z = r.v[i].z;
+					r.o[i].w = r.v[i].w;
 					break;
-				case ShaderOperation::USAGE_FOG:
-					r.ox[i] = r.v[i].x;
+				case Shader::USAGE_FOG:
+					r.o[i].x = r.v[i].x;
 					break;
 				default:
 					ASSERT(false);
@@ -520,60 +566,99 @@
 		}
 		else
 		{
-			r.ox[Pos] = r.v[PositionT].x;
-			r.oy[Pos] = r.v[PositionT].y;
-			r.oz[Pos] = r.v[PositionT].z;
-			r.ow[Pos] = r.v[PositionT].w;
+			r.o[Pos].x = r.v[PositionT].x;
+			r.o[Pos].y = r.v[PositionT].y;
+			r.o[Pos].z = r.v[PositionT].z;
+			r.o[Pos].w = r.v[PositionT].w;
 
 			for(int i = 0; i < 2; i++)
 			{
-				r.ox[D0 + i] = r.v[Color0 + i].x;
-				r.oy[D0 + i] = r.v[Color0 + i].y;
-				r.oz[D0 + i] = r.v[Color0 + i].z;
-				r.ow[D0 + i] = r.v[Color0 + i].w;
+				r.o[D0 + i].x = r.v[Color0 + i].x;
+				r.o[D0 + i].y = r.v[Color0 + i].y;
+				r.o[D0 + i].z = r.v[Color0 + i].z;
+				r.o[D0 + i].w = r.v[Color0 + i].w;
 			}
 
 			for(int i = 0; i < 8; i++)
 			{
-				r.ox[T0 + i] = r.v[TexCoord0 + i].x;
-				r.oy[T0 + i] = r.v[TexCoord0 + i].y;
-				r.oz[T0 + i] = r.v[TexCoord0 + i].z;
-				r.ow[T0 + i] = r.v[TexCoord0 + i].w;
+				r.o[T0 + i].x = r.v[TexCoord0 + i].x;
+				r.o[T0 + i].y = r.v[TexCoord0 + i].y;
+				r.o[T0 + i].z = r.v[TexCoord0 + i].z;
+				r.o[T0 + i].w = r.v[TexCoord0 + i].w;
 			}
 
-			r.oy[Pts] = r.v[PSize].x;
+			r.o[Pts].y = r.v[PSize].x;
 		}
 	}
 
-	Color4f VertexProgram::reg(Registers &r, const Src &src, int offset)
+	Vector4f VertexProgram::reg(Registers &r, const Src &src, int offset)
 	{
 		int i = src.index + offset;
 
-		Color4f reg;
+		Vector4f reg;
 
-		if(src.type == Src::PARAMETER_CONST)
-		{
-			reg = readConstant(r, src, offset);
-		}
-		
 		switch(src.type)
 		{
-		case Src::PARAMETER_TEMP:			reg = r.r[i];	break;
-		case Src::PARAMETER_CONST:							break;
-		case Src::PARAMETER_INPUT:			reg = r.v[i];	break;
-		case Src::PARAMETER_VOID:			return r.r[0];   // Dummy
-		case Src::PARAMETER_FLOATLITERAL:	return r.r[0];   // Dummy
-		case Src::PARAMETER_ADDR:			reg = r.a0;		break;
-		case Src::PARAMETER_CONSTBOOL:		return r.r[0];   // Dummy
-		case Src::PARAMETER_CONSTINT:		return r.r[0];   // Dummy
-		case Src::PARAMETER_LOOP:			return r.r[0];   // Dummy
-		case Src::PARAMETER_PREDICATE:		return r.r[0];   // Dummy
-		case Src::PARAMETER_SAMPLER:		return r.r[0];   // Dummy
+		case Shader::PARAMETER_TEMP:
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg = r.r[i];
+			}
+			else
+			{
+				reg = r.r[i + relativeAddress(r, src)];
+			}
+			break;
+		case Shader::PARAMETER_CONST:
+			reg = readConstant(r, src, offset);
+			break;
+		case Shader::PARAMETER_INPUT:
+            if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg = r.v[i];
+			}
+			else
+			{
+				reg = r.v[i + relativeAddress(r, src)];
+			}
+            break;
+		case Shader::PARAMETER_VOID:			return r.r[0];   // Dummy
+		case Shader::PARAMETER_FLOAT4LITERAL:
+			reg.x = Float4(src.value[0]);
+			reg.y = Float4(src.value[1]);
+			reg.z = Float4(src.value[2]);
+			reg.w = Float4(src.value[3]);
+			break;
+		case Shader::PARAMETER_ADDR:			reg = r.a0;		break;
+		case Shader::PARAMETER_CONSTBOOL:		return r.r[0];   // Dummy
+		case Shader::PARAMETER_CONSTINT:		return r.r[0];   // Dummy
+		case Shader::PARAMETER_LOOP:			return r.r[0];   // Dummy
+		case Shader::PARAMETER_PREDICATE:		return r.r[0];   // Dummy
+		case Shader::PARAMETER_SAMPLER:
+			if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg.x = As<Float4>(Int4(i));
+			}
+			else if(src.rel.type == Shader::PARAMETER_TEMP)
+			{
+				reg.x = As<Float4>(Int4(i) + RoundInt(r.r[src.rel.index].x));
+			}
+			return reg;
+		case Shader::PARAMETER_OUTPUT:
+            if(src.rel.type == Shader::PARAMETER_VOID)
+			{
+				reg = r.o[i];
+			}
+			else
+			{
+				reg = r.o[i + relativeAddress(r, src)];
+			}
+			break;
 		default:
 			ASSERT(false);
 		}
 
-		Color4f mod;
+		Vector4f mod;
 
 		mod.x = reg[(src.swizzle >> 0) & 0x03];
 		mod.y = reg[(src.swizzle >> 2) & 0x03];
@@ -582,54 +667,54 @@
 
 		switch(src.modifier)
 		{
-		case Src::MODIFIER_NONE:
+		case Shader::MODIFIER_NONE:
 			break;
-		case Src::MODIFIER_NEGATE:
+		case Shader::MODIFIER_NEGATE:
 			mod.x = -mod.x;
 			mod.y = -mod.y;
 			mod.z = -mod.z;
 			mod.w = -mod.w;
 			break;
-		case Src::MODIFIER_BIAS:
+		case Shader::MODIFIER_BIAS:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_BIAS_NEGATE:
+		case Shader::MODIFIER_BIAS_NEGATE:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_SIGN:
+		case Shader::MODIFIER_SIGN:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_SIGN_NEGATE:
+		case Shader::MODIFIER_SIGN_NEGATE:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_COMPLEMENT:
+		case Shader::MODIFIER_COMPLEMENT:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_X2:
+		case Shader::MODIFIER_X2:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_X2_NEGATE:
+		case Shader::MODIFIER_X2_NEGATE:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_DZ:
+		case Shader::MODIFIER_DZ:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_DW:
+		case Shader::MODIFIER_DW:
 			ASSERT(false);   // NOTE: Unimplemented
 			break;
-		case Src::MODIFIER_ABS:
+		case Shader::MODIFIER_ABS:
 			mod.x = Abs(mod.x);
 			mod.y = Abs(mod.y);
 			mod.z = Abs(mod.z);
 			mod.w = Abs(mod.w);
 			break;
-		case Src::MODIFIER_ABS_NEGATE:
+		case Shader::MODIFIER_ABS_NEGATE:
 			mod.x = -Abs(mod.x);
 			mod.y = -Abs(mod.y);
 			mod.z = -Abs(mod.z);
 			mod.w = -Abs(mod.w);
 			break;
-		case Src::MODIFIER_NOT:
+		case Shader::MODIFIER_NOT:
 			UNIMPLEMENTED();
 			break;
 		default:
@@ -639,32 +724,177 @@
 		return mod;
 	}
 
-	void VertexProgram::M3X2(Registers &r, Color4f &dst, Color4f &src0, Src &src1)
+	Vector4f VertexProgram::readConstant(Registers &r, const Src &src, int offset)   // Reads float constant vs.c[src.index + offset], applying src.rel relative addressing when present
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
+		Vector4f c;   // Result: each of x/y/z/w holds one constant component, one value per lane
+
+		int i = src.index + offset;   // Base constant register, known while generating code
+
+		if(src.rel.type == Shader::PARAMETER_VOID)   // Not relative
+		{
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c[i]));   // Load the whole constant into all four components...
+
+			c.x = c.x.xxxx;   // ...then replicate one component across every lane
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+
+			if(localShaderConstants)   // Constant may be known at compile time
+			{
+				for(int j = 0; j < shader->getLength(); j++)   // Scan the shader for a DEF instruction targeting register i
+				{
+					const Shader::Instruction &instruction = *shader->getInstruction(j);
+
+					if(instruction.opcode == Shader::OPCODE_DEF)
+					{
+						if(instruction.dst.index == i)
+						{
+							c.x = Float4(instruction.src[0].value[0]);   // Use the literal values from the DEF instead of the loaded ones
+							c.y = Float4(instruction.src[0].value[1]);
+							c.z = Float4(instruction.src[0].value[2]);
+							c.w = Float4(instruction.src[0].value[3]);
+
+							break;   // First matching DEF wins
+						}
+					}
+				}
+			}
+		}
+		else if(src.rel.type == Shader::PARAMETER_LOOP)   // Relative to the loop counter of the current loop depth
+		{
+			Int loopCounter = r.aL[r.loopDepth];
+
+			c.x = c.y = c.z = c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c[i]) + loopCounter * 16);   // 16 is presumably the byte size of one float4 constant - TODO confirm
+
+			c.x = c.x.xxxx;
+			c.y = c.y.yyyy;
+			c.z = c.z.zzzz;
+			c.w = c.w.wwww;
+		}
+		else
+		{
+			if(src.rel.deterministic)   // A single scalar offset from relativeAddress() is used for all lanes
+			{
+				Int a = relativeAddress(r, src);
+			
+				c.x = c.y = c.z = c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c[i]) + a * 16);   // NOTE(review): unlike the path below, this offset is not clamped here
+
+				c.x = c.x.xxxx;
+				c.y = c.y.yyyy;
+				c.z = c.z.zzzz;
+				c.w = c.w.wwww;
+			}
+			else   // Each lane may address a different constant
+			{
+				int component = src.rel.swizzle & 0x03;   // Low two swizzle bits pick the component of the relative register
+				Float4 a;
+
+				switch(src.rel.type)   // Fetch the per-lane offset from the register named by src.rel
+				{
+				case Shader::PARAMETER_ADDR:   a = r.a0[component]; break;
+				case Shader::PARAMETER_TEMP:   a = r.r[src.rel.index][component]; break;
+				case Shader::PARAMETER_INPUT:  a = r.v[src.rel.index][component]; break;
+				case Shader::PARAMETER_OUTPUT: a = r.o[src.rel.index][component]; break;
+				case Shader::PARAMETER_CONST:  a = Float4(*Pointer<Float>(r.data + OFFSET(DrawData,vs.c[src.rel.index][component]))); break;
+				default: ASSERT(false);
+				}
+
+				Int4 index = Int4(i) + RoundInt(a) * Int4(src.rel.scale);   // Per-lane constant register index
+
+				index = Min(As<UInt4>(index), UInt4(256));   // Clamp to constant register range, c[256] = {0, 0, 0, 0}
+				
+				Int index0 = Extract(index, 0);   // One scalar index per lane
+				Int index1 = Extract(index, 1);
+				Int index2 = Extract(index, 2);
+				Int index3 = Extract(index, 3);
+
+				c.x = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index0 * 16, 16);   // Here c.x..c.w temporarily hold the full constant fetched for lanes 0..3
+				c.y = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index1 * 16, 16);
+				c.z = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index2 * 16, 16);
+				c.w = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c) + index3 * 16, 16);
+
+				transpose4x4(c.x, c.y, c.z, c.w);   // Regroup so c.x holds the x components of all four lanes, and so on
+			}
+		}
+
+		return c;
+	}
+
+	Int VertexProgram::relativeAddress(Registers &r, const Shader::Parameter &var)   // Returns the scalar register offset described by var.rel: rounded value of the relative register times rel.scale
+	{
+		ASSERT(var.rel.deterministic);   // Only lane 0 is read below, so a single offset is used for all lanes
+
+		if(var.rel.type == Shader::PARAMETER_TEMP)
+		{
+			return RoundInt(Extract(r.r[var.rel.index].x, 0)) * var.rel.scale;   // NOTE(review): always reads .x; readConstant's non-deterministic path selects the component from rel.swizzle - confirm intended
+		}
+		else if(var.rel.type == Shader::PARAMETER_INPUT)
+		{
+			return RoundInt(Extract(r.v[var.rel.index].x, 0)) * var.rel.scale;
+		}
+		else if(var.rel.type == Shader::PARAMETER_OUTPUT)
+		{
+			return RoundInt(Extract(r.o[var.rel.index].x, 0)) * var.rel.scale;
+		}
+		else if(var.rel.type == Shader::PARAMETER_CONST)
+		{
+			RValue<Float4> c = *Pointer<Float4>(r.data + OFFSET(DrawData,vs.c[var.rel.index]));   // Float constant read from the draw data
+
+			return RoundInt(Extract(c, 0)) * var.rel.scale;   // Element 0 of the constant, i.e. its first component
+		}
+		else ASSERT(false);   // Any other relative register type is not handled here
+
+		return 0;   // Reached only for an unhandled type (after the ASSERT above)
+	}
+
+	Int4 VertexProgram::enableMask(Registers &r, const Shader::Instruction *instruction)   // Builds the per-lane write mask for 'instruction' from the branch, break, continue and leave masks
+	{
+		Int4 enable = instruction->analysisBranch ? Int4(r.enableStack[r.enableIndex]) : Int4(0xFFFFFFFF);   // Start from the current enable stack entry only when the instruction is flagged analysisBranch; otherwise all lanes
+					
+		if(shader->containsBreakInstruction() && !whileTest && instruction->analysisBreak)   // Break mask is skipped while whileTest is set
+		{
+			enable &= r.enableBreak;
+		}
+
+		if(shader->containsContinueInstruction() && !whileTest && instruction->analysisContinue)   // Continue mask is likewise skipped while whileTest is set
+		{
+			enable &= r.enableContinue;
+		}
+
+		if(shader->containsLeaveInstruction() && instruction->analysisLeave)   // Leave mask applies regardless of whileTest
+		{
+			enable &= r.enableLeave;
+		}
+
+		return enable;
+	}
+
+	void VertexProgram::M3X2(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1)
+	{
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
 
 		dst.x = dot3(src0, row0);
 		dst.y = dot3(src0, row1);
 	}
 
-	void VertexProgram::M3X3(Registers &r, Color4f &dst, Color4f &src0, Src &src1)
+	void VertexProgram::M3X3(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
 
 		dst.x = dot3(src0, row0);
 		dst.y = dot3(src0, row1);
 		dst.z = dot3(src0, row2);
 	}
 
-	void VertexProgram::M3X4(Registers &r, Color4f &dst, Color4f &src0, Src &src1)
+	void VertexProgram::M3X4(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
-		Color4f row3 = reg(r, src1, 3);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
+		Vector4f row3 = reg(r, src1, 3);
 
 		dst.x = dot3(src0, row0);
 		dst.y = dot3(src0, row1);
@@ -672,23 +902,23 @@
 		dst.w = dot3(src0, row3);
 	}
 
-	void VertexProgram::M4X3(Registers &r, Color4f &dst, Color4f &src0, Src &src1)
+	void VertexProgram::M4X3(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
 
 		dst.x = dot4(src0, row0);
 		dst.y = dot4(src0, row1);
 		dst.z = dot4(src0, row2);
 	}
 
-	void VertexProgram::M4X4(Registers &r, Color4f &dst, Color4f &src0, Src &src1)
+	void VertexProgram::M4X4(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1)
 	{
-		Color4f row0 = reg(r, src1, 0);
-		Color4f row1 = reg(r, src1, 1);
-		Color4f row2 = reg(r, src1, 2);
-		Color4f row3 = reg(r, src1, 3);
+		Vector4f row0 = reg(r, src1, 0);
+		Vector4f row1 = reg(r, src1, 1);
+		Vector4f row2 = reg(r, src1, 2);
+		Vector4f row3 = reg(r, src1, 3);
 
 		dst.x = dot4(src0, row0);
 		dst.y = dot4(src0, row1);
@@ -703,6 +933,7 @@
 
 		if(breakDepth == 0)
 		{
+			r.enableIndex = r.enableIndex - breakDepth;
 			Nucleus::createBr(endBlock);
 		}
 		else
@@ -710,49 +941,47 @@
 			r.enableBreak = r.enableBreak & ~r.enableStack[r.enableIndex];
 			Bool allBreak = SignMask(r.enableBreak) == 0x0;
 
+			r.enableIndex = r.enableIndex - breakDepth;
 			branch(allBreak, endBlock, deadBlock);
 		}
 
 		Nucleus::setInsertBlock(deadBlock);
+		r.enableIndex = r.enableIndex + breakDepth;
 	}
 
-	void VertexProgram::BREAKC(Registers &r, Color4f &src0, Color4f &src1, Control control)
+	void VertexProgram::BREAKC(Registers &r, Vector4f &src0, Vector4f &src1, Control control)
 	{
 		Int4 condition;
 
 		switch(control)
 		{
-		case Op::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
-		case Op::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
-		case Op::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
-		case Op::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
-		case Op::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
-		case Op::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
+		case Shader::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
+		case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
+		case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
+		case Shader::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
+		case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
+		case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
 		default:
 			ASSERT(false);
 		}
 
-		condition &= r.enableStack[r.enableIndex];
-
-		llvm::BasicBlock *continueBlock = Nucleus::createBasicBlock();
-		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth - 1];
-
-		r.enableBreak = r.enableBreak & ~condition;
-		Bool allBreak = SignMask(r.enableBreak) == 0x0;
-
-		branch(allBreak, endBlock, continueBlock);
-		Nucleus::setInsertBlock(continueBlock);
+		BREAK(r, condition);
 	}
 
 	void VertexProgram::BREAKP(Registers &r, const Src &predicateRegister)   // FIXME: Factor out parts common with BREAKC
 	{
 		Int4 condition = As<Int4>(r.p0[predicateRegister.swizzle & 0x3]);
 
-		if(predicateRegister.modifier == Src::MODIFIER_NOT)
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = ~condition;
 		}
 
+		BREAK(r, condition);
+	}
+
+	void VertexProgram::BREAK(Registers &r, Int4 &condition)
+	{
 		condition &= r.enableStack[r.enableIndex];
 
 		llvm::BasicBlock *continueBlock = Nucleus::createBasicBlock();
@@ -761,44 +990,61 @@
 		r.enableBreak = r.enableBreak & ~condition;
 		Bool allBreak = SignMask(r.enableBreak) == 0x0;
 
+		r.enableIndex = r.enableIndex - breakDepth;
 		branch(allBreak, endBlock, continueBlock);
+
 		Nucleus::setInsertBlock(continueBlock);
+		r.enableIndex = r.enableIndex + breakDepth;
 	}
 
-	void VertexProgram::CALL(Registers &r, int labelIndex)
+	void VertexProgram::CONTINUE(Registers &r)   // Masks out of enableContinue every lane that is enabled at the current nesting level
+	{
+		r.enableContinue = r.enableContinue & ~r.enableStack[r.enableIndex];   // No branch is emitted; only the continue mask changes
+	}
+
+	void VertexProgram::TEST()   // Sets the whileTest flag; no code is emitted
+	{
+		whileTest = true;   // While set, enableMask() ignores the break and continue masks; ENDWHILE() clears it
+	}
+
+	void VertexProgram::CALL(Registers &r, int labelIndex, int callSiteIndex)
 	{
 		if(!labelBlock[labelIndex])
 		{
 			labelBlock[labelIndex] = Nucleus::createBasicBlock();
 		}
 
-		llvm::BasicBlock *retBlock = Nucleus::createBasicBlock();
-		callRetBlock.push_back(retBlock);
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			r.callStack[r.stackIndex++] = UInt(callSiteIndex);
+		}
 
-		r.callStack[r.stackIndex++] = UInt((unsigned int)callRetBlock.size() - 1);   // FIXME
+		Int4 restoreLeave = r.enableLeave;
 
 		Nucleus::createBr(labelBlock[labelIndex]);
-		Nucleus::setInsertBlock(retBlock);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		r.enableLeave = restoreLeave;
 	}
 
-	void VertexProgram::CALLNZ(Registers &r, int labelIndex, const Src &src)
+	void VertexProgram::CALLNZ(Registers &r, int labelIndex, int callSiteIndex, const Src &src)
 	{
-		if(src.type == Src::PARAMETER_CONSTBOOL)
+		if(src.type == Shader::PARAMETER_CONSTBOOL)
 		{
-			CALLNZb(r, labelIndex, src);
+			CALLNZb(r, labelIndex, callSiteIndex, src);
 		}
-		else if(src.type == Src::PARAMETER_PREDICATE)
+		else if(src.type == Shader::PARAMETER_PREDICATE)
 		{
-			CALLNZp(r, labelIndex, src);
+			CALLNZp(r, labelIndex, callSiteIndex, src);
 		}
 		else ASSERT(false);
 	}
 
-	void VertexProgram::CALLNZb(Registers &r, int labelIndex, const Src &boolRegister)
+	void VertexProgram::CALLNZb(Registers &r, int labelIndex, int callSiteIndex, const Src &boolRegister)
 	{
 		Bool condition = (*Pointer<Byte>(r.data + OFFSET(DrawData,vs.b[boolRegister.index])) != Byte(0));   // FIXME
 		
-		if(boolRegister.modifier == Src::MODIFIER_NOT)
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = !condition;	
 		}
@@ -808,20 +1054,24 @@
 			labelBlock[labelIndex] = Nucleus::createBasicBlock();
 		}
 
-		llvm::BasicBlock *retBlock = Nucleus::createBasicBlock();
-		callRetBlock.push_back(retBlock);
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			r.callStack[r.stackIndex++] = UInt(callSiteIndex);
+		}
 
-		r.callStack[r.stackIndex++] = UInt((int)callRetBlock.size() - 1);   // FIXME
+		Int4 restoreLeave = r.enableLeave;
 
-		branch(condition, labelBlock[labelIndex], retBlock);
-		Nucleus::setInsertBlock(retBlock);
+		branch(condition, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
+
+		r.enableLeave = restoreLeave;
 	}
 
-	void VertexProgram::CALLNZp(Registers &r, int labelIndex, const Src &predicateRegister)
+	void VertexProgram::CALLNZp(Registers &r, int labelIndex, int callSiteIndex, const Src &predicateRegister)
 	{
 		Int4 condition = As<Int4>(r.p0[predicateRegister.swizzle & 0x3]);
 
-		if(predicateRegister.modifier == Src::MODIFIER_NOT)
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = ~condition;
 		}
@@ -833,20 +1083,21 @@
 			labelBlock[labelIndex] = Nucleus::createBasicBlock();
 		}
 
-		llvm::BasicBlock *retBlock = Nucleus::createBasicBlock();
-		callRetBlock.push_back(retBlock);
-
-		r.callStack[r.stackIndex++] = UInt((int)callRetBlock.size() - 1);   // FIXME
+		if(callRetBlock[labelIndex].size() > 1)
+		{
+			r.callStack[r.stackIndex++] = UInt(callSiteIndex);
+		}
 
 		r.enableIndex++;
 		r.enableStack[r.enableIndex] = condition;
+		Int4 restoreLeave = r.enableLeave;
 
-		Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
-
-		branch(notAllFalse, labelBlock[labelIndex], retBlock);
-		Nucleus::setInsertBlock(retBlock);
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, labelBlock[labelIndex], callRetBlock[labelIndex][callSiteIndex]);
+		Nucleus::setInsertBlock(callRetBlock[labelIndex][callSiteIndex]);
 
 		r.enableIndex--;
+		r.enableLeave = restoreLeave;
 	}
 
 	void VertexProgram::ELSE(Registers &r)
@@ -859,7 +1110,7 @@
 		if(isConditionalIf[ifDepth])
 		{
 			Int4 condition = ~r.enableStack[r.enableIndex] & r.enableStack[r.enableIndex - 1];
-			Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
+			Bool notAllFalse = SignMask(condition) != 0;
 
 			branch(notAllFalse, falseBlock, endBlock);
 
@@ -892,20 +1143,6 @@
 		}
 	}
 
-	void VertexProgram::ENDREP(Registers &r)
-	{
-		loopRepDepth--;
-
-		llvm::BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
-		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
-
-		Nucleus::createBr(testBlock);
-		Nucleus::setInsertBlock(endBlock);
-
-		r.loopDepth--;
-		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-	}
-
 	void VertexProgram::ENDLOOP(Registers &r)
 	{
 		loopRepDepth--;
@@ -922,17 +1159,50 @@
 		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 	}
 
+	void VertexProgram::ENDREP(Registers &r)   // Closes the innermost loop/rep: branches back to its test block and continues emitting in its end block
+	{
+		loopRepDepth--;   // Pop the code-generation nesting level first so it indexes the block arrays below
+
+		llvm::BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);   // Back edge to the test block
+		Nucleus::setInsertBlock(endBlock);   // Subsequent code is emitted after the loop
+
+		r.loopDepth--;   // Pop the loop depth kept in the generated routine's registers
+		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // Reset the break mask to all lanes enabled
+	}
+
+	void VertexProgram::ENDWHILE(Registers &r)   // Closes the innermost while loop: branches back to its test block and continues emitting in its end block
+	{
+		loopRepDepth--;   // Pop the code-generation nesting level first so it indexes the block arrays below
+
+		llvm::BasicBlock *testBlock = loopRepTestBlock[loopRepDepth];
+		llvm::BasicBlock *endBlock = loopRepEndBlock[loopRepDepth];
+
+		Nucleus::createBr(testBlock);   // Back edge to the test block
+		Nucleus::setInsertBlock(endBlock);   // Subsequent code is emitted after the loop
+
+		r.enableIndex--;   // Undoes the r.enableIndex++ performed by WHILE()
+		r.enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // Reset the break mask to all lanes enabled
+		whileTest = false;   // Clear the flag set by TEST()
+	}
+
 	void VertexProgram::IF(Registers &r, const Src &src)
 	{
-		if(src.type == Src::PARAMETER_CONSTBOOL)
+		if(src.type == Shader::PARAMETER_CONSTBOOL)
 		{
 			IFb(r, src);
 		}
-		else if(src.type == Src::PARAMETER_PREDICATE)
+		else if(src.type == Shader::PARAMETER_PREDICATE)
 		{
 			IFp(r, src);
 		}
-		else ASSERT(false);
+		else
+		{
+			Int4 condition = As<Int4>(reg(r, src).x);
+			IF(r, condition);
+		}
 	}
 
 	void VertexProgram::IFb(Registers &r, const Src &boolRegister)
@@ -941,9 +1211,9 @@
 
 		Bool condition = (*Pointer<Byte>(r.data + OFFSET(DrawData,vs.b[boolRegister.index])) != Byte(0));   // FIXME
 
-		if(boolRegister.modifier == Src::MODIFIER_NOT)
+		if(boolRegister.modifier == Shader::MODIFIER_NOT)
 		{
-			condition = !condition;	
+			condition = !condition;
 		}
 
 		llvm::BasicBlock *trueBlock = Nucleus::createBasicBlock();
@@ -957,50 +1227,39 @@
 		ifDepth++;
 	}
 
-	void VertexProgram::IFp(Registers &r, const Src &predicateRegister)   // FIXME: Factor out parts common with IFC
+	void VertexProgram::IFp(Registers &r, const Src &predicateRegister)
 	{
 		Int4 condition = As<Int4>(r.p0[predicateRegister.swizzle & 0x3]);
 
-		if(predicateRegister.modifier == Src::MODIFIER_NOT)
+		if(predicateRegister.modifier == Shader::MODIFIER_NOT)
 		{
 			condition = ~condition;
 		}
 
-		condition &= r.enableStack[r.enableIndex];
-
-		r.enableIndex++;
-		r.enableStack[r.enableIndex] = condition;
-
-		llvm::BasicBlock *trueBlock = Nucleus::createBasicBlock();
-		llvm::BasicBlock *falseBlock = Nucleus::createBasicBlock();
-
-		Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
-
-		branch(notAllFalse, trueBlock, falseBlock);
-
-		isConditionalIf[ifDepth] = true;
-		ifFalseBlock[ifDepth] = falseBlock;
-
-		ifDepth++;
-		breakDepth++;
+		IF(r, condition);
 	}
 
-	void VertexProgram::IFC(Registers &r, Color4f &src0, Color4f &src1, Control control)
+	void VertexProgram::IFC(Registers &r, Vector4f &src0, Vector4f &src1, Control control)
 	{
 		Int4 condition;
 
 		switch(control)
 		{
-		case Op::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
-		case Op::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
-		case Op::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
-		case Op::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
-		case Op::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
-		case Op::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
+		case Shader::CONTROL_GT: condition = CmpNLE(src0.x,  src1.x);	break;
+		case Shader::CONTROL_EQ: condition = CmpEQ(src0.x, src1.x);		break;
+		case Shader::CONTROL_GE: condition = CmpNLT(src0.x, src1.x);	break;
+		case Shader::CONTROL_LT: condition = CmpLT(src0.x,  src1.x);	break;
+		case Shader::CONTROL_NE: condition = CmpNEQ(src0.x, src1.x);	break;
+		case Shader::CONTROL_LE: condition = CmpLE(src0.x, src1.x);		break;
 		default:
 			ASSERT(false);
 		}
 
+		IF(r, condition);
+	}
+
+	void VertexProgram::IF(Registers &r, Int4 &condition)
+	{
 		condition &= r.enableStack[r.enableIndex];
 
 		r.enableIndex++;
@@ -1009,7 +1268,7 @@
 		llvm::BasicBlock *trueBlock = Nucleus::createBasicBlock();
 		llvm::BasicBlock *falseBlock = Nucleus::createBasicBlock();
 
-		Bool notAllFalse = SignMask(condition & r.enableBreak) != 0;
+		Bool notAllFalse = SignMask(condition) != 0;
 
 		branch(notAllFalse, trueBlock, falseBlock);
 
@@ -1022,7 +1281,13 @@
 
 	void VertexProgram::LABEL(int labelIndex)
 	{
+		if(!labelBlock[labelIndex])   // Create the label's basic block the first time it is needed
+		{
+			labelBlock[labelIndex] = Nucleus::createBasicBlock();
+		}
+
 		Nucleus::setInsertBlock(labelBlock[labelIndex]);
+		currentLabel = labelIndex;   // RET uses this to index callRetBlock
 	}
 
 	void VertexProgram::LOOP(Registers &r, const Src &integerRegister)
@@ -1086,27 +1351,73 @@
 		breakDepth = 0;
 	}
 
+	void VertexProgram::WHILE(Registers &r, const Src &temporaryRegister)   // Opens a while loop whose condition is the x component of temporaryRegister
+	{
+		r.enableIndex++;
+		// The new enable-stack slot receives the loop's execution mask in the test block below
+		llvm::BasicBlock *loopBlock = Nucleus::createBasicBlock();
+		llvm::BasicBlock *testBlock = Nucleus::createBasicBlock();
+		llvm::BasicBlock *endBlock = Nucleus::createBasicBlock();
+		// Record the test and end blocks for this loop nesting level
+		loopRepTestBlock[loopRepDepth] = testBlock;
+		loopRepEndBlock[loopRepDepth] = endBlock;
+		// Snapshot the break/continue masks in effect when the loop is entered
+		Int4 restoreBreak = r.enableBreak;
+		Int4 restoreContinue = r.enableContinue;
+
+		// FIXME: jump(testBlock)
+		Nucleus::createBr(testBlock);
+		Nucleus::setInsertBlock(testBlock);
+		r.enableContinue = restoreContinue;
+		// reg() returns by value; hold the value instead of binding the temporary to a non-const reference
+		Vector4f src = reg(r, temporaryRegister);
+		Int4 condition = As<Int4>(src.x);
+		condition &= r.enableStack[r.enableIndex - 1];   // Only lanes enabled outside the loop may run it
+		r.enableStack[r.enableIndex] = condition;
+		// Enter the body while any lane is still active, otherwise go to the end block
+		Bool notAllFalse = SignMask(condition) != 0;
+		branch(notAllFalse, loopBlock, endBlock);
+		// End block: restore the break mask saved on entry
+		Nucleus::setInsertBlock(endBlock);
+		r.enableBreak = restoreBreak;
+		// Instructions that follow are emitted into the loop body
+		Nucleus::setInsertBlock(loopBlock);
+
+		loopRepDepth++;
+		breakDepth = 0;
+	}
+
 	void VertexProgram::RET(Registers &r)
 	{
-		if(!returns)
+		if(currentLabel == -1)
 		{
 			returnBlock = Nucleus::createBasicBlock();
 			Nucleus::createBr(returnBlock);
-
-			returns = true;
 		}
 		else
 		{
-			// FIXME: Encapsulate
-			UInt index = r.callStack[--r.stackIndex];
- 
 			llvm::BasicBlock *unreachableBlock = Nucleus::createBasicBlock();
-			llvm::Value *value = Nucleus::createLoad(index.address);
-			llvm::Value *switchInst = Nucleus::createSwitch(value, unreachableBlock, (int)callRetBlock.size());
 
-			for(unsigned int i = 0; i < callRetBlock.size(); i++)
+			if(callRetBlock[currentLabel].size() > 1)   // Pop the return destination from the call stack
 			{
-				Nucleus::addSwitchCase(switchInst, i, callRetBlock[i]);
+				// FIXME: Encapsulate
+				UInt index = r.callStack[--r.stackIndex];
+ 
+				llvm::Value *value = Nucleus::createLoad(index.address);
+				llvm::Value *switchInst = Nucleus::createSwitch(value, unreachableBlock, (int)callRetBlock[currentLabel].size());
+
+				for(unsigned int i = 0; i < callRetBlock[currentLabel].size(); i++)
+				{
+					Nucleus::addSwitchCase(switchInst, i, callRetBlock[currentLabel][i]);
+				}
+			}
+			else if(callRetBlock[currentLabel].size() == 1)   // Jump directly to the unique return destination
+			{
+				Nucleus::createBr(callRetBlock[currentLabel][0]);
+			}
+			else   // Function isn't called
+			{
+				Nucleus::createBr(unreachableBlock);
 			}
 
 			Nucleus::setInsertBlock(unreachableBlock);
@@ -1114,17 +1425,60 @@
 		}
 	}
 
-	void VertexProgram::TEXLDL(Registers &r, Color4f &dst, Color4f &src0, const Src &src1)
+	void VertexProgram::LEAVE(Registers &r)   // Updates the leave mask only; no branch is emitted yet (see FIXMEs)
 	{
-		Pointer<Byte> texture = r.data + OFFSET(DrawData,mipmap[16]) + src1.index * sizeof(Texture);
+		r.enableLeave = r.enableLeave & ~r.enableStack[r.enableIndex];   // Clear the bits of the lanes enabled at the current nesting level
 
-		Color4f tmp;
+		// FIXME: Return from function if all instances left
+		// FIXME: Use enableLeave in other control-flow constructs
+	}
 
-		sampler[src1.index]->sampleTexture(texture, tmp, src0.x, src0.y, src0.z, src0.w, src0, src0, false, false, true);
+	void VertexProgram::TEXLDL(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)   // Samples with src0.w as the fourth coordinate, then applies src1's swizzle
+	{
+		Vector4f tmp;   // Unswizzled sample
+		sampleTexture(r, tmp, src1, src0.x, src0.y, src0.z, src0.w);
 
 		dst.x = tmp[(src1.swizzle >> 0) & 0x3];
 		dst.y = tmp[(src1.swizzle >> 2) & 0x3];
 		dst.z = tmp[(src1.swizzle >> 4) & 0x3];
 		dst.w = tmp[(src1.swizzle >> 6) & 0x3];
 	}
+
+	void VertexProgram::TEX(Registers &r, Vector4f &dst, Vector4f &src0, const Src &src1)   // Samples with the fourth coordinate (lod) forced to zero
+	{
+		Float4 zeroLod = Float4(0.0f);
+		Vector4f texel;   // Unswizzled sample
+		sampleTexture(r, texel, src1, src0.x, src0.y, src0.z, zeroLod);
+		const unsigned int swizzle = src1.swizzle;   // Two bits per destination component
+		dst.x = texel[(swizzle >> 0) & 0x3];
+		dst.y = texel[(swizzle >> 2) & 0x3];
+		dst.z = texel[(swizzle >> 4) & 0x3];
+		dst.w = texel[(swizzle >> 6) & 0x3];
+	}
+
+	void VertexProgram::sampleTexture(Registers &r, Vector4f &c, const Src &s, Float4 &u, Float4 &v, Float4 &w, Float4 &q)   // Samples the texture selected by 's' at (u, v, w, q) into 'c'
+	{
+		if(s.type == Shader::PARAMETER_SAMPLER && s.rel.type == Shader::PARAMETER_VOID)   // Sampler register with no 'rel' part: s.index selects the sampler directly
+		{
+			Pointer<Byte> texture = r.data + OFFSET(DrawData,mipmap[16]) + s.index * sizeof(Texture);
+			sampler[s.index]->sampleTexture(texture, c, u, v, w, q, r.a0, r.a0, false, false, true);
+		}
+		else
+		{
+			Int index = As<Int>(Float(reg(r, s).x.x));   // Sampler number: first lane of the register's x component, reinterpreted as an integer
+
+			for(int i = 0; i < (int)(sizeof(sampler) / sizeof(sampler[0])); i++)   // Bound taken from sampler[] itself; the previous literal 16 exceeded the array size
+			{
+				if(shader->usesSampler(i))   // Only emit code for samplers the shader uses
+				{
+					If(index == i)
+					{
+						Pointer<Byte> texture = r.data + OFFSET(DrawData,mipmap[16]) + i * sizeof(Texture);
+						sampler[i]->sampleTexture(texture, c, u, v, w, q, r.a0, r.a0, false, false, true);
+						// FIXME: When the sampler states are the same, we could use one sampler and just index the texture
+					}
+				}
+			}
+		}
+	}
 }
diff --git a/src/Shader/VertexProgram.hpp b/src/Shader/VertexProgram.hpp
index bae97f3..7423e5b 100644
--- a/src/Shader/VertexProgram.hpp
+++ b/src/Shader/VertexProgram.hpp
@@ -1,94 +1,104 @@
-// SwiftShader Software Renderer
-//
-// Copyright(c) 2005-2011 TransGaming Inc.
-//
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
-//
-
-#ifndef sw_VertexProgram_hpp
-#define sw_VertexProgram_hpp
-
-#include "VertexRoutine.hpp"
-#include "ShaderCore.hpp"
-
-#include "Stream.hpp"
-#include "Types.hpp"
-
-namespace sw
-{
-	struct Stream;
-	class VertexShader;
-	class SamplerCore;
-
-	class VertexProgram : public VertexRoutine, public ShaderCore
-	{
-	public:
-		VertexProgram(const VertexProcessor::State &state, const VertexShader *vertexShader);
-
-		virtual ~VertexProgram();
-
-	private:
-		typedef Shader::Instruction::DestinationParameter Dst;
-		typedef Shader::Instruction::SourceParameter Src;
-		typedef Shader::Instruction::Operation Op;
-		typedef Shader::Instruction::Operation::Control Control;
-		typedef Shader::Instruction::Operation::Usage Usage;
-
-		Color4f readConstant(Registers &r, const Src &src, int offset = 0);
-		void pipeline(Registers &r);
-		void shader(Registers &r);
-		void passThrough(Registers &r);
-
-		Color4f reg(Registers &r, const Src &src, int offset = 0);
-
-		void M3X2(Registers &r, Color4f &dst, Color4f &src0, Src &src1);
-		void M3X3(Registers &r, Color4f &dst, Color4f &src0, Src &src1);
-		void M3X4(Registers &r, Color4f &dst, Color4f &src0, Src &src1);
-		void M4X3(Registers &r, Color4f &dst, Color4f &src0, Src &src1);
-		void M4X4(Registers &r, Color4f &dst, Color4f &src0, Src &src1);
-		void BREAK(Registers &r);
-		void BREAKC(Registers &r, Color4f &src0, Color4f &src1, Control);
-		void BREAKP(Registers &r, const Src &predicateRegister);
-		void CALL(Registers &r, int labelIndex);
-		void CALLNZ(Registers &r, int labelIndex, const Src &src);
-		void CALLNZb(Registers &r, int labelIndex, const Src &boolRegister);
-		void CALLNZp(Registers &r, int labelIndex, const Src &predicateRegister);
-		void ELSE(Registers &r);
-		void ENDIF(Registers &r);
-		void ENDLOOP(Registers &r);
-		void ENDREP(Registers &r);
-		void IF(Registers &r, const Src &src);
-		void IFb(Registers &r, const Src &boolRegister);
-		void IFp(Registers &r, const Src &predicateRegister);
-		void IFC(Registers &r, Color4f &src0, Color4f &src1, Control);
-		void LABEL(int labelIndex);
-		void LOOP(Registers &r, const Src &integerRegister);
-		void REP(Registers &r, const Src &integerRegister);
-		void RET(Registers &r);
-		void TEXLDL(Registers &r, Color4f &dst, Color4f &src, const Src&);
-
-		SamplerCore *sampler[4];
-
-		bool returns;
-		int ifDepth;
-		int loopRepDepth;
-		int breakDepth;
-
-		// FIXME: Get rid of llvm::
-		llvm::BasicBlock *ifFalseBlock[24 + 24];
-		llvm::BasicBlock *loopRepTestBlock[4];
-		llvm::BasicBlock *loopRepEndBlock[4];
-		llvm::BasicBlock *labelBlock[2048];
-		std::vector<llvm::BasicBlock*> callRetBlock;
-		llvm::BasicBlock *returnBlock;
-		bool isConditionalIf[24 + 24];
-
-		const VertexShader *const vertexShader;
-	};
-}
-
-#endif   // sw_VertexProgram_hpp
+// SwiftShader Software Renderer

+//

+// Copyright(c) 2005-2012 TransGaming Inc.

+//

+// All rights reserved. No part of this software may be copied, distributed, transmitted,

+// transcribed, stored in a retrieval system, translated into any human or computer

+// language by any means, or disclosed to third parties without the explicit written

+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express

+// or implied, including but not limited to any patent rights, are granted to you.

+//

+

+#ifndef sw_VertexProgram_hpp

+#define sw_VertexProgram_hpp

+

+#include "VertexRoutine.hpp"

+#include "ShaderCore.hpp"

+

+#include "Stream.hpp"

+#include "Types.hpp"

+

+namespace sw

+{

+	struct Stream;

+	class VertexShader;

+	class SamplerCore;

+

+	class VertexProgram : public VertexRoutine, public ShaderCore   // Emits the vertex routine code for a vertex shader; one method per control-flow/texture instruction

+	{

+	public:

+		VertexProgram(const VertexProcessor::State &state, const VertexShader *vertexShader);

+

+		virtual ~VertexProgram();

+

+	private:

+		typedef Shader::DestinationParameter Dst;

+		typedef Shader::SourceParameter Src;

+		typedef Shader::Control Control;

+		typedef Shader::Usage Usage;

+

+		void pipeline(Registers &r);   // Implements VertexRoutine::pipeline

+		void program(Registers &r);

+		void passThrough(Registers &r);

+

+		Vector4f reg(Registers &r, const Src &src, int offset = 0);   // Returns the register contents by value

+		Vector4f readConstant(Registers &r, const Src &src, int offset = 0);

+		Int relativeAddress(Registers &r, const Shader::Parameter &var);

+		Int4 enableMask(Registers &r, const Shader::Instruction *instruction);

+

+		void M3X2(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1);

+		void M3X3(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1);

+		void M3X4(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1);

+		void M4X3(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1);

+		void M4X4(Registers &r, Vector4f &dst, Vector4f &src0, Src &src1);

+		void BREAK(Registers &r);

+		void BREAKC(Registers &r, Vector4f &src0, Vector4f &src1, Control);

+		void BREAKP(Registers &r, const Src &predicateRegister);

+		void BREAK(Registers &r, Int4 &condition);

+		void CONTINUE(Registers &r);

+		void TEST();

+		void CALL(Registers &r, int labelIndex, int callSiteIndex);

+		void CALLNZ(Registers &r, int labelIndex, int callSiteIndex, const Src &src);

+		void CALLNZb(Registers &r, int labelIndex, int callSiteIndex, const Src &boolRegister);

+		void CALLNZp(Registers &r, int labelIndex, int callSiteIndex, const Src &predicateRegister);

+		void ELSE(Registers &r);

+		void ENDIF(Registers &r);

+		void ENDLOOP(Registers &r);

+		void ENDREP(Registers &r);

+		void ENDWHILE(Registers &r);

+		void IF(Registers &r, const Src &src);

+		void IFb(Registers &r, const Src &boolRegister);

+		void IFp(Registers &r, const Src &predicateRegister);

+		void IFC(Registers &r, Vector4f &src0, Vector4f &src1, Control);

+		void IF(Registers &r, Int4 &condition);   // Common mask/branch setup called by IFp and IFC

+		void LABEL(int labelIndex);

+		void LOOP(Registers &r, const Src &integerRegister);

+		void REP(Registers &r, const Src &integerRegister);

+		void WHILE(Registers &r, const Src &temporaryRegister);

+		void RET(Registers &r);

+		void LEAVE(Registers &r);

+		void TEXLDL(Registers &r, Vector4f &dst, Vector4f &src, const Src&);

+		void TEX(Registers &r, Vector4f &dst, Vector4f &src, const Src&);

+

+		void sampleTexture(Registers &r, Vector4f &c, const Src &s, Float4 &u, Float4 &v, Float4 &w, Float4 &q);   // Shared by TEXLDL and TEX

+

+		SamplerCore *sampler[4];   // Indexed by sampler register number in sampleTexture()

+

+		int ifDepth;

+		int loopRepDepth;   // Index into loopRepTestBlock/loopRepEndBlock

+		int breakDepth;

+		int currentLabel;   // Set by LABEL(); RET treats -1 as "not inside a label"

+		bool whileTest;

+

+		// FIXME: Get rid of llvm::

+		llvm::BasicBlock *ifFalseBlock[24 + 24];

+		llvm::BasicBlock *loopRepTestBlock[4];

+		llvm::BasicBlock *loopRepEndBlock[4];

+		llvm::BasicBlock *labelBlock[2048];   // One basic block per label, created on demand in LABEL()

+		std::vector<llvm::BasicBlock*> callRetBlock[2048];   // Return destinations, indexed [label][call site]

+		llvm::BasicBlock *returnBlock;   // Created by RET when currentLabel is -1

+		bool isConditionalIf[24 + 24];

+	};

+}

+

+#endif   // sw_VertexProgram_hpp

diff --git a/src/Shader/VertexRoutine.cpp b/src/Shader/VertexRoutine.cpp
index f28a741..1c2be07 100644
--- a/src/Shader/VertexRoutine.cpp
+++ b/src/Shader/VertexRoutine.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -20,7 +20,10 @@
 
 namespace sw
 {
-	VertexRoutine::VertexRoutine(const VertexProcessor::State &state) : state(state)
+	extern bool halfIntegerCoordinates;     // Pixel centers are not at integer coordinates
+	extern bool symmetricNormalizedDepth;   // [-1, 1] instead of [0, 1]
+
+	VertexRoutine::VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader) : state(state), shader(shader)   // The routine itself is built later by generate()
 	{
 		routine = 0;
 	}
@@ -46,7 +49,7 @@
 
 			UInt count = *Pointer<UInt>(task+ OFFSET(VertexTask,count));
 
-			Registers r;
+			Registers r(shader);
 			r.data = data;
 			r.constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
 
@@ -82,7 +85,7 @@
 			Return();
 		}
 
-		routine = function(L"VertexRoutine_%0.16llX", state.shaderHash);
+		routine = function(L"VertexRoutine_%0.8X", state.shaderID);
 	}
 
 	Routine *VertexRoutine::getRoutine()
@@ -108,41 +111,41 @@
 		// Backtransform
 		if(state.preTransformed)
 		{
-			Float4 rhw = Float4(1.0f, 1.0f, 1.0f, 1.0f) / r.ow[pos];
+			Float4 rhw = Float4(1.0f) / r.o[pos].w;
 
-			Float4 W = *Pointer<Float4>(r.data + OFFSET(DrawData,WWWWx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
-			Float4 H = *Pointer<Float4>(r.data + OFFSET(DrawData,HHHHx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
-			Float4 L = *Pointer<Float4>(r.data + OFFSET(DrawData,LLLLx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
-			Float4 T = *Pointer<Float4>(r.data + OFFSET(DrawData,TTTTx16)) * Float4(1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f, 1.0f / 16.0f);
+			Float4 W = *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16)) * Float4(1.0f / 16.0f);
+			Float4 H = *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16)) * Float4(1.0f / 16.0f);
+			Float4 L = *Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) * Float4(1.0f / 16.0f);
+			Float4 T = *Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) * Float4(1.0f / 16.0f);
 
-			r.ox[pos] = (r.ox[pos] - L) / W * rhw;
-			r.oy[pos] = (r.oy[pos] - T) / H * rhw;
-			r.oz[pos] = r.oz[pos] * rhw;
-			r.ow[pos] = rhw;
+			r.o[pos].x = (r.o[pos].x - L) / W * rhw;
+			r.o[pos].y = (r.o[pos].y - T) / H * rhw;
+			r.o[pos].z = r.o[pos].z * rhw;
+			r.o[pos].w = rhw;
 		}
 
 		if(state.superSampling)
 		{
-			r.ox[pos] = r.ox[pos] + *Pointer<Float4>(r.data + OFFSET(DrawData,XXXX)) * r.ow[pos];
-			r.oy[pos] = r.oy[pos] + *Pointer<Float4>(r.data + OFFSET(DrawData,YYYY)) * r.ow[pos];
+			r.o[pos].x = r.o[pos].x + *Pointer<Float4>(r.data + OFFSET(DrawData,XXXX)) * r.o[pos].w;
+			r.o[pos].y = r.o[pos].y + *Pointer<Float4>(r.data + OFFSET(DrawData,YYYY)) * r.o[pos].w;
 		}
 
-		Float4 clipX = r.ox[pos];
-		Float4 clipY = r.oy[pos];
+		Float4 clipX = r.o[pos].x;
+		Float4 clipY = r.o[pos].y;
 
 		if(state.multiSampling)   // Clip at pixel edges instead of pixel centers
 		{
-			clipX += *Pointer<Float4>(r.data + OFFSET(DrawData,offX)) * r.ow[pos];
-			clipY += *Pointer<Float4>(r.data + OFFSET(DrawData,offY)) * r.ow[pos];
+			clipX += *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelX)) * r.o[pos].w;
+			clipY += *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelY)) * r.o[pos].w;
 		}
 
-		Int4 maxX = CmpLT(r.ow[pos], clipX);
-		Int4 maxY = CmpLT(r.ow[pos], clipY);
-		Int4 maxZ = CmpLT(r.ow[pos], r.oz[pos]);
+		Int4 maxX = CmpLT(r.o[pos].w, clipX);
+		Int4 maxY = CmpLT(r.o[pos].w, clipY);
+		Int4 maxZ = CmpLT(r.o[pos].w, r.o[pos].z);
 
-		Int4 minX = CmpNLE(-r.ow[pos], clipX);
-		Int4 minY = CmpNLE(-r.ow[pos], clipY);
-		Int4 minZ = CmpNLE(Float4(0.0f, 0.0f, 0.0f, 0.0f), r.oz[pos]);
+		Int4 minX = CmpNLE(-r.o[pos].w, clipX);
+		Int4 minY = CmpNLE(-r.o[pos].w, clipY);
+		Int4 minZ = CmpNLE(Float4(0.0f), r.o[pos].z);
 
 		Int flags;
 
@@ -159,9 +162,9 @@
 		flags = SignMask(minZ);
 		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,minZ) + flags * 4);
 
-		Int4 finiteX = CmpLE(Abs(r.ox[pos]), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
-		Int4 finiteY = CmpLE(Abs(r.oy[pos]), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
-		Int4 finiteZ = CmpLE(Abs(r.oz[pos]), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
+		Int4 finiteX = CmpLE(Abs(r.o[pos].x), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
+		Int4 finiteY = CmpLE(Abs(r.o[pos].y), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
+		Int4 finiteZ = CmpLE(Abs(r.o[pos].z), *Pointer<Float4>(r.constants + OFFSET(Constants,maxPos)));
 
 		flags = SignMask(finiteX & finiteY & finiteZ);
 		r.clipFlags |= *Pointer<Int>(r.constants + OFFSET(Constants,fini) + flags * 4);
@@ -172,11 +175,11 @@
 		}
 	}
 
-	Color4f VertexRoutine::readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
+	Vector4f VertexRoutine::readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
 	{
 		const bool texldl = state.shaderContainsTexldl;
 
-		Color4f v;
+		Vector4f v;
 
 		Pointer<Byte> source0 = buffer + index * stride;
 		Pointer<Byte> source1 = source0 + (!texldl ? stride : 0);
@@ -341,8 +344,8 @@
 
 				transpose4x3(v.x, v.y, v.z, v.w);
 
-				v.y *= Float4(1.0f / 0x00000400, 1.0f / 0x00000400, 1.0f / 0x00000400, 1.0f / 0x00000400);
-				v.z *= Float4(1.0f / 0x00100000, 1.0f / 0x00100000, 1.0f / 0x00100000, 1.0f / 0x00100000);
+				v.y *= Float4(1.0f / 0x00000400);
+				v.z *= Float4(1.0f / 0x00100000);
 			}
 			break;
 		case STREAMTYPE_DEC3N:
@@ -390,9 +393,9 @@
 
 				transpose4x3(v.x, v.y, v.z, v.w);
 
-				v.x *= Float4(1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f);
-				v.y *= Float4(1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f);
-				v.z *= Float4(1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f, 1.0f / 0x00400000 / 511.0f);
+				v.x *= Float4(1.0f / 0x00400000 / 511.0f);
+				v.y *= Float4(1.0f / 0x00400000 / 511.0f);
+				v.z *= Float4(1.0f / 0x00400000 / 511.0f);
 			}
 			break;
 		case STREAMTYPE_FIXED:
@@ -472,10 +475,10 @@
 			ASSERT(false);
 		}
 
-		if(stream.count < 1) v.x = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-		if(stream.count < 2) v.y = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-		if(stream.count < 3) v.z = Float4(0.0f, 0.0f, 0.0f, 0.0f);
-		if(stream.count < 4) v.w = Float4(1.0f, 1.0f, 1.0f, 1.0f);
+		if(stream.count < 1) v.x = Float4(0.0f);
+		if(stream.count < 2) v.y = Float4(0.0f);
+		if(stream.count < 3) v.z = Float4(0.0f);
+		if(stream.count < 4) v.w = Float4(1.0f);
 
 		return v;
 	}
@@ -484,55 +487,53 @@
 	{
 		int pos = state.positionRegister;
 
-		if(state.postTransform && !state.preTransformed)
+		if(halfIntegerCoordinates)
 		{
-			Float4 posScale = *Pointer<Float4>(r.data + OFFSET(DrawData,posScale));   // FIXME: Unpack
+			r.o[pos].x = r.o[pos].x - *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelX)) * r.o[pos].w;
+			r.o[pos].y = r.o[pos].y - *Pointer<Float4>(r.data + OFFSET(DrawData,halfPixelY)) * r.o[pos].w;
+		}
 
-			r.ox[pos] = r.ox[pos] * posScale.x;
-			r.oy[pos] = r.oy[pos] * posScale.y;
-
-			Float4 posOffset = *Pointer<Float4>(r.data + OFFSET(DrawData,posOffset));   // FIXME: Unpack
-
-			r.ox[pos] = r.ox[pos] + r.ow[pos] * posOffset.x;
-			r.oy[pos] = r.oy[pos] + r.ow[pos] * posOffset.y;
+		if(symmetricNormalizedDepth)
+		{
+			r.o[pos].z = (r.o[pos].z + r.o[pos].w) * Float4(0.5f);
 		}
 	}
 
 	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine, Registers &r)
 	{
-		Color4f v;
+		Vector4f v;
 
 		for(int i = 0; i < 12; i++)
 		{
 			if(state.output[i].write)
 			{
-				v.x = r.ox[i];
-				v.y = r.oy[i];
-				v.z = r.oz[i];
-				v.w = r.ow[i];
+				v.x = r.o[i].x;
+				v.y = r.o[i].y;
+				v.z = r.o[i].z;
+				v.w = r.o[i].w;
 
 				if(state.output[i].xClamp)
 				{
-					v.x = Max(v.x, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					v.x = Min(v.x, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+					v.x = Max(v.x, Float4(0.0f));
+					v.x = Min(v.x, Float4(1.0f));
 				}
 
 				if(state.output[i].yClamp)
 				{
-					v.y = Max(v.y, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					v.y = Min(v.y, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+					v.y = Max(v.y, Float4(0.0f));
+					v.y = Min(v.y, Float4(1.0f));
 				}
 
 				if(state.output[i].zClamp)
 				{
-					v.z = Max(v.z, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					v.z = Min(v.z, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+					v.z = Max(v.z, Float4(0.0f));
+					v.z = Min(v.z, Float4(1.0f));
 				}
 
 				if(state.output[i].wClamp)
 				{
-					v.w = Max(v.w, Float4(0.0f, 0.0f, 0.0f, 0.0f));
-					v.w = Min(v.w, Float4(1.0f, 1.0f, 1.0f, 1.0f));
+					v.w = Max(v.w, Float4(0.0f));
+					v.w = Min(v.w, Float4(1.0f));
 				}
 
 				if(state.output[i].write == 0x01)
@@ -568,16 +569,16 @@
 
 		int pos = state.positionRegister;
 
-		v.x = r.ox[pos];
-		v.y = r.oy[pos];
-		v.z = r.oz[pos];
-		v.w = r.ow[pos];
+		v.x = r.o[pos].x;
+		v.y = r.o[pos].y;
+		v.z = r.o[pos].z;
+		v.w = r.o[pos].w;
 
-		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0, 0, 0, 0))) & As<Int4>(Float4(1, 1, 1, 1))));
+		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
 		Float4 rhw = Float4(1.0f) / w;
 
-		v.x = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,LLLLx16)) + v.x * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,WWWWx16))));
-		v.y = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,TTTTx16)) + v.y * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,HHHHx16))));
+		v.x = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Wx16))));
+		v.y = As<Float4>(RoundInt(*Pointer<Float4>(r.data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(r.data + OFFSET(DrawData,Hx16))));
 		v.z = v.z * rhw;
 		v.w = rhw;
 
diff --git a/src/Shader/VertexRoutine.hpp b/src/Shader/VertexRoutine.hpp
index 9c21662..72323f2 100644
--- a/src/Shader/VertexRoutine.hpp
+++ b/src/Shader/VertexRoutine.hpp
@@ -1,88 +1,105 @@
-// SwiftShader Software Renderer
-//
-// Copyright(c) 2005-2011 TransGaming Inc.
-//
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
-//
-
-#ifndef sw_VertexRoutine_hpp
-#define sw_VertexRoutine_hpp
-
-#include "Renderer/Color.hpp"
-#include "Renderer/VertexProcessor.hpp"
-#include "Reactor/Reactor.hpp"
-
-namespace sw
-{
-	class VertexRoutine
-	{
-	protected:
-		struct Registers
-		{
-			Registers() : callStack(4), aL(4), increment(4), iteration(4), enableStack(1 + 24), ox(12), oy(12), oz(12), ow(12)
-			{
-				loopDepth = -1;
-				enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-				enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
-			}
-
-			Pointer<Byte> data;
-			Pointer<Byte> constants;
-
-			Array<Float4> ox;
-			Array<Float4> oy;
-			Array<Float4> oz;
-			Array<Float4> ow;
-
-			Int clipFlags;
-
-			Color4f v[16];
-			Color4f r[32];
-			Color4f a0;
-			Array<Int> aL;
-			Color4f p0;
-
-			Array<Int> increment;
-			Array<Int> iteration;
-
-			Int loopDepth;
-			Int stackIndex;   // FIXME: Inc/decrement callStack
-			Array<UInt> callStack;
-
-			Int enableIndex;
-			Array<Int4> enableStack;
-			Int4 enableBreak;
-		};
-
-	public:
-		VertexRoutine(const VertexProcessor::State &state);
-
-		virtual ~VertexRoutine();
-
-		void generate();
-		Routine *getRoutine();
-
-	protected:
-		const VertexProcessor::State &state;
-
-	private:		
-		virtual void pipeline(Registers &r) = 0;
-
-		typedef VertexProcessor::State::Input Stream;
-		
-		Color4f readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index);
-		void readInput(Registers &r, UInt &index);
-		void computeClipFlags(Registers &r);
-		void postTransform(Registers &r);
-		void writeCache(Pointer<Byte> &cacheLine, Registers &r);
-		void writeVertex(Pointer<Byte> &vertex, Pointer<Byte> &cacheLine);
-
-		Routine *routine;
-	};
-}
-
-#endif   // sw_VertexRoutine_hpp
+// SwiftShader Software Renderer

+//

+// Copyright(c) 2005-2012 TransGaming Inc.

+//

+// All rights reserved. No part of this software may be copied, distributed, transmitted,

+// transcribed, stored in a retrieval system, translated into any human or computer

+// language by any means, or disclosed to third parties without the explicit written

+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express

+// or implied, including but not limited to any patent rights, are granted to you.

+//

+

+#ifndef sw_VertexRoutine_hpp

+#define sw_VertexRoutine_hpp

+

+#include "Renderer/Color.hpp"

+#include "Renderer/VertexProcessor.hpp"

+#include "ShaderCore.hpp"

+#include "VertexShader.hpp"

+

+namespace sw

+{

+	class VertexRoutine   // Base class that generates a vertex processing routine; subclasses supply pipeline()

+	{

+	protected:

+		struct Registers   // Register and control-flow state handed to pipeline() while the routine is generated

+		{

+			Registers(const VertexShader *shader) :   // shader may be null; every use below checks it first

+				v(shader && shader->dynamicallyIndexedInput),   // Initializers listed in member declaration order (v, r, o)

+				r(shader && shader->dynamicallyIndexedTemporaries),

+				o(shader && shader->dynamicallyIndexedOutput)

+			{

+				loopDepth = -1;

+				enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);   // Bottom of the enable stack: all bits set

+				// Each mask below is only initialized when the shader contains the matching instruction

+				if(shader && shader->containsBreakInstruction())

+				{

+					enableBreak = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

+				}

+

+				if(shader && shader->containsContinueInstruction())

+				{

+					enableContinue = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

+				}

+

+				if(shader && shader->containsLeaveInstruction())

+				{

+					enableLeave = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

+				}

+			}

+

+			Pointer<Byte> data;

+			Pointer<Byte> constants;

+

+			Int clipFlags;

+

+			RegisterArray<16> v;   // Input registers

+			RegisterArray<4096> r;   // Temporary registers

+			RegisterArray<12> o;   // Output registers

+			Vector4f a0;

+			Array<Int, 4> aL;

+			Vector4f p0;   // Predicate register

+

+			Array<Int, 4> increment;

+			Array<Int, 4> iteration;

+

+			Int loopDepth;

+			Int stackIndex;   // FIXME: Inc/decrement callStack

+			Array<UInt, 4> callStack;

+

+			Int enableIndex;   // Index of the current top of enableStack

+			Array<Int4, 1 + 24> enableStack;

+			Int4 enableBreak;

+			Int4 enableContinue;

+			Int4 enableLeave;

+		};

+

+	public:

+		VertexRoutine(const VertexProcessor::State &state, const VertexShader *shader);

+

+		virtual ~VertexRoutine();

+

+		void generate();

+		Routine *getRoutine();

+

+	protected:

+		const VertexProcessor::State &state;

+		const VertexShader *const shader;   // May be null (Registers checks before dereferencing)

+

+	private:		

+		virtual void pipeline(Registers &r) = 0;

+

+		typedef VertexProcessor::State::Input Stream;

+		

+		Vector4f readStream(Registers &r, Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index);

+		void readInput(Registers &r, UInt &index);

+		void computeClipFlags(Registers &r);

+		void postTransform(Registers &r);

+		void writeCache(Pointer<Byte> &cacheLine, Registers &r);

+		void writeVertex(Pointer<Byte> &vertex, Pointer<Byte> &cacheLine);

+

+		Routine *routine;

+	};

+}

+

+#endif   // sw_VertexRoutine_hpp

diff --git a/src/Shader/VertexShader.cpp b/src/Shader/VertexShader.cpp
index c958820..d5b1171 100644
--- a/src/Shader/VertexShader.cpp
+++ b/src/Shader/VertexShader.cpp
@@ -1,6 +1,6 @@
 // SwiftShader Software Renderer
 //
-// Copyright(c) 2005-2011 TransGaming Inc.
+// Copyright(c) 2005-2012 TransGaming Inc.
 //
 // All rights reserved. No part of this software may be copied, distributed, transmitted,
 // transcribed, stored in a retrieval system, translated into any human or computer
@@ -16,50 +16,53 @@
 
 namespace sw
 {
-	VertexShader::VertexShader(const unsigned long *token) : Shader(token)
+	VertexShader::VertexShader(const VertexShader *vs) : Shader()   // Creates an empty 3.0 shader, or a copy of 'vs' when it is non-null
+	{
+		version = 0x0300;
+		pointSizeRegister = -1;   // No vertex point size
+		positionRegister = Pos;
+
+		for(int slot = 0; slot < 16; slot++)
+		{
+			input[slot] = Semantic(-1, -1);
+		}
+
+		if(!vs) return;   // Nothing to copy from
+
+		// Duplicate the source shader's instructions, then its declarations
+		for(int n = 0; n < vs->getLength(); n++)
+		{
+			append(new Instruction(*vs->getInstruction(n)));
+		}
+
+		pointSizeRegister = vs->pointSizeRegister;
+		positionRegister = vs->positionRegister;
+		usedSamplers = vs->usedSamplers;
+		memcpy(input, vs->input, sizeof(input));
+		memcpy(output, vs->output, sizeof(output));
+
+		analyze();
+	}
+
+	VertexShader::VertexShader(const unsigned long *token) : Shader()   // Builds a vertex shader from a token stream
 	{
 		parse(token);
+		// Reset the position/point size registers and input semantics before running analyze()
+		positionRegister = Pos;
+		pointSizeRegister = -1;   // No vertex point size
+
+		for(int i = 0; i < 16; i++)
+		{
+			input[i] = Semantic(-1, -1);
+		}
+
+		analyze();
 	}
 
 	VertexShader::~VertexShader()
 	{
 	}
 
-	void VertexShader::parse(const unsigned long *token)
-	{
-		minorVersion = (unsigned char)(token[0] & 0x000000FF);
-		majorVersion = (unsigned char)((token[0] & 0x0000FF00) >> 8);
-		shaderType = (ShaderType)((token[0] & 0xFFFF0000) >> 16);
-
-		length = validate(token);
-		ASSERT(length != 0);
-
-		instruction = new Shader::Instruction*[length];
-
-		for(int i = 0; i < length; i++)
-		{
-			while((*token & 0x0000FFFF) == 0x0000FFFE)   // Comment token
-			{
-				int length = (*token & 0x7FFF0000) >> 16;
-
-				token += length + 1;
-			}
-
-			int tokenCount = size(*token);
-
-			instruction[i] = new Instruction(token, tokenCount, majorVersion);
-
-			token += 1 + tokenCount;
-		}
-
-		analyzeInput();
-		analyzeOutput();
-		analyzeDirtyConstants();
-		analyzeTexldl();
-		analyzeDynamicBranching();
-		analyzeSamplers();
-	}
-
 	int VertexShader::validate(const unsigned long *const token)
 	{
 		if(!token)
@@ -89,36 +92,36 @@
 			}
 			else
 			{
-				ShaderOpcode opcode = (ShaderOpcode)(token[i] & 0x0000FFFF);
+				Shader::Opcode opcode = (Shader::Opcode)(token[i] & 0x0000FFFF);
 
 				switch(opcode)
 				{
-				case ShaderOperation::OPCODE_TEXCOORD:
-				case ShaderOperation::OPCODE_TEXKILL:
-				case ShaderOperation::OPCODE_TEX:
-				case ShaderOperation::OPCODE_TEXBEM:
-				case ShaderOperation::OPCODE_TEXBEML:
-				case ShaderOperation::OPCODE_TEXREG2AR:
-				case ShaderOperation::OPCODE_TEXREG2GB:
-				case ShaderOperation::OPCODE_TEXM3X2PAD:
-				case ShaderOperation::OPCODE_TEXM3X2TEX:
-				case ShaderOperation::OPCODE_TEXM3X3PAD:
-				case ShaderOperation::OPCODE_TEXM3X3TEX:
-				case ShaderOperation::OPCODE_RESERVED0:
-				case ShaderOperation::OPCODE_TEXM3X3SPEC:
-				case ShaderOperation::OPCODE_TEXM3X3VSPEC:
-				case ShaderOperation::OPCODE_TEXREG2RGB:
-				case ShaderOperation::OPCODE_TEXDP3TEX:
-				case ShaderOperation::OPCODE_TEXM3X2DEPTH:
-				case ShaderOperation::OPCODE_TEXDP3:
-				case ShaderOperation::OPCODE_TEXM3X3:
-				case ShaderOperation::OPCODE_TEXDEPTH:
-				case ShaderOperation::OPCODE_CMP:
-				case ShaderOperation::OPCODE_BEM:
-				case ShaderOperation::OPCODE_DP2ADD:
-				case ShaderOperation::OPCODE_DSX:
-				case ShaderOperation::OPCODE_DSY:
-				case ShaderOperation::OPCODE_TEXLDD:
+				case Shader::OPCODE_TEXCOORD:
+				case Shader::OPCODE_TEXKILL:
+				case Shader::OPCODE_TEX:
+				case Shader::OPCODE_TEXBEM:
+				case Shader::OPCODE_TEXBEML:
+				case Shader::OPCODE_TEXREG2AR:
+				case Shader::OPCODE_TEXREG2GB:
+				case Shader::OPCODE_TEXM3X2PAD:
+				case Shader::OPCODE_TEXM3X2TEX:
+				case Shader::OPCODE_TEXM3X3PAD:
+				case Shader::OPCODE_TEXM3X3TEX:
+				case Shader::OPCODE_RESERVED0:
+				case Shader::OPCODE_TEXM3X3SPEC:
+				case Shader::OPCODE_TEXM3X3VSPEC:
+				case Shader::OPCODE_TEXREG2RGB:
+				case Shader::OPCODE_TEXDP3TEX:
+				case Shader::OPCODE_TEXM3X2DEPTH:
+				case Shader::OPCODE_TEXDP3:
+				case Shader::OPCODE_TEXM3X3:
+				case Shader::OPCODE_TEXDEPTH:
+				case Shader::OPCODE_CMP0:
+				case Shader::OPCODE_BEM:
+				case Shader::OPCODE_DP2ADD:
+				case Shader::OPCODE_DFDX:
+				case Shader::OPCODE_DFDY:
+				case Shader::OPCODE_TEXLDD:
 					return 0;   // Unsupported operation
 				default:
 					instructionCount++;
@@ -137,81 +140,85 @@
 		return texldl;
 	}
 
+	void VertexShader::analyze()
+	{
+		analyzeInput();
+		analyzeOutput();
+		analyzeDirtyConstants();
+		analyzeTexldl();
+		analyzeDynamicBranching();
+		analyzeSamplers();
+		analyzeCallSites();
+		analyzeDynamicIndexing();
+	}
+
 	void VertexShader::analyzeInput()
 	{
-		for(int i = 0; i < 16; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
-			input[i] = Semantic(-1, -1);
-		}
-
-		for(int i = 0; i < length; i++)
-		{
-			if(instruction[i]->getOpcode() == ShaderOperation::OPCODE_DCL &&
-			   instruction[i]->getDestinationParameter().type == ShaderParameter::PARAMETER_INPUT)
+			if(instruction[i]->opcode == Shader::OPCODE_DCL &&
+			   instruction[i]->dst.type == Shader::PARAMETER_INPUT)
 			{
-				int index = instruction[i]->getDestinationParameter().index;
+				int index = instruction[i]->dst.index;
 
-				input[index] = Semantic(instruction[i]->getUsage(), instruction[i]->getUsageIndex());
+				input[index] = Semantic(instruction[i]->usage, instruction[i]->usageIndex);
 			}
 		}
 	}
 
 	void VertexShader::analyzeOutput()
 	{
-		positionRegister = Pos;
-		pointSizeRegister = -1;   // No vertex point size
-
 		if(version < 0x0300)
 		{
-			output[Pos][0] = Semantic(ShaderOperation::USAGE_POSITION, 0);
-			output[Pos][1] = Semantic(ShaderOperation::USAGE_POSITION, 0);
-			output[Pos][2] = Semantic(ShaderOperation::USAGE_POSITION, 0);
-			output[Pos][3] = Semantic(ShaderOperation::USAGE_POSITION, 0);
+			output[Pos][0] = Semantic(Shader::USAGE_POSITION, 0);
+			output[Pos][1] = Semantic(Shader::USAGE_POSITION, 0);
+			output[Pos][2] = Semantic(Shader::USAGE_POSITION, 0);
+			output[Pos][3] = Semantic(Shader::USAGE_POSITION, 0);
 
-			for(int i = 0; i < length; i++)
+			for(unsigned int i = 0; i < instruction.size(); i++)
 			{
-				const Instruction::DestinationParameter &dst = instruction[i]->getDestinationParameter();
+				const DestinationParameter &dst = instruction[i]->dst;
 
 				switch(dst.type)
 				{
-				case ShaderParameter::PARAMETER_RASTOUT:
+				case Shader::PARAMETER_RASTOUT:
 					switch(dst.index)
 					{
 					case 0:
 						// Position already assumed written
 						break;
 					case 1:
-						output[Fog][0] = Semantic(ShaderOperation::USAGE_FOG, 0);
+						output[Fog][0] = Semantic(Shader::USAGE_FOG, 0);
 						break;
 					case 2:
-						output[Pts][1] = Semantic(ShaderOperation::USAGE_PSIZE, 0);
+						output[Pts][1] = Semantic(Shader::USAGE_PSIZE, 0);
 						pointSizeRegister = Pts;
 						break;
 					default: ASSERT(false);
 					}
 					break;
-				case ShaderParameter::PARAMETER_ATTROUT:
+				case Shader::PARAMETER_ATTROUT:
 					if(dst.index == 0)
 					{
-						if(dst.x) output[D0][0] = Semantic(ShaderOperation::USAGE_COLOR, 0);
-						if(dst.y) output[D0][1] = Semantic(ShaderOperation::USAGE_COLOR, 0);
-						if(dst.z) output[D0][2] = Semantic(ShaderOperation::USAGE_COLOR, 0);
-						if(dst.w) output[D0][3] = Semantic(ShaderOperation::USAGE_COLOR, 0);
+						if(dst.x) output[D0][0] = Semantic(Shader::USAGE_COLOR, 0);
+						if(dst.y) output[D0][1] = Semantic(Shader::USAGE_COLOR, 0);
+						if(dst.z) output[D0][2] = Semantic(Shader::USAGE_COLOR, 0);
+						if(dst.w) output[D0][3] = Semantic(Shader::USAGE_COLOR, 0);
 					}
 					else if(dst.index == 1)
 					{
-						if(dst.x) output[D1][0] = Semantic(ShaderOperation::USAGE_COLOR, 1);
-						if(dst.y) output[D1][1] = Semantic(ShaderOperation::USAGE_COLOR, 1);
-						if(dst.z) output[D1][2] = Semantic(ShaderOperation::USAGE_COLOR, 1);
-						if(dst.w) output[D1][3] = Semantic(ShaderOperation::USAGE_COLOR, 1);
+						if(dst.x) output[D1][0] = Semantic(Shader::USAGE_COLOR, 1);
+						if(dst.y) output[D1][1] = Semantic(Shader::USAGE_COLOR, 1);
+						if(dst.z) output[D1][2] = Semantic(Shader::USAGE_COLOR, 1);
+						if(dst.w) output[D1][3] = Semantic(Shader::USAGE_COLOR, 1);
 					}
 					else ASSERT(false);
 					break;
-				case ShaderParameter::PARAMETER_TEXCRDOUT:
-					if(dst.x) output[T0 + dst.index][0] = Semantic(ShaderOperation::USAGE_TEXCOORD, dst.index);
-					if(dst.y) output[T0 + dst.index][1] = Semantic(ShaderOperation::USAGE_TEXCOORD, dst.index);
-					if(dst.z) output[T0 + dst.index][2] = Semantic(ShaderOperation::USAGE_TEXCOORD, dst.index);
-					if(dst.w) output[T0 + dst.index][3] = Semantic(ShaderOperation::USAGE_TEXCOORD, dst.index);	
+				case Shader::PARAMETER_TEXCRDOUT:
+					if(dst.x) output[T0 + dst.index][0] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+					if(dst.y) output[T0 + dst.index][1] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+					if(dst.z) output[T0 + dst.index][2] = Semantic(Shader::USAGE_TEXCOORD, dst.index);
+					if(dst.w) output[T0 + dst.index][3] = Semantic(Shader::USAGE_TEXCOORD, dst.index);	
 					break;
 				default:
 					break;
@@ -220,27 +227,27 @@
 		}
 		else   // Shader Model 3.0 input declaration
 		{
-			for(int i = 0; i < length; i++)
+			for(unsigned int i = 0; i < instruction.size(); i++)
 			{
-				if(instruction[i]->getOpcode() == ShaderOperation::OPCODE_DCL &&
-				   instruction[i]->getDestinationParameter().type == ShaderParameter::PARAMETER_OUTPUT)
+				if(instruction[i]->opcode == Shader::OPCODE_DCL &&
+				   instruction[i]->dst.type == Shader::PARAMETER_OUTPUT)
 				{
-					unsigned char usage = instruction[i]->getUsage();
-					unsigned char usageIndex = instruction[i]->getUsageIndex();
+					unsigned char usage = instruction[i]->usage;
+					unsigned char usageIndex = instruction[i]->usageIndex;
 
-					const Instruction::DestinationParameter &dst = instruction[i]->getDestinationParameter();
+					const DestinationParameter &dst = instruction[i]->dst;
 
 					if(dst.x) output[dst.index][0] = Semantic(usage, usageIndex);
 					if(dst.y) output[dst.index][1] = Semantic(usage, usageIndex);
 					if(dst.z) output[dst.index][2] = Semantic(usage, usageIndex);
 					if(dst.w) output[dst.index][3] = Semantic(usage, usageIndex);
 
-					if(usage == ShaderOperation::USAGE_POSITION && usageIndex == 0)
+					if(usage == Shader::USAGE_POSITION && usageIndex == 0)
 					{
 						positionRegister = dst.index;
 					}
 
-					if(usage == ShaderOperation::USAGE_PSIZE && usageIndex == 0)
+					if(usage == Shader::USAGE_PSIZE && usageIndex == 0)
 					{
 						pointSizeRegister = dst.index;
 					}
@@ -253,9 +260,9 @@
 	{
 		texldl = false;
 
-		for(int i = 0; i < length; i++)
+		for(unsigned int i = 0; i < instruction.size(); i++)
 		{
-			if(instruction[i]->getOpcode() == Instruction::Operation::OPCODE_TEXLDL)
+			if(instruction[i]->opcode == Shader::OPCODE_TEXLDL)
 			{
 				texldl = true;
 
diff --git a/src/Shader/VertexShader.hpp b/src/Shader/VertexShader.hpp
index 1200b20..3bfc0f4 100644
--- a/src/Shader/VertexShader.hpp
+++ b/src/Shader/VertexShader.hpp
@@ -1,48 +1,47 @@
-// SwiftShader Software Renderer
-//
-// Copyright(c) 2005-2011 TransGaming Inc.
-//
-// All rights reserved. No part of this software may be copied, distributed, transmitted,
-// transcribed, stored in a retrieval system, translated into any human or computer
-// language by any means, or disclosed to third parties without the explicit written
-// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express
-// or implied, including but not limited to any patent rights, are granted to you.
-//
-
-#ifndef sw_VertexShader_hpp
-#define sw_VertexShader_hpp
-
-#include "Shader.hpp"
-
-namespace sw
-{
-	class VertexShader : public Shader
-	{
-	public:
-		VertexShader(const unsigned long *token);
-
-		virtual ~VertexShader();
-
-		static int validate(const unsigned long *const token);   // Returns number of instructions if valid
-		bool containsTexldl() const;
-		
-		int positionRegister;     // FIXME: Private
-		int pointSizeRegister;    // FIXME: Private
-
-		Semantic input[16];       // FIXME: Private
-		Semantic output[12][4];   // FIXME: Private
-
-	private:
-		void parse(const unsigned long *token);
-
-		void analyzeInput();
-		void analyzeOutput();
-		void analyzeTexldl();
-
-		bool texldl;
-	};
-
-	typedef VertexShader::Instruction VertexShaderInstruction;
-}
-
-#endif   // sw_VertexShader_hpp
+// SwiftShader Software Renderer

+//

+// Copyright(c) 2005-2012 TransGaming Inc.

+//

+// All rights reserved. No part of this software may be copied, distributed, transmitted,

+// transcribed, stored in a retrieval system, translated into any human or computer

+// language by any means, or disclosed to third parties without the explicit written

+// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express

+// or implied, including but not limited to any patent rights, are granted to you.

+//

+

+#ifndef sw_VertexShader_hpp

+#define sw_VertexShader_hpp

+

+#include "Shader.hpp"

+

+namespace sw

+{

+	class VertexShader : public Shader

+	{

+	public:

+		explicit VertexShader(const VertexShader *vs = 0);

+		explicit VertexShader(const unsigned long *token);

+

+		virtual ~VertexShader();

+

+		static int validate(const unsigned long *const token);   // Returns number of instructions if valid

+		bool containsTexldl() const;

+		

+		virtual void analyze();

+

+		int positionRegister;     // FIXME: Private

+		int pointSizeRegister;    // FIXME: Private

+

+		Semantic input[16];       // FIXME: Private

+		Semantic output[12][4];   // FIXME: Private

+

+	private:

+		void analyzeInput();

+		void analyzeOutput();

+		void analyzeTexldl();

+

+		bool texldl;

+	};

+}

+

+#endif   // sw_VertexShader_hpp