Rework setup, VS->FS structures, etc for Vulkan

- Remove all remnants of the old fixed-function attributes
- Initial support for some builtins to prove the model
- Setup now driven by correct shader state
- VS->FS intermediate structure now matches the SPIR-V model: builtins are
  not in location space, and location space itself is flat scalars rather
  than vec4-oriented.

Some vertex pipe features are still not supported, because ES3 didn't
have them -- for example, proper handling of noperspective.

Change-Id: Ia8e3c72af54c4d1cbcc18482a741daa5e8e7c053
Bug: b/120799499
Reviewed-on: https://swiftshader-review.googlesource.com/c/24376
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index aa62cee..6cf20f0 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -31,6 +31,7 @@
 #include "System/Timer.hpp"
 #include "Vulkan/VkDebug.hpp"
 #include "Pipeline/SpirvShader.hpp"
+#include "Vertex.hpp"
 
 #undef max
 
@@ -1236,7 +1237,6 @@
 		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
 
 		int ms = state.multiSample;
-		int pos = state.positionRegister;
 		const DrawData *data = draw.data;
 		int visible = 0;
 
@@ -1248,7 +1248,7 @@
 
 			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
 			{
-				Polygon polygon(&v0.v[pos], &v1.v[pos], &v2.v[pos]);
+				Polygon polygon(&v0.builtins.position, &v1.builtins.position, &v2.builtins.position);
 
 				int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags | draw.clipFlags;
 
@@ -1332,10 +1332,8 @@
 		Vertex &v0 = triangle.v0;
 		Vertex &v1 = triangle.v1;
 
-		int pos = state.positionRegister;
-
-		const float4 &P0 = v0.v[pos];
-		const float4 &P1 = v1.v[pos];
+		const float4 &P0 = v0.builtins.position;
+		const float4 &P1 = v1.builtins.position;
 
 		if(P0.w <= 0 && P1.w <= 0)
 		{
@@ -1525,30 +1523,17 @@
 
 		Vertex &v = triangle.v0;
 
-		float pSize;
-
-		int pts = state.pointSizeRegister;
-
-		if(state.pointSizeRegister != Unused)
-		{
-			pSize = v.v[pts].y;
-		}
-		else
-		{
-			pSize = 1.0f;
-		}
+		float pSize = v.builtins.pointSize;
 
 		pSize = clamp(pSize, data.pointSizeMin, data.pointSizeMax);
 
 		float4 P[4];
 		int C[4];
 
-		int pos = state.positionRegister;
-
-		P[0] = v.v[pos];
-		P[1] = v.v[pos];
-		P[2] = v.v[pos];
-		P[3] = v.v[pos];
+		P[0] = v.builtins.position;
+		P[1] = v.builtins.position;
+		P[2] = v.builtins.position;
+		P[3] = v.builtins.position;
 
 		const float X = pSize * P[0].w * data.halfPixelX[0];
 		const float Y = pSize * P[0].w * data.halfPixelY[0];
@@ -1572,8 +1557,8 @@
 		triangle.v1 = triangle.v0;
 		triangle.v2 = triangle.v0;
 
-		triangle.v1.X += iround(16 * 0.5f * pSize);
-		triangle.v2.Y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
+		triangle.v1.projected.x += iround(16 * 0.5f * pSize);
+		triangle.v2.projected.y -= iround(16 * 0.5f * pSize) * (data.Hx16[0] > 0.0f ? 1 : -1);   // Both Direct3D and OpenGL expect (0, 0) in the top-left corner
 
 		Polygon polygon(P, 4);
 
diff --git a/src/Device/SetupProcessor.cpp b/src/Device/SetupProcessor.cpp
index 800d320..4e0823b 100644
--- a/src/Device/SetupProcessor.cpp
+++ b/src/Device/SetupProcessor.cpp
@@ -87,60 +87,16 @@
 		state.slopeDepthBias = context->slopeDepthBias != 0.0f;
 		state.vFace = context->pixelShader && context->pixelShader->hasBuiltinInput(spv::BuiltInFrontFacing);
 
-		state.positionRegister = Pos;
-		state.pointSizeRegister = Unused;
-
 		state.multiSample = context->getMultiSampleCount();
 		state.rasterizerDiscard = context->rasterizerDiscard;
 
-		//TODO: route properly
-		state.positionRegister = 0;//context->vertexShader->getPositionRegister();
-		state.pointSizeRegister = 1;//context->vertexShader->getPointSizeRegister();
-
-		for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+		for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
 		{
-			for(int component = 0; component < 4; component++)
-			{
-				state.gradient[interpolant][component].attribute = Unused;
-				state.gradient[interpolant][component].flat = false;
-				state.gradient[interpolant][component].wrap = false;
-			}
+			state.gradient[interpolant] = context->pixelShader->inputs[interpolant];
 		}
 
 		const bool point = context->isDrawPoint();
 
-//		for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
-//		{
-//			for(int component = 0; component < 4; component++)
-//			{
-//				const Shader::Semantic& semantic = context->pixelShader->getInput(interpolant, component);
-//
-//				if(semantic.active())
-//				{
-//					int input = interpolant;
-//					for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
-//					{
-//						if(semantic == context->vertexShader->getOutput(i, component))
-//						{
-//							input = i;
-//							break;
-//						}
-//					}
-//
-//					bool flat = point;
-//
-//					switch(semantic.usage)
-//					{
-//					case Shader::USAGE_TEXCOORD: flat = false;                  break;
-//					case Shader::USAGE_COLOR:    flat = semantic.flat || point; break;
-//					}
-//
-//					state.gradient[interpolant][component].attribute = input;
-//					state.gradient[interpolant][component].flat = flat;
-//				}
-//			}
-//		}
-
 		state.hash = state.computeHash();
 
 		return state;
diff --git a/src/Device/SetupProcessor.hpp b/src/Device/SetupProcessor.hpp
index 797c362..7001bce 100644
--- a/src/Device/SetupProcessor.hpp
+++ b/src/Device/SetupProcessor.hpp
@@ -15,6 +15,7 @@
 #ifndef sw_SetupProcessor_hpp
 #define sw_SetupProcessor_hpp
 
+#include <Pipeline/SpirvShader.hpp>
 #include "Context.hpp"
 #include "RoutineCache.hpp"
 #include "System/Types.hpp"
@@ -41,8 +42,6 @@
 			bool interpolateZ              : 1;
 			bool interpolateW              : 1;
 			bool perspective               : 1;
-			unsigned int positionRegister  : BITS(VERTEX_OUTPUT_LAST);
-			unsigned int pointSizeRegister : BITS(VERTEX_OUTPUT_LAST);
 			CullMode cullMode              : BITS(CULL_LAST);
 			bool twoSidedStencil           : 1;
 			bool slopeDepthBias            : 1;
@@ -50,14 +49,7 @@
 			unsigned int multiSample       : 3;   // 1, 2 or 4
 			bool rasterizerDiscard         : 1;
 
-			struct Gradient
-			{
-				unsigned char attribute : BITS(VERTEX_OUTPUT_LAST);
-				bool flat               : 1;
-				bool wrap               : 1;
-			};
-
-			Gradient gradient[MAX_FRAGMENT_INPUTS][4];
+			SpirvShader::InterfaceComponent gradient[MAX_INTERFACE_COMPONENTS];
 		};
 
 		struct State : States
diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
index 972bbf3..27b8b18 100644
--- a/src/Device/Vertex.hpp
+++ b/src/Device/Vertex.hpp
@@ -21,75 +21,25 @@
 
 namespace sw
 {
-	enum Out
-	{
-		// Default vertex output semantics
-		Pos = 0,
-		C0 = 1,   // Diffuse
-		C1 = 2,   // Specular
-		T0 = 3,
-		T1 = 4,
-		T2 = 5,
-		T3 = 6,
-		T4 = 7,
-		T5 = 8,
-		T6 = 9,
-		T7 = 10,
-		Fog = 11,    // x component
-		Pts = Fog,   // y component
-
-		// Variable semantics
-		V0 = 0,
-		Vn_1 = MAX_VERTEX_OUTPUTS - 1,
-
-		Unused,
-		VERTEX_OUTPUT_LAST = Unused,
-	};
-
-	struct UVWQ
-	{
-		float u;
-		float v;
-		float w;
-		float q;
-
-		float &operator[](int i)
-		{
-			return (&u)[i];
-		}
-	};
-
 	ALIGN(16, struct Vertex
 	{
-		union
+		float v[MAX_INTERFACE_COMPONENTS];
+
+		struct
 		{
-			struct   // Fixed semantics
-			{
-				// Position
-				float x;
-				float y;
-				float z;
-				float w;
-
-				float4 C[2];   // Diffuse and specular color
-
-				UVWQ T[8];           // Texture coordinates
-
-				float f;             // Fog
-				float pSize;         // Point size
-			};
-
-			float4 v[MAX_VERTEX_OUTPUTS];   // Generic components using semantic declaration
-		};
-
-		// Projected coordinates
-		int X;
-		int Y;
-		float Z;
-		float W;
+			float4 position;
+			float pointSize;
+		} builtins;
+		struct
+		{
+			int x;
+			int y;
+			float z;
+			float w;
+		} projected;
 
 		int clipFlags;
-		int padding[3];
+		int padding[2];
 	});
 
 	static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index d3fd5c3..80a4557 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Device/Vertex.hpp>
 #include "SetupRoutine.hpp"
 
 #include "Constants.hpp"
@@ -54,8 +55,6 @@
 			const int V1 = (triangle || line) ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0);
 			const int V2 = triangle ? OFFSET(Triangle,v2) : (line ? OFFSET(Triangle,v1) : OFFSET(Triangle,v0));
 
-			int pos = state.positionRegister;
-
 			Pointer<Byte> v0 = tri + V0;
 			Pointer<Byte> v1 = tri + V1;
 			Pointer<Byte> v2 = tri + V2;
@@ -63,13 +62,13 @@
 			Array<Int> X(16);
 			Array<Int> Y(16);
 
-			X[0] = *Pointer<Int>(v0 + OFFSET(Vertex,X));
-			X[1] = *Pointer<Int>(v1 + OFFSET(Vertex,X));
-			X[2] = *Pointer<Int>(v2 + OFFSET(Vertex,X));
+			X[0] = *Pointer<Int>(v0 + OFFSET(Vertex,projected.x));
+			X[1] = *Pointer<Int>(v1 + OFFSET(Vertex,projected.x));
+			X[2] = *Pointer<Int>(v2 + OFFSET(Vertex,projected.x));
 
-			Y[0] = *Pointer<Int>(v0 + OFFSET(Vertex,Y));
-			Y[1] = *Pointer<Int>(v1 + OFFSET(Vertex,Y));
-			Y[2] = *Pointer<Int>(v2 + OFFSET(Vertex,Y));
+			Y[0] = *Pointer<Int>(v0 + OFFSET(Vertex,projected.y));
+			Y[1] = *Pointer<Int>(v1 + OFFSET(Vertex,projected.y));
+			Y[2] = *Pointer<Int>(v2 + OFFSET(Vertex,projected.y));
 
 			Int d = 1;     // Winding direction
 
@@ -91,9 +90,9 @@
 					Return(false);
 				}
 
-				Int w0w1w2 = *Pointer<Int>(v0 + pos * 16 + 12) ^
-							 *Pointer<Int>(v1 + pos * 16 + 12) ^
-							 *Pointer<Int>(v2 + pos * 16 + 12);
+				Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, builtins.position.w)) ^
+							 *Pointer<Int>(v1 + OFFSET(Vertex, builtins.position.w)) ^
+							 *Pointer<Int>(v2 + OFFSET(Vertex, builtins.position.w));
 
 				A = IfThenElse(w0w1w2 < 0, -A, A);
 
@@ -279,9 +278,9 @@
 			// Sort by minimum y
 			if(triangle)
 			{
-				Float y0 = *Pointer<Float>(v0 + pos * 16 + 4);
-				Float y1 = *Pointer<Float>(v1 + pos * 16 + 4);
-				Float y2 = *Pointer<Float>(v2 + pos * 16 + 4);
+				Float y0 = *Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.y));
+				Float y1 = *Pointer<Float>(v1 + OFFSET(Vertex, builtins.position.y));
+				Float y2 = *Pointer<Float>(v2 + OFFSET(Vertex, builtins.position.y));
 
 				Float yMin = Min(Min(y0, y1), y2);
 
@@ -292,9 +291,9 @@
 			// Sort by maximum w
 			if(triangle)
 			{
-				Float w0 = *Pointer<Float>(v0 + pos * 16 + 12);
-				Float w1 = *Pointer<Float>(v1 + pos * 16 + 12);
-				Float w2 = *Pointer<Float>(v2 + pos * 16 + 12);
+				Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.w));
+				Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, builtins.position.w));
+				Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, builtins.position.w));
 
 				Float wMax = Max(Max(w0, w1), w2);
 
@@ -302,9 +301,9 @@
 				conditionalRotate2(wMax == w2, v0, v1, v2);
 			}
 
-			Float w0 = *Pointer<Float>(v0 + pos * 16 + 12);
-			Float w1 = *Pointer<Float>(v1 + pos * 16 + 12);
-			Float w2 = *Pointer<Float>(v2 + pos * 16 + 12);
+			Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.w));
+			Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, builtins.position.w));
+			Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, builtins.position.w));
 
 			Float4 w012;
 
@@ -313,15 +312,15 @@
 			w012.z = w2;
 			w012.w = 1;
 
-			Float rhw0 = *Pointer<Float>(v0 + OFFSET(Vertex,W));
+			Float rhw0 = *Pointer<Float>(v0 + OFFSET(Vertex,projected.w));
 
-			Int X0 = *Pointer<Int>(v0 + OFFSET(Vertex,X));
-			Int X1 = *Pointer<Int>(v1 + OFFSET(Vertex,X));
-			Int X2 = *Pointer<Int>(v2 + OFFSET(Vertex,X));
+			Int X0 = *Pointer<Int>(v0 + OFFSET(Vertex,projected.x));
+			Int X1 = *Pointer<Int>(v1 + OFFSET(Vertex,projected.x));
+			Int X2 = *Pointer<Int>(v2 + OFFSET(Vertex,projected.x));
 
-			Int Y0 = *Pointer<Int>(v0 + OFFSET(Vertex,Y));
-			Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex,Y));
-			Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex,Y));
+			Int Y0 = *Pointer<Int>(v0 + OFFSET(Vertex,projected.y));
+			Int Y1 = *Pointer<Int>(v1 + OFFSET(Vertex,projected.y));
+			Int Y2 = *Pointer<Int>(v2 + OFFSET(Vertex,projected.y));
 
 			if(line)
 			{
@@ -396,9 +395,9 @@
 
 			if(state.interpolateZ)
 			{
-				Float z0 = *Pointer<Float>(v0 + OFFSET(Vertex,Z));
-				Float z1 = *Pointer<Float>(v1 + OFFSET(Vertex,Z));
-				Float z2 = *Pointer<Float>(v2 + OFFSET(Vertex,Z));
+				Float z0 = *Pointer<Float>(v0 + OFFSET(Vertex,projected.z));
+				Float z1 = *Pointer<Float>(v1 + OFFSET(Vertex,projected.z));
+				Float z2 = *Pointer<Float>(v2 + OFFSET(Vertex,projected.z));
 
 				z1 -= z0;
 				z2 -= z0;
@@ -451,19 +450,16 @@
 				*Pointer<Float4>(primitive + OFFSET(Primitive,z.C), 16) = C;
 			}
 
-			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
+			for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
 			{
-				for(int component = 0; component < 4; component++)
-				{
-					int attribute = state.gradient[interpolant][component].attribute;
-					bool flat = state.gradient[interpolant][component].flat;
-					bool wrap = state.gradient[interpolant][component].wrap;
-
-					if(attribute != Unused)
-					{
-						setupGradient(primitive, tri, w012, M, v0, v1, v2, OFFSET(Vertex,v[attribute][component]), OFFSET(Primitive,V[interpolant][component]), flat, point, state.perspective, wrap, component);
-					}
-				}
+				// TODO: fix point, perspective, etc. Not convinced various edge cases are really correct here for either VK or GL.
+				if (state.gradient[interpolant].Type != SpirvShader::ATTRIBTYPE_UNUSED)
+					setupGradient(primitive, tri, w012, M, v0, v1, v2,
+							OFFSET(Vertex, v[interpolant]),
+							OFFSET(Primitive, V[interpolant]),
+							state.gradient[interpolant].Flat,
+							point,
+							state.perspective, 0);
 			}
 
 			Return(true);
@@ -472,7 +468,7 @@
 		routine = function("SetupRoutine");
 	}
 
-	void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool sprite, bool perspective, bool wrap, int component)
+	void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool sprite, bool perspective, int component)
 	{
 		Float4 i;
 
@@ -505,21 +501,6 @@
 				i.w = 0;
 			}
 
-			if(wrap)
-			{
-				Float m;
-
-				m = *Pointer<Float>(v0 + attribute);
-				m = Max(m, *Pointer<Float>(v1 + attribute));
-				m = Max(m, *Pointer<Float>(v2 + attribute));
-				m -= 0.5f;
-
-				// TODO: Vectorize
-				If(Float(i.x) < m) i.x = i.x + 1.0f;
-				If(Float(i.y) < m) i.y = i.y + 1.0f;
-				If(Float(i.z) < m) i.z = i.z + 1.0f;
-			}
-
 			if(!perspective)
 			{
 				i *= w012;
diff --git a/src/Pipeline/SetupRoutine.hpp b/src/Pipeline/SetupRoutine.hpp
index 977eb8f..b43dd7a 100644
--- a/src/Pipeline/SetupRoutine.hpp
+++ b/src/Pipeline/SetupRoutine.hpp
@@ -33,7 +33,7 @@
 		Routine *getRoutine();
 
 	private:
-		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool sprite, bool perspective, bool wrap, int component);
+		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool sprite, bool perspective, int component);
 		void edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q);
 		void conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
 		void conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index 1e0e162..7bb498f 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -32,11 +32,14 @@
 
 		enableStack[0] = Int4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
 
-		// TODO: wire up builtins
-		//if(shader->isInstanceIdDeclared())
-		//{
-		//	instanceID = *Pointer<Int>(data + OFFSET(DrawData,instanceID));
-		//}
+		auto it = spirvShader->inputBuiltins.find(spv::BuiltInInstanceIndex);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			// TODO: we could do better here; we know InstanceIndex is uniform across all lanes
+			assert(it->second.SizeInComponents == 1);
+			(*routine.lvalues[it->second.Id])[it->second.FirstComponent] =
+					As<Float4>(Int4((*Pointer<Int>(data + OFFSET(DrawData, instanceID)))));
+		}
 	}
 
 	VertexProgram::~VertexProgram()
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 636351f..e1f0b3d 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <Device/Vertex.hpp>
 #include "VertexRoutine.hpp"
 
 #include "Constants.hpp"
@@ -94,14 +95,21 @@
 
 	void VertexRoutine::computeClipFlags()
 	{
-		int pos = state.positionRegister;
+		auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
+		assert(it != spirvShader->outputBuiltins.end());
+		assert(it->second.SizeInComponents == 4);
+		auto &pos = (*routine.lvalues[it->second.Id]);
+		auto posX = pos[it->second.FirstComponent];
+		auto posY = pos[it->second.FirstComponent + 1];
+		auto posZ = pos[it->second.FirstComponent + 2];
+		auto posW = pos[it->second.FirstComponent + 3];
 
-		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
-		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
-		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
-		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
-		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
-		Int4 minZ = CmpNLE(Float4(0.0f), o[pos].z);
+		Int4 maxX = CmpLT(posW, posX);
+		Int4 maxY = CmpLT(posW, posY);
+		Int4 maxZ = CmpLT(posW, posZ);
+		Int4 minX = CmpNLE(-posW, posX);
+		Int4 minY = CmpNLE(-posW, posY);
+		Int4 minZ = CmpNLE(Float4(0.0f), posZ);
 
 		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
@@ -110,9 +118,9 @@
 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
 
-		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
-		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
-		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+		Int4 finiteX = CmpLE(Abs(posX), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+		Int4 finiteY = CmpLE(Abs(posY), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+		Int4 finiteZ = CmpLE(Abs(posZ), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
 
 		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
@@ -658,12 +666,28 @@
 		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
 
 		// Viewport transform
-		int pos = state.positionRegister;
+		auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
+		assert(it != spirvShader->outputBuiltins.end());
+		assert(it->second.SizeInComponents == 4);
+		auto &pos = (*routine.lvalues[it->second.Id]);
+		auto posX = pos[it->second.FirstComponent];
+		auto posY = pos[it->second.FirstComponent + 1];
+		auto posZ = pos[it->second.FirstComponent + 2];
+		auto posW = pos[it->second.FirstComponent + 3];
 
-		v.x = o[pos].x;
-		v.y = o[pos].y;
-		v.z = o[pos].z;
-		v.w = o[pos].w;
+		v.x = posX;
+		v.y = posY;
+		v.z = posZ;
+		v.w = posW;
+
+		// Write the builtin pos into the vertex; it's not going to be consumed by the FS, but may need to reproject if we have to clip.
+		Vector4f v2 = v;
+		transpose4x4(v2.x, v2.y, v2.z, v2.w);
+
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 0, 16) = v2.x;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 1, 16) = v2.y;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 2, 16) = v2.z;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 3, 16) = v2.w;
 
 		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
 		Float4 rhw = Float4(1.0f) / w;
@@ -675,23 +699,26 @@
 
 		transpose4x4(v.x, v.y, v.z, v.w);
 
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 0, 16) = v.x;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 1, 16) = v.y;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 2, 16) = v.z;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 0, 16) = v.x;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 1, 16) = v.y;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 2, 16) = v.z;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 3, 16) = v.w;
 	}
 
 	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
 	{
-		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
+		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
 		{
-			if(state.output[i].write)
+			if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 			{
-				*Pointer<Int4>(vertex + OFFSET(Vertex,v[i]), 16) = *Pointer<Int4>(cache + OFFSET(Vertex,v[i]), 16);
+				*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cache + OFFSET(Vertex, v[i]), 4);
 			}
 		}
 
-		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
+		*Pointer<Int4>(vertex + OFFSET(Vertex,projected)) = *Pointer<Int4>(cache + OFFSET(Vertex,projected));
 		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
+		*Pointer<Int4>(vertex + OFFSET(Vertex,builtins.position)) = *Pointer<Int4>(cache + OFFSET(Vertex,builtins.position));
+		*Pointer<Int>(vertex + OFFSET(Vertex,builtins.pointSize)) = *Pointer<Int>(cache + OFFSET(Vertex,builtins.pointSize));
+
 	}
 }