Fix alignment of projected vertex coordinates

Also reorder the Vertex fields to reduce the space wasted on alignment
padding, and write them to the vertex cache in the same order for
consistency.

Bug: b/27351835
Change-Id: I06ca0c836aabd9d095893762d973c098f694ee30
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/32788
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Device/Config.hpp b/src/Device/Config.hpp
index 9661bdc..ecadc59 100644
--- a/src/Device/Config.hpp
+++ b/src/Device/Config.hpp
@@ -66,7 +66,7 @@
 		MAX_TEXTURE_LOD = MIPMAP_LEVELS - 2,   // Trilinear accesses lod+1
 		RENDERTARGETS = 8,
 		NUM_TEMPORARY_REGISTERS = 4096,
-		MAX_INTERFACE_COMPONENTS = 32 * 4,
+		MAX_INTERFACE_COMPONENTS = 32 * 4,  // Must be multiple of 4 for 16-byte alignment.
 	};
 }
 
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index ee82055..dc0c3f5 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -888,7 +888,7 @@
 
 			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
 			{
-				Polygon polygon(&v0.builtins.position, &v1.builtins.position, &v2.builtins.position);
+				Polygon polygon(&v0.position, &v1.position, &v2.position);
 
 				int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags;
 
@@ -972,8 +972,8 @@
 		Vertex &v0 = triangle.v0;
 		Vertex &v1 = triangle.v1;
 
-		const float4 &P0 = v0.builtins.position;
-		const float4 &P1 = v1.builtins.position;
+		const float4 &P0 = v0.position;
+		const float4 &P1 = v1.position;
 
 		if(P0.w <= 0 && P1.w <= 0)
 		{
@@ -1162,17 +1162,17 @@
 
 		Vertex &v = triangle.v0;
 
-		float pSize = v.builtins.pointSize;
+		float pSize = v.pointSize;
 
 		pSize = clamp(pSize, 1.0f, static_cast<float>(vk::MAX_POINT_SIZE));
 
 		float4 P[4];
 		int C[4];
 
-		P[0] = v.builtins.position;
-		P[1] = v.builtins.position;
-		P[2] = v.builtins.position;
-		P[3] = v.builtins.position;
+		P[0] = v.position;
+		P[1] = v.position;
+		P[2] = v.position;
+		P[3] = v.position;
 
 		const float X = pSize * P[0].w * data.halfPixelX[0];
 		const float Y = pSize * P[0].w * data.halfPixelY[0];
diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
index 27b8b18..7725aef 100644
--- a/src/Device/Vertex.hpp
+++ b/src/Device/Vertex.hpp
@@ -23,14 +23,12 @@
 {
 	ALIGN(16, struct Vertex
 	{
-		float v[MAX_INTERFACE_COMPONENTS];
+		float4 position;
+		float pointSize;
 
-		struct
-		{
-			float4 position;
-			float pointSize;
-		} builtins;
-		struct
+		int clipFlags;
+
+		alignas(16) struct
 		{
 			int x;
 			int y;
@@ -38,8 +36,7 @@
 			float w;
 		} projected;
 
-		int clipFlags;
-		int padding[2];
+		alignas(16) float v[MAX_INTERFACE_COMPONENTS];
 	});
 
 	static_assert((sizeof(Vertex) & 0x0000000F) == 0, "Vertex size not a multiple of 16 bytes (alignment requirement)");
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index 594fdee..fd1702e 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -86,9 +86,9 @@
 					Return(0);
 				}
 
-				Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, builtins.position.w)) ^
-							 *Pointer<Int>(v1 + OFFSET(Vertex, builtins.position.w)) ^
-							 *Pointer<Int>(v2 + OFFSET(Vertex, builtins.position.w));
+				Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, position.w)) ^
+							 *Pointer<Int>(v1 + OFFSET(Vertex, position.w)) ^
+							 *Pointer<Int>(v2 + OFFSET(Vertex, position.w));
 
 				A = IfThenElse(w0w1w2 < 0, -A, A);
 
@@ -268,9 +268,9 @@
 			// Sort by minimum y
 			if(triangle)
 			{
-				Float y0 = *Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.y));
-				Float y1 = *Pointer<Float>(v1 + OFFSET(Vertex, builtins.position.y));
-				Float y2 = *Pointer<Float>(v2 + OFFSET(Vertex, builtins.position.y));
+				Float y0 = *Pointer<Float>(v0 + OFFSET(Vertex, position.y));
+				Float y1 = *Pointer<Float>(v1 + OFFSET(Vertex, position.y));
+				Float y2 = *Pointer<Float>(v2 + OFFSET(Vertex, position.y));
 
 				Float yMin = Min(Min(y0, y1), y2);
 
@@ -281,9 +281,9 @@
 			// Sort by maximum w
 			if(triangle)
 			{
-				Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.w));
-				Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, builtins.position.w));
-				Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, builtins.position.w));
+				Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, position.w));
+				Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, position.w));
+				Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, position.w));
 
 				Float wMax = Max(Max(w0, w1), w2);
 
@@ -292,13 +292,13 @@
 			}
 
 			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)) =
-				*Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.x));
+				*Pointer<Float>(v0 + OFFSET(Vertex, position.x));
 			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)) =
-				*Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.y));
+				*Pointer<Float>(v0 + OFFSET(Vertex, position.y));
 
-			Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, builtins.position.w));
-			Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, builtins.position.w));
-			Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, builtins.position.w));
+			Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, position.w));
+			Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, position.w));
+			Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, position.w));
 
 			Float4 w012;
 
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index baabb2d..ae65feb 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -12,15 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <Device/Vertex.hpp>
 #include "VertexRoutine.hpp"
 
 #include "Constants.hpp"
+#include "SpirvShader.hpp"
 #include "Device/Vertex.hpp"
 #include "Device/Renderer.hpp"
-#include "System/Half.hpp"
 #include "Vulkan/VkDebug.hpp"
-#include "SpirvShader.hpp"
+#include "System/Half.hpp"
 
 namespace sw
 {
@@ -51,7 +50,7 @@
 
 		Do
 		{
-			UInt index = *Pointer<UInt>(batch);
+			UInt index = *batch;
 			UInt tagIndex = index & 0x0000003C;
 			UInt indexQ = index & 0xFFFFFFFC;
 
@@ -72,7 +71,7 @@
 			writeVertex(vertex, cacheLine);
 
 			vertex += sizeof(Vertex);
-			batch += sizeof(unsigned int);
+			batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
 			vertexCount--;
 		}
 		Until(vertexCount == 0)
@@ -84,20 +83,20 @@
 	{
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
 		{
-			if (spirvShader->inputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-				spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-				spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-				spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
+			if(spirvShader->inputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+			   spirvShader->inputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+			   spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+			   spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 			{
 
-				Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i/4));
-				UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(unsigned int) * (i/4));
+				Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4));
+				UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
 
-				auto value = readStream(input, stride, state.input[i/4], index);
-				routine.inputs[i] = value.x;
-				routine.inputs[i+1] = value.y;
-				routine.inputs[i+2] = value.z;
-				routine.inputs[i+3] = value.w;
+				auto value = readStream(input, stride, state.input[i / 4], index);
+				routine.inputs[i + 0] = value.x;
+				routine.inputs[i + 1] = value.y;
+				routine.inputs[i + 2] = value.z;
+				routine.inputs[i + 3] = value.w;
 			}
 		}
 	}
@@ -108,7 +107,7 @@
 		assert(it != spirvShader->outputBuiltins.end());
 		assert(it->second.SizeInComponents == 4);
 		auto &pos = routine.getVariable(it->second.Id);
-		auto posX = pos[it->second.FirstComponent];
+		auto posX = pos[it->second.FirstComponent + 0];
 		auto posY = pos[it->second.FirstComponent + 1];
 		auto posZ = pos[it->second.FirstComponent + 2];
 		auto posW = pos[it->second.FirstComponent + 3];
@@ -120,19 +119,19 @@
 		Int4 minY = CmpNLE(-posW, posY);
 		Int4 minZ = CmpNLE(Float4(0.0f), posZ);
 
-		clipFlags = *Pointer<Int>(constants + OFFSET(Constants,maxX) + SignMask(maxX) * 4);   // FIXME: Array indexing
-		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxY) + SignMask(maxY) * 4);
-		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,maxZ) + SignMask(maxZ) * 4);
-		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minX) + SignMask(minX) * 4);
-		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minY) + SignMask(minY) * 4);
-		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,minZ) + SignMask(minZ) * 4);
+		clipFlags =  Pointer<Int>(constants + OFFSET(Constants,maxX))[SignMask(maxX)];
+		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,maxY))[SignMask(maxY)];
+		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,maxZ))[SignMask(maxZ)];
+		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minX))[SignMask(minX)];
+		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minY))[SignMask(minY)];
+		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,minZ))[SignMask(minZ)];
 
 		Int4 finiteX = CmpLE(Abs(posX), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
 		Int4 finiteY = CmpLE(Abs(posY), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
 		Int4 finiteZ = CmpLE(Abs(posZ), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
 
 		Int4 finiteXYZ = finiteX & finiteY & finiteZ;
-		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + SignMask(finiteXYZ) * 4);
+		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
 	}
 
 	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
@@ -476,7 +475,7 @@
 			}
 			break;
 		default:
-			ASSERT(false);
+			UNSUPPORTED("stream.type %d", int(stream.type));
 		}
 
 		if(stream.count < 1) v.x = Float4(0.0f);
@@ -489,19 +488,70 @@
 
 	void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
 	{
-		Vector4f v;
+		auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
+		assert(it != spirvShader->outputBuiltins.end());
+		assert(it->second.SizeInComponents == 4);
+		auto &position = routine.getVariable(it->second.Id);
 
-		for (int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
+		Vector4f pos;
+		pos.x = position[it->second.FirstComponent + 0];
+		pos.y = position[it->second.FirstComponent + 1];
+		pos.z = position[it->second.FirstComponent + 2];
+		pos.w = position[it->second.FirstComponent + 3];
+
+		// Projection and viewport transform.
+		Float4 w = As<Float4>(As<Int4>(pos.w) | (As<Int4>(CmpEQ(pos.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
+		Float4 rhw = Float4(1.0f) / w;
+
+		Vector4f proj;
+		proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
+		proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
+		proj.z = pos.z * rhw;
+		proj.w = rhw;
+
+		transpose4x4(pos.x, pos.y, pos.z, pos.w);
+
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 0, 16) = pos.x;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 1, 16) = pos.y;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 2, 16) = pos.z;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 3, 16) = pos.w;
+
+		it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
+		if(it != spirvShader->outputBuiltins.end())
 		{
-			if (spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-				spirvShader->outputs[i+1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-				spirvShader->outputs[i+2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
-				spirvShader->outputs[i+3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
+			assert(it->second.SizeInComponents == 1);
+			auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
+
+			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 0) = Extract(psize, 0);
+			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 1) = Extract(psize, 1);
+			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 2) = Extract(psize, 2);
+			*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 3) = Extract(psize, 3);
+		}
+
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
+		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
+
+		transpose4x4(proj.x, proj.y, proj.z, proj.w);
+
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 0, 16) = proj.x;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 1, 16) = proj.y;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 2, 16) = proj.z;
+		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 3, 16) = proj.w;
+
+		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
+		{
+			if(spirvShader->outputs[i + 0].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+			   spirvShader->outputs[i + 1].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+			   spirvShader->outputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
+			   spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 			{
-				v.x = routine.outputs[i];
-				v.y = routine.outputs[i+1];
-				v.z = routine.outputs[i+2];
-				v.w = routine.outputs[i+3];
+				Vector4f v;
+				v.x = routine.outputs[i + 0];
+				v.y = routine.outputs[i + 1];
+				v.z = routine.outputs[i + 2];
+				v.w = routine.outputs[i + 3];
 
 				transpose4x4(v.x, v.y, v.z, v.w);
 
@@ -511,77 +561,22 @@
 				*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
 			}
 		}
-
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0)  & 0x0000000FF;
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8)  & 0x0000000FF;
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
-		*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
-
-		// Viewport transform
-		auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
-		assert(it != spirvShader->outputBuiltins.end());
-		assert(it->second.SizeInComponents == 4);
-		auto &pos = routine.getVariable(it->second.Id);
-		auto posX = pos[it->second.FirstComponent];
-		auto posY = pos[it->second.FirstComponent + 1];
-		auto posZ = pos[it->second.FirstComponent + 2];
-		auto posW = pos[it->second.FirstComponent + 3];
-
-		v.x = posX;
-		v.y = posY;
-		v.z = posZ;
-		v.w = posW;
-
-		// Write the builtin pos into the vertex; it's not going to be consumed by the FS, but may need to reproject if we have to clip.
-		Vector4f v2 = v;
-		transpose4x4(v2.x, v2.y, v2.z, v2.w);
-
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 0, 16) = v2.x;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 1, 16) = v2.y;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 2, 16) = v2.z;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,builtins.position) + sizeof(Vertex) * 3, 16) = v2.w;
-
-		Float4 w = As<Float4>(As<Int4>(v.w) | (As<Int4>(CmpEQ(v.w, Float4(0.0f))) & As<Int4>(Float4(1.0f))));
-		Float4 rhw = Float4(1.0f) / w;
-
-		v.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,X0x16)) + v.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Wx16))));
-		v.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData,Y0x16)) + v.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData,Hx16))));
-		v.z = v.z * rhw;
-		v.w = rhw;
-
-		transpose4x4(v.x, v.y, v.z, v.w);
-
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 0, 16) = v.x;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 1, 16) = v.y;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 2, 16) = v.z;
-		*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 3, 16) = v.w;
-
-		it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
-		if (it != spirvShader->outputBuiltins.end())
-		{
-			assert(it->second.SizeInComponents == 1);
-			auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,builtins.pointSize) + sizeof(Vertex) * 0) = Extract(psize, 0);
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,builtins.pointSize) + sizeof(Vertex) * 1) = Extract(psize, 1);
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,builtins.pointSize) + sizeof(Vertex) * 2) = Extract(psize, 2);
-			*Pointer<Float>(cacheLine + OFFSET(Vertex,builtins.pointSize) + sizeof(Vertex) * 3) = Extract(psize, 3);
-		}
 	}
 
-	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
+	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
 	{
+		*Pointer<Int4>(vertex + OFFSET(Vertex,position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex,position));
+		*Pointer<Int>(vertex + OFFSET(Vertex,pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,pointSize));
+
+		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,clipFlags));
+		*Pointer<Int4>(vertex + OFFSET(Vertex,projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex,projected));
+
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
 		{
 			if(spirvShader->outputs[i].Type != SpirvShader::ATTRIBTYPE_UNUSED)
 			{
-				*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cache + OFFSET(Vertex, v[i]), 4);
+				*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
 			}
 		}
-
-		*Pointer<Int4>(vertex + OFFSET(Vertex,projected)) = *Pointer<Int4>(cache + OFFSET(Vertex,projected));
-		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
-		*Pointer<Int4>(vertex + OFFSET(Vertex,builtins.position)) = *Pointer<Int4>(cache + OFFSET(Vertex,builtins.position));
-		*Pointer<Int>(vertex + OFFSET(Vertex,builtins.pointSize)) = *Pointer<Int>(cache + OFFSET(Vertex,builtins.pointSize));
-
 	}
 }
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
index 617dc0c..2e71343 100644
--- a/src/Pipeline/VertexRoutine.hpp
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -15,10 +15,10 @@
 #ifndef sw_VertexRoutine_hpp
 #define sw_VertexRoutine_hpp
 
-#include "Device/Color.hpp"
-#include "Device/VertexProcessor.hpp"
 #include "ShaderCore.hpp"
 #include "SpirvShader.hpp"
+#include "Device/Color.hpp"
+#include "Device/VertexProcessor.hpp"
 
 namespace vk
 {
@@ -27,7 +27,7 @@
 
 namespace sw
 {
-	class VertexRoutinePrototype : public Function<Void(Pointer<Byte>, Pointer<Byte>, Pointer<Byte>, Pointer<Byte>)>
+	class VertexRoutinePrototype : public Function<Void(Pointer<Byte>, Pointer<UInt>, Pointer<Byte>, Pointer<Byte>)>
 	{
 	public:
 		VertexRoutinePrototype() : vertex(Arg<0>()), batch(Arg<1>()), task(Arg<2>()), data(Arg<3>()) {}
@@ -35,7 +35,7 @@
 
 	protected:
 		Pointer<Byte> vertex;
-		Pointer<Byte> batch;
+		Pointer<UInt> batch;
 		Pointer<Byte> task;
 		Pointer<Byte> data;
 	};
@@ -70,7 +70,7 @@
 		void readInput(UInt &index);
 		void computeClipFlags();
 		void writeCache(Pointer<Byte> &cacheLine);
-		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheLine);
+		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry);
 	};
 }