VertexRoutine code for Transform Feedback

This CL adds the code that actually copies the vertex shader
outputs into the transform feedback buffers. It also contains
a fix for symmetricNormalizedDepth: when transform feedback is
active, the depth correction must be computed after the outputs
have been copied into the transform feedback buffers.

Change-Id: I418f94a15b9425bba0905c840f8cf4828233d0fb
Reviewed-on: https://swiftshader-review.googlesource.com/5172
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Renderer/Renderer.cpp b/src/Renderer/Renderer.cpp
index 2cb19e3..8be8093 100644
--- a/src/Renderer/Renderer.cpp
+++ b/src/Renderer/Renderer.cpp
@@ -240,7 +240,7 @@
 
 			if(update || oldMultiSampleMask != context->multiSampleMask)
 			{
-				vertexState = VertexProcessor::update();
+				vertexState = VertexProcessor::update(drawType);
 				setupState = SetupProcessor::update();
 				pixelState = PixelProcessor::update();
 
@@ -1460,12 +1460,8 @@
 			return;
 		}
 
-		task->vertexStart = start * 3;
+		task->primitiveStart = start;
 		task->vertexCount = triangleCount * 3;
-		// Note: Quads aren't handled for verticesPerPrimitive, but verticesPerPrimitive is used for transform feedback,
-		//       which is an OpenGL ES 3.0 feature, and OpenGL ES 3.0 doesn't support quads as a primitive type.
-		DrawType type = static_cast<DrawType>(static_cast<unsigned int>(draw->drawType) & 0xF);
-		task->verticesPerPrimitive = 1 + (type >= DRAW_LINELIST) + (type >= DRAW_TRIANGLELIST);
 		vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
 	}
 
diff --git a/src/Renderer/VertexProcessor.cpp b/src/Renderer/VertexProcessor.cpp
index 90c954f..73f277b 100644
--- a/src/Renderer/VertexProcessor.cpp
+++ b/src/Renderer/VertexProcessor.cpp
@@ -839,7 +839,7 @@
 		routineCache = new RoutineCache<State>(clamp(cacheSize, 1, 65536), precacheVertex ? "sw-vertex" : 0);
 	}
 
-	const VertexProcessor::State VertexProcessor::update()
+	const VertexProcessor::State VertexProcessor::update(DrawType drawType)
 	{
 		if(isFixedFunction())
 		{
@@ -912,6 +912,11 @@
 		state.transformFeedbackQueryEnabled = context->transformFeedbackQueryEnabled;
 		state.transformFeedbackEnabled = context->transformFeedbackEnabled;
 
+		// Note: Quads aren't handled for verticesPerPrimitive, but verticesPerPrimitive is used for transform feedback,
+		//       which is an OpenGL ES 3.0 feature, and OpenGL ES 3.0 doesn't support quads as a primitive type.
+		DrawType type = static_cast<DrawType>(static_cast<unsigned int>(drawType) & 0xF);
+		state.verticesPerPrimitive = 1 + (type >= DRAW_LINELIST) + (type >= DRAW_TRIANGLELIST);
+
 		for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
 		{
 			state.input[i].type = context->input[i].type;
diff --git a/src/Renderer/VertexProcessor.hpp b/src/Renderer/VertexProcessor.hpp
index 9629c47..3af6690 100644
--- a/src/Renderer/VertexProcessor.hpp
+++ b/src/Renderer/VertexProcessor.hpp
@@ -35,9 +35,8 @@
 
 	struct VertexTask
 	{
-		unsigned int vertexStart;
 		unsigned int vertexCount;
-		unsigned int verticesPerPrimitive;
+		unsigned int primitiveStart;
 		VertexCache vertexCache;
 	};
 
@@ -76,6 +75,7 @@
 			bool pointScaleActive                             : 1;
 			bool transformFeedbackQueryEnabled                : 1;
 			uint64_t transformFeedbackEnabled                 : 64;
+			unsigned char verticesPerPrimitive                : 2; // 1 (points), 2 (lines) or 3 (triangles)
 
 			bool preTransformed : 1;
 			bool superSampling  : 1;
@@ -275,7 +275,7 @@
 		const Matrix &getModelTransform(int i);
 		const Matrix &getViewTransform();
 
-		const State update();
+		const State update(DrawType drawType);
 		Routine *routine(const State &state);
 
 		bool isFixedFunction();
diff --git a/src/Shader/VertexRoutine.cpp b/src/Shader/VertexRoutine.cpp
index dce0dc2..6428cdf 100644
--- a/src/Shader/VertexRoutine.cpp
+++ b/src/Shader/VertexRoutine.cpp
@@ -46,6 +46,8 @@
 		Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
 
 		UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
+		UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
+		UInt indexInPrimitive = 0;
 
 		constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
 
@@ -70,11 +72,20 @@
 
 			UInt cacheIndex = index & 0x0000003F;
 			Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
-			writeVertex(vertex, cacheLine);
+			writeVertex(vertex, cacheLine, primitiveNumber, indexInPrimitive);
 
 			vertex += sizeof(Vertex);
 			batch += sizeof(unsigned int);
 			vertexCount--;
+			if(state.transformFeedbackEnabled != 0)
+			{
+				indexInPrimitive++;
+				If(indexInPrimitive == 3)
+				{
+					primitiveNumber++;
+					indexInPrimitive = 0;
+				}
+			}
 		}
 		Until(vertexCount == 0)
 
@@ -96,13 +107,19 @@
 	{
 		int pos = state.positionRegister;
 
+		Float4 outPosZ = o[pos].z;
+		if(state.transformFeedbackEnabled && symmetricNormalizedDepth && !state.fixedFunction)
+		{
+			outPosZ = (outPosZ + o[pos].w) * Float4(0.5f);
+		}
+
 		Int4 maxX = CmpLT(o[pos].w, o[pos].x);
 		Int4 maxY = CmpLT(o[pos].w, o[pos].y);
-		Int4 maxZ = CmpLT(o[pos].w, o[pos].z);
+		Int4 maxZ = CmpLT(o[pos].w, outPosZ);
 
 		Int4 minX = CmpNLE(-o[pos].w, o[pos].x);
 		Int4 minY = CmpNLE(-o[pos].w, o[pos].y);
-		Int4 minZ = CmpNLE(Float4(0.0f), o[pos].z);
+		Int4 minZ = CmpNLE(Float4(0.0f), outPosZ);
 
 		Int flags;
 
@@ -121,7 +138,7 @@
 
 		Int4 finiteX = CmpLE(Abs(o[pos].x), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
 		Int4 finiteY = CmpLE(Abs(o[pos].y), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
-		Int4 finiteZ = CmpLE(Abs(o[pos].z), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
+		Int4 finiteZ = CmpLE(Abs(outPosZ), *Pointer<Float4>(constants + OFFSET(Constants,maxPos)));
 
 		flags = SignMask(finiteX & finiteY & finiteZ);
 		clipFlags |= *Pointer<Int>(constants + OFFSET(Constants,fini) + flags * 4);
@@ -570,7 +587,7 @@
 			o[pos].y = o[pos].y + *Pointer<Float4>(data + OFFSET(DrawData,YYYY)) * o[pos].w;
 		}
 
-		if(symmetricNormalizedDepth && !state.fixedFunction)
+		if(!state.transformFeedbackEnabled && symmetricNormalizedDepth && !state.fixedFunction)
 		{
 			o[pos].z = (o[pos].z + o[pos].w) * Float4(0.5f);
 		}
@@ -667,7 +684,7 @@
 		*Pointer<Float4>(cacheLine + OFFSET(Vertex,X) + sizeof(Vertex) * 3, 16) = v.w;
 	}
 
-	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache)
+	void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cache, const UInt &primitiveNumber, const UInt &indexInPrimitive)
 	{
 		for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
 		{
@@ -679,5 +696,45 @@
 
 		*Pointer<Int4>(vertex + OFFSET(Vertex,X)) = *Pointer<Int4>(cache + OFFSET(Vertex,X));
 		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cache + OFFSET(Vertex,clipFlags));
+
+		if(state.transformFeedbackEnabled != 0)
+		{
+			If(indexInPrimitive < state.verticesPerPrimitive)
+			{
+				UInt tOffset = primitiveNumber * state.verticesPerPrimitive + indexInPrimitive;
+				for(int i = 0; i < MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS; ++i)
+				{
+					if(state.transformFeedbackEnabled & (1ULL << i))
+					{
+						UInt reg = *Pointer<UInt>(data + OFFSET(DrawData, vs.reg[i]));
+						UInt row = *Pointer<UInt>(data + OFFSET(DrawData, vs.row[i]));
+						UInt col = *Pointer<UInt>(data + OFFSET(DrawData, vs.col[i]));
+						UInt str = *Pointer<UInt>(data + OFFSET(DrawData, vs.str[i]));
+
+						Pointer<Byte> t = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, vs.t[i])) + (tOffset * str * sizeof(float));
+						Pointer<Byte> v = vertex + OFFSET(Vertex, v) + reg * sizeof(float);
+
+						For(UInt r = 0, r < row, r++)
+						{
+							UInt rOffsetX = r * col * sizeof(float);
+							UInt rOffset4 = r * sizeof(float4);
+							For(UInt c = 0, c < col, c++)
+							{
+								UInt cOffset = c * sizeof(float);
+								*Pointer<Float>(t + rOffsetX + cOffset) = *Pointer<Float>(v + rOffset4 + cOffset);
+							}
+						}
+					}
+				}
+			}
+
+			// Make this correction after the transform feedback outputs have been written
+			if(symmetricNormalizedDepth && !state.fixedFunction && state.output[state.positionRegister].write)
+			{
+				Float z = *Pointer<Float>(vertex + OFFSET(Vertex, v[state.positionRegister]) + 2 * sizeof(float));
+				Float w = *Pointer<Float>(vertex + OFFSET(Vertex, v[state.positionRegister]) + 3 * sizeof(float));
+				*Pointer<Float>(vertex + OFFSET(Vertex, v[state.positionRegister]) + 2 * sizeof(float)) = (z + w) * Float(0.5f);
+			}
+		}
 	}
 }
diff --git a/src/Shader/VertexRoutine.hpp b/src/Shader/VertexRoutine.hpp
index 7f29ff5..eadd795 100644
--- a/src/Shader/VertexRoutine.hpp
+++ b/src/Shader/VertexRoutine.hpp
@@ -63,7 +63,7 @@
 		void computeClipFlags();
 		void postTransform();
 		void writeCache(Pointer<Byte> &cacheLine);
-		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheLine);
+		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheLine, const UInt &primitiveNumber, const UInt &indexInPrimitive);
 	};
 }