Optimize non-solid polygon rasterization

Instead of using a batch size of 1, divide the batch size by 3 so that
the primitive batch has enough room for the three lines or points that
each triangle expands into. Also use local Triangle data structures to
copy the vertices for the extra primitives, instead of writing them
into the input batch.

Bug: b/139872671
Change-Id: I1bc860d291b53fdd266b6c018ac0c47c876aaa09
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35588
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index debcc7c..51b03f1 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -217,11 +217,11 @@
 				break;
 			case VK_POLYGON_MODE_LINE:
 				setupPrimitives = &DrawCall::setupWireframeTriangles;
-				numPrimitivesPerBatch = 1;
+				numPrimitivesPerBatch /= 3;
 				break;
 			case VK_POLYGON_MODE_POINT:
 				setupPrimitives = &DrawCall::setupPointTriangles;
-				numPrimitivesPerBatch = 1;
+				numPrimitivesPerBatch /= 3;
 				break;
 			default:
 				UNSUPPORTED("polygon mode: %d", int(context->polygonMode));
@@ -631,39 +631,42 @@
 		int ms = state.multiSample;
 		int visible = 0;
 
-		const Vertex &v0 = triangles[0].v0;
-		const Vertex &v1 = triangles[0].v1;
-		const Vertex &v2 = triangles[0].v2;
-
-		float d = (v0.position.y * v1.position.x - v0.position.x * v1.position.y) * v2.position.w +
-		          (v0.position.x * v2.position.y - v0.position.y * v2.position.x) * v1.position.w +
-		          (v2.position.x * v1.position.y - v1.position.x * v2.position.y) * v0.position.w;
-
-		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? d > 0.0f : d < 0.0f;
-		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
+		for(int i = 0; i < count; i++)
 		{
-			if(frontFacing) return 0;
-		}
-		if(state.cullMode & VK_CULL_MODE_BACK_BIT)
-		{
-			if(!frontFacing) return 0;
-		}
+			const Vertex &v0 = triangles[i].v0;
+			const Vertex &v1 = triangles[i].v1;
+			const Vertex &v2 = triangles[i].v2;
 
-		// Copy attributes
-		triangles[1].v0 = v1;
-		triangles[1].v1 = v2;
-		triangles[2].v0 = v2;
-		triangles[2].v1 = v0;
+			float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
+			          (v0.x * v2.y - v0.y * v2.x) * v1.w +
+			          (v2.x * v1.y - v1.x * v2.y) * v0.w;
 
-		for(int i = 0; i < 3; i++)
-		{
-			if(setupLine(*primitives, *triangles, *drawCall))
+			bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 			{
-				primitives += ms;
-				visible++;
+				if(frontFacing) continue;
+			}
+			if(state.cullMode & VK_CULL_MODE_BACK_BIT)
+			{
+				if(!frontFacing) continue;
 			}
 
-			triangles++;
+			Triangle lines[3];
+			lines[0].v0 = v0;
+			lines[0].v1 = v1;
+			lines[1].v0 = v1;
+			lines[1].v1 = v2;
+			lines[2].v0 = v2;
+			lines[2].v1 = v0;
+
+			for(int i = 0; i < 3; i++)
+			{
+				if(setupLine(*primitives, lines[i], *drawCall))
+				{
+					primitives += ms;
+					visible++;
+				}
+			}
 		}
 
 		return visible;
@@ -676,37 +679,39 @@
 		int ms = state.multiSample;
 		int visible = 0;
 
-		const Vertex &v0 = triangles[0].v0;
-		const Vertex &v1 = triangles[0].v1;
-		const Vertex &v2 = triangles[0].v2;
-
-		float d = (v0.position.y * v1.position.x - v0.position.x * v1.position.y) * v2.position.w +
-		          (v0.position.x * v2.position.y - v0.position.y * v2.position.x) * v1.position.w +
-		          (v2.position.x * v1.position.y - v1.position.x * v2.position.y) * v0.position.w;
-
-		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? d > 0.0f : d < 0.0f;
-		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
+		for(int i = 0; i < count; i++)
 		{
-			if(frontFacing) return 0;
-		}
-		if(state.cullMode & VK_CULL_MODE_BACK_BIT)
-		{
-			if(!frontFacing) return 0;
-		}
+			const Vertex &v0 = triangles[i].v0;
+			const Vertex &v1 = triangles[i].v1;
+			const Vertex &v2 = triangles[i].v2;
 
-		// Copy attributes
-		triangles[1].v0 = v1;
-		triangles[2].v0 = v2;
+			float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
+			          (v0.x * v2.y - v0.y * v2.x) * v1.w +
+			          (v2.x * v1.y - v1.x * v2.y) * v0.w;
 
-		for(int i = 0; i < 3; i++)
-		{
-			if(setupPoint(*primitives, *triangles, *drawCall))
+			bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 			{
-				primitives += ms;
-				visible++;
+				if(frontFacing) continue;
+			}
+			if(state.cullMode & VK_CULL_MODE_BACK_BIT)
+			{
+				if(!frontFacing) continue;
 			}
 
-			triangles++;
+			Triangle points[3];
+			points[0].v0 = v0;
+			points[1].v0 = v1;
+			points[2].v0 = v2;
+
+			for(int i = 0; i < 3; i++)
+			{
+				if(setupPoint(*primitives, points[i], *drawCall))
+				{
+					primitives += ms;
+					visible++;
+				}
+			}
 		}
 
 		return visible;
diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
index 7725aef..611416b 100644
--- a/src/Device/Vertex.hpp
+++ b/src/Device/Vertex.hpp
@@ -23,7 +23,19 @@
 {
 	ALIGN(16, struct Vertex
 	{
-		float4 position;
+		union
+		{
+			struct
+			{
+				float x;
+				float y;
+				float z;
+				float w;
+			};
+
+			float4 position;
+		};
+
 		float pointSize;
 
 		int clipFlags;
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index 7740cc1..37cc5d5 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -86,9 +86,9 @@
 					Return(0);
 				}
 
-				Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, position.w)) ^
-							 *Pointer<Int>(v1 + OFFSET(Vertex, position.w)) ^
-							 *Pointer<Int>(v2 + OFFSET(Vertex, position.w));
+				Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, w)) ^
+				             *Pointer<Int>(v1 + OFFSET(Vertex, w)) ^
+				             *Pointer<Int>(v2 + OFFSET(Vertex, w));
 
 				A = IfThenElse(w0w1w2 < 0, -A, A);
 
@@ -268,9 +268,9 @@
 			// Sort by minimum y
 			if(triangle)
 			{
-				Float y0 = *Pointer<Float>(v0 + OFFSET(Vertex, position.y));
-				Float y1 = *Pointer<Float>(v1 + OFFSET(Vertex, position.y));
-				Float y2 = *Pointer<Float>(v2 + OFFSET(Vertex, position.y));
+				Float y0 = *Pointer<Float>(v0 + OFFSET(Vertex, y));
+				Float y1 = *Pointer<Float>(v1 + OFFSET(Vertex, y));
+				Float y2 = *Pointer<Float>(v2 + OFFSET(Vertex, y));
 
 				Float yMin = Min(Min(y0, y1), y2);
 
@@ -281,9 +281,9 @@
 			// Sort by maximum w
 			if(triangle)
 			{
-				Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, position.w));
-				Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, position.w));
-				Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, position.w));
+				Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, w));
+				Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, w));
+				Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, w));
 
 				Float wMax = Max(Max(w0, w1), w2);
 
@@ -292,13 +292,13 @@
 			}
 
 			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordX)) =
-				*Pointer<Float>(v0 + OFFSET(Vertex, position.x));
+				*Pointer<Float>(v0 + OFFSET(Vertex, x));
 			*Pointer<Float>(primitive + OFFSET(Primitive, pointCoordY)) =
-				*Pointer<Float>(v0 + OFFSET(Vertex, position.y));
+				*Pointer<Float>(v0 + OFFSET(Vertex, y));
 
-			Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, position.w));
-			Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, position.w));
-			Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, position.w));
+			Float w0 = *Pointer<Float>(v0 + OFFSET(Vertex, w));
+			Float w1 = *Pointer<Float>(v1 + OFFSET(Vertex, w));
+			Float w2 = *Pointer<Float>(v2 + OFFSET(Vertex, w));
 
 			Float4 w012;
 
@@ -443,11 +443,13 @@
 			for (int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
 			{
 				if (state.gradient[interpolant].Type != SpirvShader::ATTRIBTYPE_UNUSED)
+				{
 					setupGradient(primitive, tri, w012, M, v0, v1, v2,
 							OFFSET(Vertex, v[interpolant]),
 							OFFSET(Primitive, V[interpolant]),
 							state.gradient[interpolant].Flat,
 							!state.gradient[interpolant].NoPerspective, 0);
+				}
 			}
 
 			Return(1);
diff --git a/src/Vulkan/VkPipeline.cpp b/src/Vulkan/VkPipeline.cpp
index 0e131f1..c5b72de 100644
--- a/src/Vulkan/VkPipeline.cpp
+++ b/src/Vulkan/VkPipeline.cpp
@@ -397,7 +397,8 @@
 	const VkPipelineMultisampleStateCreateInfo* multisampleState = pCreateInfo->pMultisampleState;
 	if(multisampleState)
 	{
-		switch (multisampleState->rasterizationSamples) {
+		switch (multisampleState->rasterizationSamples)
+		{
 		case VK_SAMPLE_COUNT_1_BIT:
 			context.sampleCount = 1;
 			break;
@@ -409,7 +410,9 @@
 		}
 
 		if (multisampleState->pSampleMask)
+		{
 			context.sampleMask = multisampleState->pSampleMask[0];
+		}
 
 		context.alphaToCoverage = (multisampleState->alphaToCoverageEnable == VK_TRUE);