Point vertex processing optimization

Instead of packing each point in a triangle primitive, tightly pack all point vertices and write them out 3 at a time in the output primitive. This should be roughly twice as fast.

Explanation:

Currently:
Vertices: 0 0 0 1 1 1 2 2 2 ...
Processing:
1) 0 0 0 1
2) 0 (cache hit)
3) 0 (cache hit)
4) 1 (cache hit)
5) 1 (cache hit)
6) 1 (cache hit)
7) 2 2 2 3
...
-> We processed 8 vertices to get points 0 1 2 3

New way:
1) 0 1 2 3
-> We processed 4 vertices to get points 0 1 2 3
2) 4 5 6 7

Will affect these tests once vertexPipelineStoresAndAtomics is enabled:
dEQP-VK.glsl.atomic_operations.*
Note that these tests are affected because they wrongly assume vertices won't be processed more than once. These tests should still get fixed.

Bug b/140294254
Change-Id: Idb21085838317db7b7a6630a18de4d7284534429
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/36349
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>

commit: 9f2b6c451fdf886272c8caa730e7ccafc17c6afb [log] [tgz]
author: Alexis Hetu <sugoi@google.com> Fri Aug 30 11:55:02 2019 -0400
committer: Alexis Hétu <sugoi@google.com> Mon Sep 16 19:50:39 2019 +0000
tree: c39d554074965752dcedac5a4e6dd4f4deb014b0
parent: 4ba1b04bb1920367b83cf5ffd9573e315b69ef44 [diff]
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 854c935..8b0ab36 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp

@@ -53,13 +53,17 @@
 		case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
 		{
 			auto index = start;
+			auto pointBatch = &(batch[0][0]);
 			for(unsigned int i = 0; i < triangleCount; i++)
 			{
-				batch[i][0] = indices[index];
-				batch[i][1] = indices[index];
-				batch[i][2] = indices[index];
+				*pointBatch++ = indices[index++];
+			}
 
-				index += 1;
+			// Repeat the last index to allow for SIMD width overrun.
+			index--;
+			for(unsigned int i = 0; i < 3; i++)
+			{
+				*pointBatch++ = indices[index];
 			}
 			break;
 		}
@@ -496,7 +500,8 @@
 
 		auto& vertexTask = batch->vertexTask;
 		vertexTask.primitiveStart = batch->firstPrimitive;
-		vertexTask.vertexCount = batch->numPrimitives * 3;
+		// We're only using batch compaction for points, not lines
+		vertexTask.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);
 		if (vertexTask.vertexCache.drawCall != draw->id)
 		{
 			vertexTask.vertexCache.clear();
@@ -590,10 +595,14 @@
 			}
 		}
 
-		// Repeat the last index to allow for SIMD width overrun.
-		triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
-		triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
-		triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
+		// setBatchIndices() takes care of the point case, since it's different due to the compaction
+		if (topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
+		{
+			// Repeat the last index to allow for SIMD width overrun.
+			triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
+			triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
+			triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
+		}
 	}
 
 	int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)

diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index c6e5c13..72e4cf4 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp

@@ -78,6 +78,7 @@
 		State state;
 
 		state.shaderID = context->vertexShader->getSerialID();
+		state.isPoint = context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
 
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
 		{

diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index 66ba01d..247e8a3 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp

@@ -75,6 +75,7 @@
 			};
 
 			Input input[MAX_INTERFACE_COMPONENTS / 4];
+			bool isPoint : 1;
 		};
 
 		struct State : States

diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 8ada5d4..e43e96b 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp

@@ -67,9 +67,14 @@
 			}
 
 			Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
-			writeVertex(vertex, cacheEntry);
 
-			vertex += sizeof(Vertex);
+			// For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
+			for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
+			{
+				writeVertex(vertex, cacheEntry);
+				vertex += sizeof(Vertex);
+			}
+
 			batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
 			vertexCount--;
 		}
commit	9f2b6c451fdf886272c8caa730e7ccafc17c6afb	[log] [tgz]
author	Alexis Hetu <sugoi@google.com>	Fri Aug 30 11:55:02 2019 -0400
committer	Alexis Hétu <sugoi@google.com>	Mon Sep 16 19:50:39 2019 +0000
tree	c39d554074965752dcedac5a4e6dd4f4deb014b0
parent	4ba1b04bb1920367b83cf5ffd9573e315b69ef44 [diff]