Point vertex processing optimization
Instead of packing each point in a triangle primitive,
tightly pack all point vertices and write each processed
vertex out 3 times in the output primitive. This should be
roughly twice as fast. Explanation:
Currently:
Vertices: 0 0 0 1 1 1 2 2 2 ...
Processing:
1) 0 0 0 1
2) 0 (cache hit)
3) 0 (cache hit)
4) 1 (cache hit)
5) 1 (cache hit)
6) 1 (cache hit)
7) 2 2 2 3
...
-> We processed 8 vertices to get points 0 1 2 3
New way:
1) 0 1 2 3 -> We processed 4 vertices to get points 0 1 2 3
2) 4 5 6 7
Will affect these tests once vertexPipelineStoresAndAtomics
is enabled:
dEQP-VK.glsl.atomic_operations.*
Note that these tests are affected because they wrongly assume
vertices won't be processed more than once. These tests should
still get fixed.
Bug b/140294254
Change-Id: Idb21085838317db7b7a6630a18de4d7284534429
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/36349
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 854c935..8b0ab36 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -53,13 +53,17 @@
case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
{
auto index = start;
+ auto pointBatch = &(batch[0][0]);
for(unsigned int i = 0; i < triangleCount; i++)
{
- batch[i][0] = indices[index];
- batch[i][1] = indices[index];
- batch[i][2] = indices[index];
+ *pointBatch++ = indices[index++];
+ }
- index += 1;
+ // Repeat the last index to allow for SIMD width overrun.
+ index--;
+ for(unsigned int i = 0; i < 3; i++)
+ {
+ *pointBatch++ = indices[index];
}
break;
}
@@ -496,7 +500,8 @@
auto& vertexTask = batch->vertexTask;
vertexTask.primitiveStart = batch->firstPrimitive;
- vertexTask.vertexCount = batch->numPrimitives * 3;
+ // We're only using batch compaction for points, not lines
+ vertexTask.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);
if (vertexTask.vertexCache.drawCall != draw->id)
{
vertexTask.vertexCache.clear();
@@ -590,10 +595,14 @@
}
}
- // Repeat the last index to allow for SIMD width overrun.
- triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
- triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
- triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
+ // setBatchIndices() takes care of the point case, since it's different due to the compaction
+ if (topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
+ {
+ // Repeat the last index to allow for SIMD width overrun.
+ triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
+ triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
+ triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
+ }
}
int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index c6e5c13..72e4cf4 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp
@@ -78,6 +78,7 @@
State state;
state.shaderID = context->vertexShader->getSerialID();
+ state.isPoint = context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
{
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index 66ba01d..247e8a3 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp
@@ -75,6 +75,7 @@
};
Input input[MAX_INTERFACE_COMPONENTS / 4];
+ bool isPoint : 1;
};
struct State : States
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 8ada5d4..e43e96b 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -67,9 +67,14 @@
}
Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
- writeVertex(vertex, cacheEntry);
- vertex += sizeof(Vertex);
+ // For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
+ for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
+ {
+ writeVertex(vertex, cacheEntry);
+ vertex += sizeof(Vertex);
+ }
+
batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
vertexCount--;
}