Process independent vertex elements
Previously, vertices would be processed in consecutive groups of four
(for SSE/NEON). Now four indices are read from the index batch, so the
vertices in a SIMD group no longer need to be consecutive.
Reading the input was already a gather operation, but with constant
stride. The vertex cache now performs a scatter. The vertices are
written in reverse order so that the first vertex in a group is always
present in the cache.
Also use 2^32-1 as the invalid vertex cache index (it corresponds to
the primitive restart index) instead of 0x80000000, since
maxDrawIndexedIndexValue is UINT32_MAX and 0x80000000 is therefore a
valid vertex index.
Bug: b/27351835
Test: dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex
Change-Id: Ic69dbf53c67cbda50e44913ccae91aaca2b86e21
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/32609
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index dc0c3f5..3d4581b 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -824,7 +824,7 @@
task->vertexCache.drawCall = primitiveDrawCall;
}
- unsigned int batch[128][3]; // FIXME: Adjust to dynamic batch size
+ unsigned int batch[128 + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology));
if(!indices)
@@ -862,6 +862,11 @@
}
}
+ // Repeat the last index to allow for SIMD width overrun.
+ batch[triangleCount][0] = batch[triangleCount - 1][2];
+ batch[triangleCount][1] = batch[triangleCount - 1][2];
+ batch[triangleCount][2] = batch[triangleCount - 1][2];
+
task->primitiveStart = start;
task->vertexCount = triangleCount * 3;
vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp
index 82c4547..5c66309 100644
--- a/src/Device/VertexProcessor.cpp
+++ b/src/Device/VertexProcessor.cpp
@@ -25,9 +25,9 @@
{
void VertexCache::clear()
{
- for(int i = 0; i < 16; i++)
+ for(uint32_t i = 0; i < SIZE; i++)
{
- tag[i] = 0x80000000;
+ tag[i] = 0xFFFFFFFF;
}
}
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp
index 811ac32..a17e86a 100644
--- a/src/Device/VertexProcessor.hpp
+++ b/src/Device/VertexProcessor.hpp
@@ -25,12 +25,16 @@
{
struct DrawData;
- struct VertexCache // FIXME: Variable size
+ // Basic direct mapped vertex cache.
+ struct VertexCache
{
+ static constexpr uint32_t SIZE = 64; // TODO: Variable size?
+ static constexpr uint32_t TAG_MASK = SIZE - 1; // Size must be power of 2.
+
void clear();
- Vertex vertex[16][4];
- unsigned int tag[16];
+ Vertex vertex[SIZE];
+ uint32_t tag[SIZE];
int drawCall;
};
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index 647ff3a..e240e7f 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -73,17 +73,23 @@
{
}
- void VertexProgram::program(UInt &index)
+ void VertexProgram::program(Pointer<UInt> &batch)
{
auto it = spirvShader->inputBuiltins.find(spv::BuiltInVertexIndex);
if (it != spirvShader->inputBuiltins.end())
{
assert(it->second.SizeInComponents == 1);
+
+ Int4 indices;
+ indices = Insert(indices, As<Int>(batch[0]), 0);
+ indices = Insert(indices, As<Int>(batch[1]), 1);
+ indices = Insert(indices, As<Int>(batch[2]), 2);
+ indices = Insert(indices, As<Int>(batch[3]), 3);
routine.getVariable(it->second.Id)[it->second.FirstComponent] =
- As<Float4>(Int4(As<Int>(index) + *Pointer<Int>(data + OFFSET(DrawData, baseVertex))) + Int4(0, 1, 2, 3));
+ As<Float4>(indices + Int4(*Pointer<Int>(data + OFFSET(DrawData, baseVertex))));
}
- auto activeLaneMask = SIMD::Int(0xFFFFFFFF); // TODO: Control this.
+ auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
spirvShader->emit(&routine, activeLaneMask, descriptorSets);
spirvShader->emitEpilog(&routine);
diff --git a/src/Pipeline/VertexProgram.hpp b/src/Pipeline/VertexProgram.hpp
index 765575d..3c8a664 100644
--- a/src/Pipeline/VertexProgram.hpp
+++ b/src/Pipeline/VertexProgram.hpp
@@ -34,7 +34,7 @@
virtual ~VertexProgram();
private:
- void program(UInt &index) override;
+ void program(Pointer<UInt> &batch) override;
const vk::DescriptorSet::Bindings &descriptorSets;
};
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index ae65feb..8ada5d4 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -42,33 +42,32 @@
{
Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
- Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
+ Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache,tag));
UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
+ // Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
+ // On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
+ // in reverse order to guarantee that the first one doesn't get evicted and can be written out.
+
Do
{
UInt index = *batch;
- UInt tagIndex = index & 0x0000003C;
- UInt indexQ = index & 0xFFFFFFFC;
+ UInt cacheIndex = index & VertexCache::TAG_MASK;
- If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
+ If(tagCache[cacheIndex] != index)
{
- *Pointer<UInt>(tagCache + tagIndex) = indexQ;
-
- readInput(indexQ);
- program(indexQ);
+ readInput(batch);
+ program(batch);
computeClipFlags();
- Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
- writeCache(cacheLine0);
+ writeCache(vertexCache, tagCache, batch);
}
- UInt cacheIndex = index & 0x0000003F;
- Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
- writeVertex(vertex, cacheLine);
+ Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
+ writeVertex(vertex, cacheEntry);
vertex += sizeof(Vertex);
batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
@@ -79,7 +78,7 @@
Return();
}
- void VertexRoutine::readInput(UInt &index)
+ void VertexRoutine::readInput(Pointer<UInt> &batch)
{
for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
{
@@ -88,11 +87,10 @@
spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
{
-
Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4));
UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
- auto value = readStream(input, stride, state.input[i / 4], index);
+ auto value = readStream(input, stride, state.input[i / 4], batch);
routine.inputs[i + 0] = value.x;
routine.inputs[i + 1] = value.y;
routine.inputs[i + 2] = value.z;
@@ -134,14 +132,14 @@
clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
}
- Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
+ Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch)
{
Vector4f v;
- Pointer<Byte> source0 = buffer + index * stride;
- Pointer<Byte> source1 = source0 + stride;
- Pointer<Byte> source2 = source1 + stride;
- Pointer<Byte> source3 = source2 + stride;
+ Pointer<Byte> source0 = buffer + batch[0] * stride;
+ Pointer<Byte> source1 = buffer + batch[1] * stride;
+ Pointer<Byte> source2 = buffer + batch[2] * stride;
+ Pointer<Byte> source3 = buffer + batch[3] * stride;
bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;
@@ -486,8 +484,25 @@
return v;
}
- void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
+ void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
+ UInt index0 = batch[0];
+ UInt index1 = batch[1];
+ UInt index2 = batch[2];
+ UInt index3 = batch[3];
+
+ UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
+ UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
+ UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
+ UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;
+
+ // We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
+ // Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
+ tagCache[cacheIndex3] = index3;
+ tagCache[cacheIndex2] = index2;
+ tagCache[cacheIndex1] = index1;
+ tagCache[cacheIndex0] = index0;
+
auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
assert(it != spirvShader->outputBuiltins.end());
assert(it->second.SizeInComponents == 4);
@@ -511,10 +526,10 @@
transpose4x4(pos.x, pos.y, pos.z, pos.w);
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 0, 16) = pos.x;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 1, 16) = pos.y;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 2, 16) = pos.z;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 3, 16) = pos.w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,position), 16) = pos.w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,position), 16) = pos.z;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,position), 16) = pos.y;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,position), 16) = pos.x;
it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
if(it != spirvShader->outputBuiltins.end())
@@ -522,23 +537,23 @@
assert(it->second.SizeInComponents == 1);
auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
- *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 0) = Extract(psize, 0);
- *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 1) = Extract(psize, 1);
- *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 2) = Extract(psize, 2);
- *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 3) = Extract(psize, 3);
+ *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,pointSize)) = Extract(psize, 3);
+ *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,pointSize)) = Extract(psize, 2);
+ *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,pointSize)) = Extract(psize, 1);
+ *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0);
}
- *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0) & 0x0000000FF;
- *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8) & 0x0000000FF;
- *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
- *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
+ *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
+ *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
+ *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 8) & 0x0000000FF;
+ *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 0) & 0x0000000FF;
transpose4x4(proj.x, proj.y, proj.z, proj.w);
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 0, 16) = proj.x;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 1, 16) = proj.y;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 2, 16) = proj.z;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 3, 16) = proj.w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,projected), 16) = proj.w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,projected), 16) = proj.z;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,projected), 16) = proj.y;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,projected), 16) = proj.x;
for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
{
@@ -555,10 +570,10 @@
transpose4x4(v.x, v.y, v.z, v.w);
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
- *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,v[i]), 16) = v.w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,v[i]), 16) = v.z;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,v[i]), 16) = v.y;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,v[i]), 16) = v.x;
}
}
}
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
index 2e71343..dc22ecb 100644
--- a/src/Pipeline/VertexRoutine.hpp
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -62,14 +62,14 @@
SpirvShader const * const spirvShader;
private:
- virtual void program(UInt &index) = 0;
+ virtual void program(Pointer<UInt> &batch) = 0;
typedef VertexProcessor::State::Input Stream;
- Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index);
- void readInput(UInt &index);
+ Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch);
+ void readInput(Pointer<UInt> &batch);
void computeClipFlags();
- void writeCache(Pointer<Byte> &cacheLine);
+ void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch);
void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry);
};
}