Process independent vertex elements. Previously, vertices would be processed in consecutive groups of four (for SSE/NEON). Now four indices are read from the index buffer. Reading the input was already a gather operation, but with constant stride. The vertex cache now performs a scatter. The vertices are written in reverse order so that the first vertex in a group is always present in the cache. Also use 2^32-1 as invalid vertex cache index (corresponds to the primitive restart index) instead of 0x80000000, since maxDrawIndexedIndexValue is UINT32_MAX. Bug: b/27351835 Test: dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex Change-Id: Ic69dbf53c67cbda50e44913ccae91aaca2b86e21 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/32609 Presubmit-Ready: Nicolas Capens <nicolascapens@google.com> Kokoro-Presubmit: kokoro <noreply+kokoro@google.com> Tested-by: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp index dc0c3f5..3d4581b 100644 --- a/src/Device/Renderer.cpp +++ b/src/Device/Renderer.cpp
@@ -824,7 +824,7 @@ task->vertexCache.drawCall = primitiveDrawCall; } - unsigned int batch[128][3]; // FIXME: Adjust to dynamic batch size + unsigned int batch[128 + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size. VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology)); if(!indices) @@ -862,6 +862,11 @@ } } + // Repeat the last index to allow for SIMD width overrun. + batch[triangleCount][0] = batch[triangleCount - 1][2]; + batch[triangleCount][1] = batch[triangleCount - 1][2]; + batch[triangleCount][2] = batch[triangleCount - 1][2]; + task->primitiveStart = start; task->vertexCount = triangleCount * 3; vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
diff --git a/src/Device/VertexProcessor.cpp b/src/Device/VertexProcessor.cpp index 82c4547..5c66309 100644 --- a/src/Device/VertexProcessor.cpp +++ b/src/Device/VertexProcessor.cpp
@@ -25,9 +25,9 @@ { void VertexCache::clear() { - for(int i = 0; i < 16; i++) + for(uint32_t i = 0; i < SIZE; i++) { - tag[i] = 0x80000000; + tag[i] = 0xFFFFFFFF; } }
diff --git a/src/Device/VertexProcessor.hpp b/src/Device/VertexProcessor.hpp index 811ac32..a17e86a 100644 --- a/src/Device/VertexProcessor.hpp +++ b/src/Device/VertexProcessor.hpp
@@ -25,12 +25,16 @@ { struct DrawData; - struct VertexCache // FIXME: Variable size + // Basic direct mapped vertex cache. + struct VertexCache { + static constexpr uint32_t SIZE = 64; // TODO: Variable size? + static constexpr uint32_t TAG_MASK = SIZE - 1; // Size must be power of 2. + void clear(); - Vertex vertex[16][4]; - unsigned int tag[16]; + Vertex vertex[SIZE]; + uint32_t tag[SIZE]; int drawCall; };
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp index 647ff3a..e240e7f 100644 --- a/src/Pipeline/VertexProgram.cpp +++ b/src/Pipeline/VertexProgram.cpp
@@ -73,17 +73,23 @@ { } - void VertexProgram::program(UInt &index) + void VertexProgram::program(Pointer<UInt> &batch) { auto it = spirvShader->inputBuiltins.find(spv::BuiltInVertexIndex); if (it != spirvShader->inputBuiltins.end()) { assert(it->second.SizeInComponents == 1); + + Int4 indices; + indices = Insert(indices, As<Int>(batch[0]), 0); + indices = Insert(indices, As<Int>(batch[1]), 1); + indices = Insert(indices, As<Int>(batch[2]), 2); + indices = Insert(indices, As<Int>(batch[3]), 3); routine.getVariable(it->second.Id)[it->second.FirstComponent] = - As<Float4>(Int4(As<Int>(index) + *Pointer<Int>(data + OFFSET(DrawData, baseVertex))) + Int4(0, 1, 2, 3)); + As<Float4>(indices + Int4(*Pointer<Int>(data + OFFSET(DrawData, baseVertex)))); } - auto activeLaneMask = SIMD::Int(0xFFFFFFFF); // TODO: Control this. + auto activeLaneMask = SIMD::Int(0xFFFFFFFF); spirvShader->emit(&routine, activeLaneMask, descriptorSets); spirvShader->emitEpilog(&routine);
diff --git a/src/Pipeline/VertexProgram.hpp b/src/Pipeline/VertexProgram.hpp index 765575d..3c8a664 100644 --- a/src/Pipeline/VertexProgram.hpp +++ b/src/Pipeline/VertexProgram.hpp
@@ -34,7 +34,7 @@ virtual ~VertexProgram(); private: - void program(UInt &index) override; + void program(Pointer<UInt> &batch) override; const vk::DescriptorSet::Bindings &descriptorSets; };
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp index ae65feb..8ada5d4 100644 --- a/src/Pipeline/VertexRoutine.cpp +++ b/src/Pipeline/VertexRoutine.cpp
@@ -42,33 +42,32 @@ { Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache); Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex); - Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag); + Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache,tag)); UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount)); constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants)); + // Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer. + // On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache + // in reverse order to guarantee that the first one doesn't get evicted and can be written out. + Do { UInt index = *batch; - UInt tagIndex = index & 0x0000003C; - UInt indexQ = index & 0xFFFFFFFC; + UInt cacheIndex = index & VertexCache::TAG_MASK; - If(*Pointer<UInt>(tagCache + tagIndex) != indexQ) + If(tagCache[cacheIndex] != index) { - *Pointer<UInt>(tagCache + tagIndex) = indexQ; - - readInput(indexQ); - program(indexQ); + readInput(batch); + program(batch); computeClipFlags(); - Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex)); - writeCache(cacheLine0); + writeCache(vertexCache, tagCache, batch); } - UInt cacheIndex = index & 0x0000003F; - Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex)); - writeVertex(vertex, cacheLine); + Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex)); + writeVertex(vertex, cacheEntry); vertex += sizeof(Vertex); batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t)); @@ -79,7 +78,7 @@ Return(); } - void VertexRoutine::readInput(UInt &index) + void VertexRoutine::readInput(Pointer<UInt> &batch) { for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4) { @@ -88,11 +87,10 @@ spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED || spirvShader->inputs[i + 3].Type != 
SpirvShader::ATTRIBTYPE_UNUSED) { - Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4)); UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4)); - auto value = readStream(input, stride, state.input[i / 4], index); + auto value = readStream(input, stride, state.input[i / 4], batch); routine.inputs[i + 0] = value.x; routine.inputs[i + 1] = value.y; routine.inputs[i + 2] = value.z; @@ -134,14 +132,14 @@ clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)]; } - Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index) + Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch) { Vector4f v; - Pointer<Byte> source0 = buffer + index * stride; - Pointer<Byte> source1 = source0 + stride; - Pointer<Byte> source2 = source1 + stride; - Pointer<Byte> source3 = source2 + stride; + Pointer<Byte> source0 = buffer + batch[0] * stride; + Pointer<Byte> source1 = buffer + batch[1] * stride; + Pointer<Byte> source2 = buffer + batch[2] * stride; + Pointer<Byte> source3 = buffer + batch[3] * stride; bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized; @@ -486,8 +484,25 @@ return v; } - void VertexRoutine::writeCache(Pointer<Byte> &cacheLine) + void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch) { + UInt index0 = batch[0]; + UInt index1 = batch[1]; + UInt index2 = batch[2]; + UInt index3 = batch[3]; + + UInt cacheIndex0 = index0 & VertexCache::TAG_MASK; + UInt cacheIndex1 = index1 & VertexCache::TAG_MASK; + UInt cacheIndex2 = index2 & VertexCache::TAG_MASK; + UInt cacheIndex3 = index3 & VertexCache::TAG_MASK; + + // We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check. 
+ // Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache. + tagCache[cacheIndex3] = index3; + tagCache[cacheIndex2] = index2; + tagCache[cacheIndex1] = index1; + tagCache[cacheIndex0] = index0; + auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition); assert(it != spirvShader->outputBuiltins.end()); assert(it->second.SizeInComponents == 4); @@ -511,10 +526,10 @@ transpose4x4(pos.x, pos.y, pos.z, pos.w); - *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 0, 16) = pos.x; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 1, 16) = pos.y; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 2, 16) = pos.z; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 3, 16) = pos.w; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,position), 16) = pos.w; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,position), 16) = pos.z; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,position), 16) = pos.y; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,position), 16) = pos.x; it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize); if(it != spirvShader->outputBuiltins.end()) @@ -522,23 +537,23 @@ assert(it->second.SizeInComponents == 1); auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent]; - *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 0) = Extract(psize, 0); - *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 1) = Extract(psize, 1); - *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 2) = Extract(psize, 2); - *Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 3) = Extract(psize, 3); + *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,pointSize)) = Extract(psize, 3); + 
*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,pointSize)) = Extract(psize, 2); + *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,pointSize)) = Extract(psize, 1); + *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0); } - *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0) & 0x0000000FF; - *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8) & 0x0000000FF; - *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF; - *Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF; + *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF; + *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF; + *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 8) & 0x0000000FF; + *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 0) & 0x0000000FF; transpose4x4(proj.x, proj.y, proj.z, proj.w); - *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 0, 16) = proj.x; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 1, 16) = proj.y; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 2, 16) = proj.z; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 3, 16) = proj.w; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,projected), 16) = proj.w; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,projected), 16) = proj.z; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,projected), 16) = proj.y; + 
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,projected), 16) = proj.x; for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4) { @@ -555,10 +570,10 @@ transpose4x4(v.x, v.y, v.z, v.w); - *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z; - *Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,v[i]), 16) = v.w; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,v[i]), 16) = v.z; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,v[i]), 16) = v.y; + *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,v[i]), 16) = v.x; } } }
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp index 2e71343..dc22ecb 100644 --- a/src/Pipeline/VertexRoutine.hpp +++ b/src/Pipeline/VertexRoutine.hpp
@@ -62,14 +62,14 @@ SpirvShader const * const spirvShader; private: - virtual void program(UInt &index) = 0; + virtual void program(Pointer<UInt> &batch) = 0; typedef VertexProcessor::State::Input Stream; - Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index); - void readInput(UInt &index); + Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch); + void readInput(Pointer<UInt> &batch); void computeClipFlags(); - void writeCache(Pointer<Byte> &cacheLine); + void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch); void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry); }; }