Device: Migrate Renderer to Yarn
Drop the complex task scheduling logic in favor of yarn.
Performance gains of up to ~30% FPS have been observed.
Bug: b/139142453
Change-Id: I264fee36323425a791088565d99dc586670a948a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35572
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 628d549..3aa4c3e 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -19,7 +19,6 @@
#include "Polygon.hpp"
#include "Reactor/Reactor.hpp"
#include "Pipeline/Constants.hpp"
-#include "System/CPUID.hpp"
#include "System/Memory.hpp"
#include "System/Half.hpp"
#include "System/Math.hpp"
@@ -33,6 +32,10 @@
#include "Pipeline/SpirvShader.hpp"
#include "Vertex.hpp"
+#include "Yarn/Containers.hpp"
+#include "Yarn/Defer.hpp"
+#include "Yarn/Trace.hpp"
+
#undef max
#ifndef NDEBUG
@@ -42,11 +45,6 @@
namespace sw
{
- static const int batchSize = 128;
- std::atomic<int> threadCount(1);
- std::atomic<int> Renderer::unitCount(1);
- std::atomic<int> Renderer::clusterCount(1);
-
template<typename T>
inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, T indices, unsigned int start, unsigned int triangleCount)
{
@@ -138,20 +136,8 @@
return true;
}
- struct Parameters
- {
- Renderer *renderer;
- int threadIndex;
- };
-
DrawCall::DrawCall()
{
- occlusionQuery = nullptr;
-
- references = -1;
-
- events = nullptr;
-
data = (DrawData*)allocate(sizeof(DrawData));
data->constants = &constants;
}
@@ -163,74 +149,14 @@
Renderer::Renderer(vk::Device* device) : device(device)
{
- for(int i = 0; i < 16; i++)
- {
- vertexTask[i] = nullptr;
-
- worker[i] = nullptr;
- resume[i] = nullptr;
- suspend[i] = nullptr;
- }
-
- threadsAwake = 0;
- resumeApp = new Event();
-
- currentDraw = 0;
- nextDraw = 0;
-
- qHead = 0;
- qSize = 0;
-
- for(int i = 0; i < 16; i++)
- {
- triangleBatch[i] = nullptr;
- primitiveBatch[i] = nullptr;
- }
-
- for(int draw = 0; draw < DRAW_COUNT; draw++)
- {
- drawCall[draw] = new DrawCall();
- drawList[draw] = drawCall[draw];
- }
-
- for(int unit = 0; unit < 16; unit++)
- {
- primitiveProgress[unit].init();
- }
-
- for(int cluster = 0; cluster < 16; cluster++)
- {
- pixelProgress[cluster].init();
- }
-
- updateConfiguration(true);
+ VertexProcessor::setRoutineCacheSize(1024);
+ PixelProcessor::setRoutineCacheSize(1024);
+ SetupProcessor::setRoutineCacheSize(1024);
}
Renderer::~Renderer()
{
- sync.wait();
- terminateThreads();
-
- delete resumeApp;
- resumeApp = nullptr;
-
- for(int draw = 0; draw < DRAW_COUNT; draw++)
- {
- delete drawCall[draw];
- drawCall[draw] = nullptr;
- }
- }
-
- // This object has to be mem aligned
- void* Renderer::operator new(size_t size)
- {
- ASSERT(size == sizeof(Renderer)); // This operator can't be called from a derived class
- return sw::allocate(sizeof(Renderer), 16);
- }
-
- void Renderer::operator delete(void * mem)
- {
- sw::deallocate(mem);
+ drawTickets.take().wait();
}
void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
@@ -239,6 +165,9 @@
{
if(count == 0) { return; }
+ auto id = nextDrawID++;
+ YARN_SCOPED_EVENT("draw %d", id);
+
#ifndef NDEBUG
{
unsigned int minPrimitives = 1;
@@ -250,8 +179,6 @@
}
#endif
- updateConfiguration();
-
int ms = context->sampleCount;
if(!context->multiSampleMask)
@@ -259,10 +186,16 @@
return;
}
- sync.add();
+ yarn::Pool<sw::DrawCall>::Loan draw;
+ {
+ YARN_SCOPED_EVENT("drawCallPool.borrow()");
+ draw = drawCallPool.borrow();
+ }
+ draw->id = id;
if(update)
{
+ YARN_SCOPED_EVENT("update");
vertexState = VertexProcessor::update(context);
setupState = SetupProcessor::update(context);
pixelState = PixelProcessor::update(context);
@@ -272,56 +205,29 @@
pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
}
- int batch = batchSize / ms;
-
- int (Renderer::*setupPrimitives)(int batch, int count);
+ DrawCall::SetupFunction setupPrimitives = nullptr;
if(context->isDrawTriangle())
{
- setupPrimitives = &Renderer::setupTriangles;
+ setupPrimitives = &DrawCall::setupTriangles;
}
else if(context->isDrawLine())
{
- setupPrimitives = &Renderer::setupLines;
+ setupPrimitives = &DrawCall::setupLines;
}
else // Point draw
{
- setupPrimitives = &Renderer::setupPoints;
+ setupPrimitives = &DrawCall::setupPoints;
}
- DrawCall *draw = nullptr;
-
- do
- {
- for(int i = 0; i < DRAW_COUNT; i++)
- {
- if(drawCall[i]->references == -1)
- {
- draw = drawCall[i];
- drawList[nextDraw & DRAW_COUNT_BITS] = draw;
-
- break;
- }
- }
-
- if(!draw)
- {
- resumeApp->wait();
- }
- }
- while(!draw);
-
DrawData *data = draw->data;
-
- if (occlusionQuery)
- {
- occlusionQuery->start();
- }
draw->occlusionQuery = occlusionQuery;
-
+ draw->batchDataPool = &batchDataPool;
+ draw->numPrimitives = count;
+ draw->numPrimitivesPerBatch = MaxBatchSize / ms;
+ draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
draw->topology = context->topology;
draw->indexType = indexType;
- draw->batchSize = batch;
draw->vertexRoutine = vertexRoutine;
draw->setupRoutine = setupRoutine;
@@ -335,14 +241,6 @@
data->descriptorSets = context->descriptorSets;
data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
- if(events)
- {
- events->start();
- }
-
- ASSERT(!draw->events);
- draw->events = events;
-
for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
{
data->input[i] = context->input[i].buffer;
@@ -383,7 +281,7 @@
if(pixelState.occlusionEnabled)
{
- for(int cluster = 0; cluster < clusterCount; cluster++)
+ for(int cluster = 0; cluster < MaxClusterCount; cluster++)
{
data->occlusion[cluster] = 0;
}
@@ -461,357 +359,197 @@
data->pushConstants = pushConstants;
}
- draw->primitive = 0;
- draw->count = count;
+ draw->events = events;
- draw->references = (count + batch - 1) / batch;
+ DrawCall::run(draw, &drawTickets, clusterQueues);
+ }
- schedulerMutex.lock();
- ++nextDraw; // Atomic
- schedulerMutex.unlock();
-
- #ifndef NDEBUG
- if(threadCount == 1) // Use main thread for draw execution
+ void DrawCall::setup()
+ {
+ if(occlusionQuery != nullptr)
{
- threadsAwake = 1;
- task[0].type = Task::RESUME;
-
- taskLoop(0);
+ occlusionQuery->start();
}
- else
- #endif
+
+ if(events)
{
- if(!threadsAwake)
+ events->start();
+ }
+ }
+
+ void DrawCall::teardown()
+ {
+ if(events)
+ {
+ events->finish();
+ events = nullptr;
+ }
+
+ if (occlusionQuery != nullptr)
+ {
+ for(int cluster = 0; cluster < MaxClusterCount; cluster++)
{
- suspend[0]->wait();
-
- threadsAwake = 1;
- task[0].type = Task::RESUME;
-
- resume[0]->signal();
+ occlusionQuery->add(data->occlusion[cluster]);
}
+ occlusionQuery->finish();
}
+
+ vertexRoutine.reset();
+ setupRoutine.reset();
+ pixelRoutine.reset();
}
- void Renderer::threadFunction(void *parameters)
+ void DrawCall::run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount])
{
- Renderer *renderer = static_cast<Parameters*>(parameters)->renderer;
- int threadIndex = static_cast<Parameters*>(parameters)->threadIndex;
+ draw->setup();
- CPUID::setFlushToZero(true);
- CPUID::setDenormalsAreZero(true);
+ auto const numPrimitives = draw->numPrimitives;
+ auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
+ auto const numBatches = draw->numBatches;
- renderer->threadLoop(threadIndex);
- }
+ auto ticket = tickets->take();
+ auto finally = yarn::make_shared_finally([draw, ticket] {
+ YARN_SCOPED_EVENT("FINISH draw %d", draw->id);
+ draw->teardown();
+ ticket.done();
+ });
- void Renderer::threadLoop(int threadIndex)
- {
- while(!exitThreads)
+ for (unsigned int batchId = 0; batchId < numBatches; batchId++)
{
- taskLoop(threadIndex);
+ auto batch = draw->batchDataPool->borrow();
+ batch->id = batchId;
+ batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
+ batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
- suspend[threadIndex]->signal();
- resume[threadIndex]->wait();
- }
- }
-
- void Renderer::taskLoop(int threadIndex)
- {
- while(task[threadIndex].type != Task::SUSPEND)
- {
- scheduleTask(threadIndex);
- executeTask(threadIndex);
- }
- }
-
- void Renderer::findAvailableTasks()
- {
- // Find pixel tasks
- for(int cluster = 0; cluster < clusterCount; cluster++)
- {
- if(!pixelProgress[cluster].executing)
+ for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{
- for(int unit = 0; unit < unitCount; unit++)
+ batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
+ }
+
+ yarn::schedule([draw, batch, finally] {
+
+ processVertices(draw.get(), batch.get());
+
+ if (!draw->setupState.rasterizerDiscard)
{
- if(primitiveProgress[unit].references > 0) // Contains processed primitives
+ processPrimitives(draw.get(), batch.get());
+
+ if (batch->numVisible > 0)
{
- if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall)
- {
- if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive) // Previous primitives have been rendered
- {
- Task &task = taskQueue[qHead];
- task.type = Task::PIXELS;
- task.primitiveUnit = unit;
- task.pixelCluster = cluster;
-
- pixelProgress[cluster].executing = true;
-
- // Commit to the task queue
- qHead = (qHead + 1) & TASK_COUNT_BITS;
- ++qSize; // Atomic
-
- break;
- }
- }
+ processPixels(draw, batch, finally);
+ return;
}
}
- }
- }
- // Find primitive tasks
- if(currentDraw == nextDraw)
- {
- return; // No more primitives to process
- }
-
- for(int unit = 0; unit < unitCount; unit++)
- {
- DrawCall *draw = drawList[currentDraw & DRAW_COUNT_BITS];
-
- int primitive = draw->primitive;
- int count = draw->count;
-
- if(primitive >= count)
- {
- ++currentDraw; // Atomic
-
- if(currentDraw == nextDraw)
+ for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{
- return; // No more primitives to process
+ batch->clusterTickets[cluster].done();
}
-
- draw = drawList[currentDraw & DRAW_COUNT_BITS];
- }
-
- if(!primitiveProgress[unit].references) // Task not already being executed and not still in use by a pixel unit
- {
- primitive = draw->primitive;
- count = draw->count;
- int batch = draw->batchSize;
-
- primitiveProgress[unit].drawCall = currentDraw.load();
- primitiveProgress[unit].firstPrimitive = primitive;
- primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive;
-
- draw->primitive += batch;
-
- Task &task = taskQueue[qHead];
- task.type = Task::PRIMITIVES;
- task.primitiveUnit = unit;
-
- primitiveProgress[unit].references = -1;
-
- // Commit to the task queue
- qHead = (qHead + 1) & TASK_COUNT_BITS;
- ++qSize; // Atomic
- }
+ });
}
}
- void Renderer::scheduleTask(int threadIndex)
+ void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
{
- schedulerMutex.lock();
+ YARN_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
- int curThreadsAwake = threadsAwake;
-
- if((int)qSize < threadCount - curThreadsAwake + 1)
+ unsigned int triangleIndices[MaxBatchSize + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
{
- findAvailableTasks();
+ YARN_SCOPED_EVENT("processPrimitiveVertices");
+ processPrimitiveVertices(
+ triangleIndices,
+ draw->data->indices,
+ draw->indexType,
+ batch->firstPrimitive,
+ batch->numPrimitives,
+ draw->topology);
}
- if(qSize != 0)
+ auto& vertexTask = batch->vertexTask;
+ vertexTask.primitiveStart = batch->firstPrimitive;
+ vertexTask.vertexCount = batch->numPrimitives * 3;
+ if (vertexTask.vertexCache.drawCall != draw->id)
{
- task[threadIndex] = taskQueue[(qHead - qSize) & TASK_COUNT_BITS];
- --qSize; // Atomic
-
- if(curThreadsAwake != threadCount)
- {
- int wakeup = qSize - curThreadsAwake + 1;
-
- for(int i = 0; i < threadCount && wakeup > 0; i++)
- {
- if(task[i].type == Task::SUSPEND)
- {
- suspend[i]->wait();
- task[i].type = Task::RESUME;
- resume[i]->signal();
-
- ++threadsAwake; // Atomic
- wakeup--;
- }
- }
- }
- }
- else
- {
- task[threadIndex].type = Task::SUSPEND;
-
- --threadsAwake; // Atomic
+ vertexTask.vertexCache.clear();
+ vertexTask.vertexCache.drawCall = draw->id;
}
- schedulerMutex.unlock();
+ draw->vertexPointer(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
}
- void Renderer::executeTask(int threadIndex)
+ void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
{
- switch(task[threadIndex].type.load())
+ YARN_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
+ auto triangles = &batch->triangles[0];
+ auto primitives = &batch->primitives[0];
+ batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
+ }
+
+ void DrawCall::processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
+ {
+ struct Data
{
- case Task::PRIMITIVES:
+ Data(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
+ : draw(draw), batch(batch), finally(finally) {}
+ yarn::Loan<DrawCall> draw;
+ yarn::Loan<BatchData> batch;
+ std::shared_ptr<yarn::Finally> finally;
+ };
+ auto data = std::make_shared<Data>(draw, batch, finally);
+ for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+ {
+ batch->clusterTickets[cluster].onCall([data, cluster]
{
- int unit = task[threadIndex].primitiveUnit;
-
- int input = primitiveProgress[unit].firstPrimitive;
- int count = primitiveProgress[unit].primitiveCount;
- DrawCall *draw = drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
- int (Renderer::*setupPrimitives)(int batch, int count) = draw->setupPrimitives;
-
- processPrimitiveVertices(unit, input, count, draw->count, threadIndex);
-
- int visible = 0;
-
- if(!draw->setupState.rasterizerDiscard)
- {
- visible = (this->*setupPrimitives)(unit, count);
- }
-
- primitiveProgress[unit].visible = visible;
- primitiveProgress[unit].references = clusterCount.load();
- }
- break;
- case Task::PIXELS:
- {
- int unit = task[threadIndex].primitiveUnit;
- int visible = primitiveProgress[unit].visible;
-
- if(visible > 0)
- {
- int cluster = task[threadIndex].pixelCluster;
- Primitive *primitive = primitiveBatch[unit];
- DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
- DrawData *data = draw->data;
- PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
-
- pixelRoutine(primitive, visible, cluster, clusterCount, data);
- }
-
- finishRendering(task[threadIndex]);
- }
- break;
- case Task::RESUME:
- break;
- case Task::SUSPEND:
- break;
- default:
- ASSERT(false);
+ auto& draw = data->draw;
+ auto& batch = data->batch;
+ YARN_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
+ draw->pixelPointer(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
+ batch->clusterTickets[cluster].done();
+ });
}
}
void Renderer::synchronize()
{
- sync.wait();
+ YARN_SCOPED_EVENT("synchronize");
+ auto ticket = drawTickets.take();
+ ticket.wait();
device->updateSamplingRoutineConstCache();
+ ticket.done();
}
- void Renderer::finishRendering(Task &pixelTask)
+ void DrawCall::processPrimitiveVertices(
+ unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+ const void *primitiveIndices,
+ VkIndexType indexType,
+ unsigned int start,
+ unsigned int triangleCount,
+ VkPrimitiveTopology topology)
{
- int unit = pixelTask.primitiveUnit;
- int cluster = pixelTask.pixelCluster;
-
- DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
- DrawData &data = *draw.data;
- int primitive = primitiveProgress[unit].firstPrimitive;
- int count = primitiveProgress[unit].primitiveCount;
- int processedPrimitives = primitive + count;
-
- pixelProgress[cluster].processedPrimitives = processedPrimitives;
-
- if(pixelProgress[cluster].processedPrimitives >= draw.count)
- {
- ++pixelProgress[cluster].drawCall; // Atomic
- pixelProgress[cluster].processedPrimitives = 0;
- }
-
- int ref = --primitiveProgress[unit].references; // Atomic
-
- if(ref == 0)
- {
- ref = --draw.references; // Atomic
-
- if(ref == 0)
- {
- if (draw.occlusionQuery)
- {
- for(int cluster = 0; cluster < clusterCount; cluster++)
- {
- draw.occlusionQuery->add(data.occlusion[cluster]);
- }
- draw.occlusionQuery->finish();
- }
-
- draw.vertexRoutine.reset();
- draw.setupRoutine.reset();
- draw.pixelRoutine.reset();
-
- if(draw.events)
- {
- draw.events->finish();
- draw.events = nullptr;
- }
-
- sync.done();
-
- draw.references = -1;
- resumeApp->signal();
- }
- }
-
- pixelProgress[cluster].executing = false;
- }
-
- void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread)
- {
- Triangle *triangle = triangleBatch[unit];
- int primitiveDrawCall = primitiveProgress[unit].drawCall;
- DrawCall *draw = drawList[primitiveDrawCall & DRAW_COUNT_BITS];
- DrawData *data = draw->data;
- VertexTask *task = vertexTask[thread];
-
- const void *indices = data->indices;
- VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;
-
- if(task->vertexCache.drawCall != primitiveDrawCall)
- {
- task->vertexCache.clear();
- task->vertexCache.drawCall = primitiveDrawCall;
- }
-
- unsigned int batch[128 + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
- VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology));
-
- if(!indices)
+ if(!primitiveIndices)
{
struct LinearIndex
{
unsigned int operator[](unsigned int i) { return i; }
};
- if(!setBatchIndices(batch, topology, LinearIndex(), start, triangleCount))
+ if(!setBatchIndices(triangleIndicesOut, topology, LinearIndex(), start, triangleCount))
{
return;
}
}
else
{
- switch(draw->indexType.load())
+ switch(indexType)
{
case VK_INDEX_TYPE_UINT16:
- if(!setBatchIndices(batch, topology, static_cast<const uint16_t*>(indices), start, triangleCount))
+ if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
{
return;
}
break;
case VK_INDEX_TYPE_UINT32:
- if(!setBatchIndices(batch, topology, static_cast<const uint32_t*>(indices), start, triangleCount))
+ if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
{
return;
}
@@ -824,33 +562,25 @@
}
// Repeat the last index to allow for SIMD width overrun.
- batch[triangleCount][0] = batch[triangleCount - 1][2];
- batch[triangleCount][1] = batch[triangleCount - 1][2];
- batch[triangleCount][2] = batch[triangleCount - 1][2];
-
- task->primitiveStart = start;
- task->vertexCount = triangleCount * 3;
- vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
+ triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
+ triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
+ triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
}
- int Renderer::setupTriangles(int unit, int count)
+ int DrawCall::setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
- Triangle *triangle = triangleBatch[unit];
- Primitive *primitive = primitiveBatch[unit];
-
- DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
- SetupProcessor::State &state = draw.setupState;
- const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
+ auto &state = drawCall->setupState;
+ auto setupRoutine = drawCall->setupPointer;
int ms = state.multiSample;
- const DrawData *data = draw.data;
+ const DrawData *data = drawCall->data;
int visible = 0;
- for(int i = 0; i < count; i++, triangle++)
+ for(int i = 0; i < count; i++, triangles++)
{
- Vertex &v0 = triangle->v0;
- Vertex &v1 = triangle->v1;
- Vertex &v2 = triangle->v2;
+ Vertex &v0 = triangles->v0;
+ Vertex &v1 = triangles->v1;
+ Vertex &v2 = triangles->v2;
if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
{
@@ -860,15 +590,15 @@
if(clipFlagsOr != Clipper::CLIP_FINITE)
{
- if(!Clipper::Clip(polygon, clipFlagsOr, draw))
+ if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
{
continue;
}
}
- if(setupRoutine(primitive, triangle, &polygon, data))
+ if(setupRoutine(primitives, triangles, &polygon, data))
{
- primitive += ms;
+ primitives += ms;
visible++;
}
}
@@ -877,57 +607,49 @@
return visible;
}
- int Renderer::setupLines(int unit, int count)
+ int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
- Triangle *triangle = triangleBatch[unit];
- Primitive *primitive = primitiveBatch[unit];
+ auto &state = drawCall->setupState;
+
int visible = 0;
-
- DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
- SetupProcessor::State &state = draw.setupState;
-
int ms = state.multiSample;
for(int i = 0; i < count; i++)
{
- if(setupLine(*primitive, *triangle, draw))
+ if(setupLine(*primitives, *triangles, *drawCall))
{
- primitive += ms;
+ primitives += ms;
visible++;
}
- triangle++;
+ triangles++;
}
return visible;
}
- int Renderer::setupPoints(int unit, int count)
+ int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
- Triangle *triangle = triangleBatch[unit];
- Primitive *primitive = primitiveBatch[unit];
+ auto &state = drawCall->setupState;
+
int visible = 0;
-
- DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
- SetupProcessor::State &state = draw.setupState;
-
int ms = state.multiSample;
for(int i = 0; i < count; i++)
{
- if(setupPoint(*primitive, *triangle, draw))
+ if(setupPoint(*primitives, *triangles, *drawCall))
{
- primitive += ms;
+ primitives += ms;
visible++;
}
- triangle++;
+ triangles++;
}
return visible;
}
- bool Renderer::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+ bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
const DrawData &data = *draw.data;
@@ -1120,7 +842,7 @@
return false;
}
- bool Renderer::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+ bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
const DrawData &data = *draw.data;
@@ -1183,76 +905,6 @@
return false;
}
- void Renderer::initializeThreads()
- {
- unitCount = ceilPow2(threadCount);
- clusterCount = ceilPow2(threadCount);
-
- for(int i = 0; i < unitCount; i++)
- {
- triangleBatch[i] = (Triangle*)allocate(batchSize * sizeof(Triangle));
- primitiveBatch[i] = (Primitive*)allocate(batchSize * sizeof(Primitive));
- }
-
- for(int i = 0; i < threadCount; i++)
- {
- vertexTask[i] = (VertexTask*)allocate(sizeof(VertexTask));
- vertexTask[i]->vertexCache.drawCall = -1;
-
- task[i].type = Task::SUSPEND;
-
- resume[i] = new Event();
- suspend[i] = new Event();
-
- Parameters parameters;
- parameters.threadIndex = i;
- parameters.renderer = this;
-
- exitThreads = false;
- worker[i] = new std::thread(threadFunction, ¶meters);
-
- suspend[i]->wait();
- suspend[i]->signal();
- }
- }
-
- void Renderer::terminateThreads()
- {
- while(threadsAwake != 0)
- {
- std::this_thread::yield();
- }
-
- for(int thread = 0; thread < threadCount; thread++)
- {
- if(worker[thread])
- {
- exitThreads = true;
- resume[thread]->signal();
- worker[thread]->join();
-
- delete worker[thread];
- worker[thread] = 0;
- delete resume[thread];
- resume[thread] = 0;
- delete suspend[thread];
- suspend[thread] = 0;
- }
-
- deallocate(vertexTask[thread]);
- vertexTask[thread] = 0;
- }
-
- for(int i = 0; i < 16; i++)
- {
- deallocate(triangleBatch[i]);
- triangleBatch[i] = 0;
-
- deallocate(primitiveBatch[i]);
- primitiveBatch[i] = 0;
- }
- }
-
void Renderer::addQuery(vk::Query *query)
{
ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
@@ -1292,28 +944,4 @@
this->scissor = scissor;
}
- void Renderer::updateConfiguration(bool initialUpdate)
- {
- if(initialUpdate)
- {
- terminateThreads();
-
- VertexProcessor::setRoutineCacheSize(1024);
- PixelProcessor::setRoutineCacheSize(1024);
- SetupProcessor::setRoutineCacheSize(1024);
-
- threadCount = CPUID::processAffinity();
-
- CPUID::setEnableSSE4_1(true);
- CPUID::setEnableSSSE3(true);
- CPUID::setEnableSSE3(true);
- CPUID::setEnableSSE2(true);
- CPUID::setEnableSSE(true);
- }
-
- if(!initialUpdate && !worker[0])
- {
- initializeThreads();
- }
- }
}
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index e714d37..70a4b02 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -19,11 +19,15 @@
#include "PixelProcessor.hpp"
#include "SetupProcessor.hpp"
#include "Plane.hpp"
+#include "Primitive.hpp"
#include "Blitter.hpp"
#include "Device/Config.hpp"
-#include "System/Synchronization.hpp"
#include "Vulkan/VkDescriptorSet.hpp"
+#include "Yarn/Pool.hpp"
+#include "Yarn/Finally.hpp"
+#include "Yarn/Ticket.hpp"
+
#include <atomic>
#include <list>
#include <mutex>
@@ -46,6 +50,14 @@
class Resource;
struct Constants;
+ static constexpr int MaxBatchSize = 128; // Element count of TriangleBatch / PrimitiveBatch; Renderer::draw() uses MaxBatchSize / sampleCount primitives per batch.
+ static constexpr int MaxBatchCount = 16; // Size argument of DrawCall::BatchData::Pool.
+ static constexpr int MaxClusterCount = 16; // Number of pixel clusters: sizes the cluster ticket queues and DrawData::occlusion.
+ static constexpr int MaxDrawCount = 16; // Size argument of DrawCall::Pool.
+
+ using TriangleBatch = std::array<Triangle, MaxBatchSize>; // Per-batch triangle storage (see DrawCall::BatchData::triangles).
+ using PrimitiveBatch = std::array<Primitive, MaxBatchSize>; // Per-batch primitive storage (see DrawCall::BatchData::primitives).
+
struct DrawData
{
const Constants *constants;
@@ -64,7 +76,7 @@
PixelProcessor::Stencil stencil[2]; // clockwise, counterclockwise
PixelProcessor::Factor factor;
- unsigned int occlusion[16]; // Number of pixels passing depth test
+ unsigned int occlusion[MaxClusterCount]; // Number of pixels passing depth test
float4 Wx16;
float4 Hx16;
@@ -100,71 +112,88 @@
PushConstantStorage pushConstants;
};
+ struct DrawCall // State of a single draw; filled in by Renderer::draw() and executed by DrawCall::run().
+ {
+ struct BatchData // Working state for one batch of primitives within a draw.
+ {
+ using Pool = yarn::BoundedPool<BatchData, MaxBatchCount, yarn::PoolPolicy::Preserve>; // Pool type that run() borrows batches from.
+
+ TriangleBatch triangles; // Handed to the vertex routine in processVertices(), then to setupPrimitives.
+ PrimitiveBatch primitives; // Handed to setupPrimitives, then to the pixel routine in processPixels().
+ VertexTask vertexTask; // Task passed to the vertex routine; its vertexCache is cleared when the draw id changes.
+ unsigned int id; // Index of this batch within its draw.
+ unsigned int firstPrimitive; // id * numPrimitivesPerBatch.
+ unsigned int numPrimitives; // Primitives in this batch; clamped so the last batch does not run past the draw.
+ int numVisible; // Return value of setupPrimitives; pixel processing only runs when > 0.
+ yarn::Ticket clusterTickets[MaxClusterCount]; // One ticket per cluster, taken from the cluster queues in run().
+ };
+
+ using Pool = yarn::BoundedPool<DrawCall, MaxDrawCount, yarn::PoolPolicy::Preserve>; // Pool type Renderer borrows DrawCalls from.
+ using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count); // Returns the number of visible primitives.
+
+ DrawCall();
+ ~DrawCall();
+
+ static void run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount]); // Calls setup(), then schedules one yarn task per batch.
+ static void processVertices(DrawCall* draw, BatchData* batch); // Builds the batch's index list and invokes the vertex routine.
+ static void processPrimitives(DrawCall* draw, BatchData* batch); // Runs setupPrimitives and stores the result in batch->numVisible.
+ static void processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally); // Registers the pixel routine on each of the batch's cluster tickets.
+ void setup(); // Starts the occlusion query and events, when set.
+ void teardown(); // Finishes events, adds per-cluster occlusion counts to the query, and releases the routines.
+
+ int id; // Taken from Renderer::nextDrawID in draw(); used in trace events and to tag the vertex cache.
+
+ BatchData::Pool *batchDataPool; // Set to the Renderer's batchDataPool in draw().
+ unsigned int numPrimitives; // Primitive count passed to Renderer::draw().
+ unsigned int numPrimitivesPerBatch; // MaxBatchSize / context->sampleCount.
+ unsigned int numBatches; // numPrimitives / numPrimitivesPerBatch, rounded up.
+
+ VkPrimitiveTopology topology; // Copied from the context in draw().
+ VkIndexType indexType; // Index type passed to Renderer::draw().
+
+ std::shared_ptr<Routine> vertexRoutine; // Reset in teardown().
+ std::shared_ptr<Routine> setupRoutine; // Reset in teardown().
+ std::shared_ptr<Routine> pixelRoutine; // Reset in teardown().
+
+ VertexProcessor::RoutinePointer vertexPointer; // Called by processVertices().
+ SetupProcessor::RoutinePointer setupPointer; // Called by the setup functions below.
+ PixelProcessor::RoutinePointer pixelPointer; // Called once per cluster by processPixels().
+
+ SetupFunction setupPrimitives; // Expected to be setupTriangles, setupLines or setupPoints (selected in Renderer::draw()).
+ SetupProcessor::State setupState; // rasterizerDiscard and multiSample are read from here.
+
+ vk::ImageView *renderTarget[RENDERTARGETS];
+ vk::ImageView *depthBuffer;
+ vk::ImageView *stencilBuffer;
+ TaskEvents *events; // Started in setup(); finished and cleared in teardown(). May be null.
+
+ vk::Query* occlusionQuery; // Started in setup(); receives the occlusion counts in teardown(). May be null.
+
+ DrawData *data; // Allocated in the constructor.
+
+ static void processPrimitiveVertices( // Fills triangleIndicesOut with the vertex indices of the batch's primitives.
+ unsigned int triangleIndicesOut[MaxBatchSize + 1][3], // Extra entry repeats the last index to allow for SIMD width overrun.
+ const void *primitiveIndices, // Null selects linear (non-indexed) indices.
+ VkIndexType indexType,
+ unsigned int start,
+ unsigned int triangleCount,
+ VkPrimitiveTopology topology);
+
+ static int setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count); // Returns the number of visible primitives.
+ static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count); // Returns the number of visible primitives.
+ static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count); // Returns the number of visible primitives.
+
+ static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw); // Per-primitive helper used by setupLines().
+ static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw); // Per-primitive helper used by setupPoints().
+ };
+
class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
{
- struct Task
- {
- enum Type
- {
- PRIMITIVES,
- PIXELS,
-
- RESUME,
- SUSPEND
- };
-
- void operator=(const Task& task)
- {
- type = task.type.load();
- primitiveUnit = task.primitiveUnit.load();
- pixelCluster = task.pixelCluster.load();
- }
-
- std::atomic<int> type;
- std::atomic<int> primitiveUnit;
- std::atomic<int> pixelCluster;
- };
-
- struct PrimitiveProgress
- {
- void init()
- {
- drawCall = 0;
- firstPrimitive = 0;
- primitiveCount = 0;
- visible = 0;
- references = 0;
- }
-
- std::atomic<int> drawCall;
- std::atomic<int> firstPrimitive;
- std::atomic<int> primitiveCount;
- std::atomic<int> visible;
- std::atomic<int> references;
- };
-
- struct PixelProgress
- {
- void init()
- {
- drawCall = 0;
- processedPrimitives = 0;
- executing = false;
- }
-
- std::atomic<int> drawCall;
- std::atomic<int> processedPrimitives;
- std::atomic<int> executing;
- };
-
public:
Renderer(vk::Device* device);
virtual ~Renderer();
- void *operator new(size_t size);
- void operator delete(void * mem);
-
bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
@@ -182,74 +211,18 @@
void synchronize();
- static int getClusterCount() { return clusterCount; }
-
private:
- static void threadFunction(void *parameters);
- void threadLoop(int threadIndex);
- void taskLoop(int threadIndex);
- void findAvailableTasks();
- void scheduleTask(int threadIndex);
- void executeTask(int threadIndex);
- void finishRendering(Task &pixelTask);
-
- void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
-
- int setupTriangles(int batch, int count);
- int setupLines(int batch, int count);
- int setupPoints(int batch, int count);
-
- bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
- bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
-
- void updateConfiguration(bool initialUpdate = false);
- void initializeThreads();
- void terminateThreads();
-
VkViewport viewport;
VkRect2D scissor;
- Triangle *triangleBatch[16];
- Primitive *primitiveBatch[16];
+ DrawCall::Pool drawCallPool;
+ DrawCall::BatchData::Pool batchDataPool;
- std::atomic<int> exitThreads;
- std::atomic<int> threadsAwake;
- std::thread *worker[16];
- Event *resume[16]; // Events for resuming threads
- Event *suspend[16]; // Events for suspending threads
- Event *resumeApp; // Event for resuming the application thread
-
- PrimitiveProgress primitiveProgress[16];
- PixelProgress pixelProgress[16];
- Task task[16]; // Current tasks for threads
-
- enum {
- DRAW_COUNT = 16, // Number of draw calls buffered (must be power of 2)
- DRAW_COUNT_BITS = DRAW_COUNT - 1,
- };
- DrawCall *drawCall[DRAW_COUNT];
- DrawCall *drawList[DRAW_COUNT];
-
- std::atomic<int> currentDraw;
- std::atomic<int> nextDraw;
-
- enum {
- TASK_COUNT = 32, // Size of the task queue (must be power of 2)
- TASK_COUNT_BITS = TASK_COUNT - 1,
- };
- Task taskQueue[TASK_COUNT];
- std::atomic<int> qHead;
- std::atomic<int> qSize;
-
- static std::atomic<int> unitCount;
- static std::atomic<int> clusterCount;
-
- std::mutex schedulerMutex;
-
- VertexTask *vertexTask[16];
+ std::atomic<int> nextDrawID = {0};
vk::Query *occlusionQuery;
- WaitGroup sync;
+ yarn::Ticket::Queue drawTickets;
+ yarn::Ticket::Queue clusterQueues[MaxClusterCount];
VertexProcessor::State vertexState;
SetupProcessor::State setupState;
@@ -262,40 +235,6 @@
vk::Device* device;
};
- struct DrawCall
- {
- DrawCall();
-
- ~DrawCall();
-
- std::atomic<int> topology;
- std::atomic<int> indexType;
- std::atomic<int> batchSize;
-
- std::shared_ptr<Routine> vertexRoutine;
- std::shared_ptr<Routine> setupRoutine;
- std::shared_ptr<Routine> pixelRoutine;
-
- VertexProcessor::RoutinePointer vertexPointer;
- SetupProcessor::RoutinePointer setupPointer;
- PixelProcessor::RoutinePointer pixelPointer;
-
- int (Renderer::*setupPrimitives)(int batch, int count);
- SetupProcessor::State setupState;
-
- vk::ImageView *renderTarget[RENDERTARGETS];
- vk::ImageView *depthBuffer;
- vk::ImageView *stencilBuffer;
- TaskEvents *events;
-
- vk::Query *occlusionQuery;
-
- std::atomic<int> primitive; // Current primitive to enter pipeline
- std::atomic<int> count; // Number of primitives to render
- std::atomic<int> references; // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
-
- DrawData *data;
- };
}
#endif // sw_Renderer_hpp