Threaded Queue submit with events and fences

This CL does 3 main things:
- It pushes the queue submit operation to its own thread
- It implements events
- It implements fences

Some details:
- Because we can have N async draw operations and we need to signal
  the fence only after all operations are completed, fences have an
  add/done mechanism to allow signaling the fence only when all
  draw operations are completed.
- Device::waitForFences() detects large timeouts to avoid integer
  overflow when now+timeout exceeds the remaining nanoseconds
  representable in a signed 64-bit integer (long long).

Bug b/117835459

Change-Id: I2f02c3b4bb9d9ac9037909b02b0601e1bae15d21
Tests: dEQP-VK.synchronization.*
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/29769
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Vulkan/VkCommandBuffer.cpp b/src/Vulkan/VkCommandBuffer.cpp
index 9a25dad..56c37f1 100644
--- a/src/Vulkan/VkCommandBuffer.cpp
+++ b/src/Vulkan/VkCommandBuffer.cpp
@@ -15,6 +15,7 @@
 #include "VkCommandBuffer.hpp"
 #include "VkBuffer.hpp"
 #include "VkEvent.hpp"
+#include "VkFence.hpp"
 #include "VkFramebuffer.hpp"
 #include "VkImage.hpp"
 #include "VkImageView.hpp"
@@ -523,7 +524,8 @@
 		for(uint32_t instance = firstInstance; instance != firstInstance + instanceCount; instance++)
 		{
 			executionState.renderer->setInstanceID(instance);
-			executionState.renderer->draw(context.topology, executionState.indexType, primitiveCount, vertexOffset);
+			executionState.renderer->draw(context.topology, executionState.indexType, primitiveCount, vertexOffset,
+			                              executionState.fence);
 			executionState.renderer->advanceInstanceAttributes();
 		}
 	}
@@ -843,11 +845,8 @@
 
 	void play(CommandBuffer::ExecutionState& executionState) override
 	{
-		if(Cast(ev)->signal())
-		{
-			// Was waiting for signal on this event, sync now
-			executionState.renderer->synchronize();
-		}
+		executionState.renderer->synchronize();
+		Cast(ev)->signal();
 	}
 
 private:
@@ -879,11 +878,8 @@
 
 	void play(CommandBuffer::ExecutionState& executionState) override
 	{
-		if(!Cast(ev)->wait())
-		{
-			// Already signaled, sync now
-			executionState.renderer->synchronize();
-		}
+		executionState.renderer->synchronize();
+		Cast(ev)->wait();
 	}
 
 private:
diff --git a/src/Vulkan/VkCommandBuffer.hpp b/src/Vulkan/VkCommandBuffer.hpp
index 0aeeb96..f13ee23 100644
--- a/src/Vulkan/VkCommandBuffer.hpp
+++ b/src/Vulkan/VkCommandBuffer.hpp
@@ -31,6 +31,7 @@
 namespace vk
 {
 
+class Fence;
 class Framebuffer;
 class Pipeline;
 class RenderPass;
@@ -132,6 +133,7 @@
 		};
 
 		sw::Renderer* renderer = nullptr;
+		Fence* fence = nullptr;
 		RenderPass* renderPass = nullptr;
 		Framebuffer* renderPassFramebuffer = nullptr;
 		std::array<PipelineState, VK_PIPELINE_BIND_POINT_RANGE_SIZE> pipelineState;
diff --git a/src/Vulkan/VkDevice.cpp b/src/Vulkan/VkDevice.cpp
index ca91985..6b082e6 100644
--- a/src/Vulkan/VkDevice.cpp
+++ b/src/Vulkan/VkDevice.cpp
@@ -17,11 +17,22 @@
 #include "VkConfig.h"
 #include "VkDebug.hpp"
 #include "VkDescriptorSetLayout.hpp"
+#include "VkFence.hpp"
 #include "VkQueue.hpp"
 #include "Device/Blitter.hpp"
 
+#include <chrono>
+#include <climits>
 #include <new> // Must #include this to use "placement new"
 
+namespace
+{
+	std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> now()
+	{
+		return std::chrono::time_point_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now());
+	}
+}
+
 namespace vk
 {
 
@@ -42,7 +53,7 @@
 
 		for(uint32_t j = 0; j < queueCreateInfo.queueCount; j++, queueID++)
 		{
-			new (&queues[queueID]) Queue(queueCreateInfo.queueFamilyIndex, queueCreateInfo.pQueuePriorities[j]);
+			new (&queues[queueID]) Queue();
 		}
 	}
 
@@ -86,17 +97,85 @@
 	return queues[queueIndex];
 }
 
-void Device::waitForFences(uint32_t fenceCount, const VkFence* pFences, VkBool32 waitAll, uint64_t timeout)
+VkResult Device::waitForFences(uint32_t fenceCount, const VkFence* pFences, VkBool32 waitAll, uint64_t timeout)
 {
-	// FIXME(b/117835459) : noop
+	const time_point start = now();
+	const uint64_t max_timeout = (LLONG_MAX - start.time_since_epoch().count());
+	bool infiniteTimeout = (timeout > max_timeout);
+	const time_point end_ns = start + std::chrono::nanoseconds(std::min(max_timeout, timeout));
+	if(waitAll) // All fences must be signaled
+	{
+		for(uint32_t i = 0; i < fenceCount; i++)
+		{
+			if(timeout == 0)
+			{
+				if(Cast(pFences[i])->getStatus() != VK_SUCCESS) // At least one fence is not signaled
+				{
+					return VK_TIMEOUT;
+				}
+			}
+			else if(infiniteTimeout)
+			{
+				if(Cast(pFences[i])->wait() != VK_SUCCESS) // At least one fence is not signaled
+				{
+					return VK_TIMEOUT;
+				}
+			}
+			else
+			{
+				if(Cast(pFences[i])->waitUntil(end_ns) != VK_SUCCESS) // At least one fence is not signaled
+				{
+					return VK_TIMEOUT;
+				}
+			}
+		}
+
+		return VK_SUCCESS;
+	}
+	else // At least one fence must be signaled
+	{
+		// Start by quickly checking the status of all fences, as only one is required
+		for(uint32_t i = 0; i < fenceCount; i++)
+		{
+			if(Cast(pFences[i])->getStatus() == VK_SUCCESS) // At least one fence is signaled
+			{
+				return VK_SUCCESS;
+			}
+		}
+
+		if(timeout > 0)
+		{
+			for(uint32_t i = 0; i < fenceCount; i++)
+			{
+				if(infiniteTimeout)
+				{
+					if(Cast(pFences[i])->wait() == VK_SUCCESS) // At least one fence is signaled
+					{
+						return VK_SUCCESS;
+					}
+				}
+				else
+				{
+					if(Cast(pFences[i])->waitUntil(end_ns) == VK_SUCCESS) // At least one fence is signaled
+					{
+						return VK_SUCCESS;
+					}
+				}
+			}
+		}
+
+		return VK_TIMEOUT;
+	}
 }
 
-void Device::waitIdle()
+VkResult Device::waitIdle()
 {
 	for(uint32_t i = 0; i < queueCount; i++)
 	{
 		queues[i].waitIdle();
 	}
+
+	return VK_SUCCESS;
 }
 
 void Device::getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
diff --git a/src/Vulkan/VkDevice.hpp b/src/Vulkan/VkDevice.hpp
index 1da119d..2b129f1 100644
--- a/src/Vulkan/VkDevice.hpp
+++ b/src/Vulkan/VkDevice.hpp
@@ -44,8 +44,8 @@
 	static size_t ComputeRequiredAllocationSize(const CreateInfo* info);
 
 	VkQueue getQueue(uint32_t queueFamilyIndex, uint32_t queueIndex) const;
-	void waitForFences(uint32_t fenceCount, const VkFence* pFences, VkBool32 waitAll, uint64_t timeout);
-	void waitIdle();
+	VkResult waitForFences(uint32_t fenceCount, const VkFence* pFences, VkBool32 waitAll, uint64_t timeout);
+	VkResult waitIdle();
 	void getDescriptorSetLayoutSupport(const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
 	                                   VkDescriptorSetLayoutSupport* pSupport) const;
 	VkPhysicalDevice getPhysicalDevice() const { return physicalDevice; }
diff --git a/src/Vulkan/VkEvent.hpp b/src/Vulkan/VkEvent.hpp
index 4735564..d56da28 100644
--- a/src/Vulkan/VkEvent.hpp
+++ b/src/Vulkan/VkEvent.hpp
@@ -16,6 +16,8 @@
 #define VK_EVENT_HPP_
 
 #include "VkObject.hpp"
+#include <condition_variable>
+#include <mutex>
 
 namespace vk
 {
@@ -34,37 +36,38 @@
 		return 0;
 	}
 
-	bool signal()
+	void signal()
 	{
+		std::unique_lock<std::mutex> lock(mutex);
 		status = VK_EVENT_SET;
-		bool wasWaiting = waiting;
-		waiting = false;
-		return wasWaiting;
+		lock.unlock();
+		condition.notify_all();
 	}
 
 	void reset()
 	{
+		std::unique_lock<std::mutex> lock(mutex);
 		status = VK_EVENT_RESET;
 	}
 
-	VkResult getStatus() const
+	VkResult getStatus()
 	{
-		return status;
+		std::unique_lock<std::mutex> lock(mutex);
+		auto result = status;
+		lock.unlock();
+		return result;
 	}
 
-	bool wait()
+	void wait()
 	{
-		if(status != VK_EVENT_SET)
-		{
-			waiting = true;
-		}
-
-		return waiting;
+		std::unique_lock<std::mutex> lock(mutex);
+		condition.wait(lock, [this] { return status == VK_EVENT_SET; });
 	}
 
 private:
-	VkResult status = VK_EVENT_RESET;
-	bool waiting = false;
+	VkResult status = VK_EVENT_RESET; // guarded by mutex
+	std::mutex mutex;
+	std::condition_variable condition;
 };
 
 static inline Event* Cast(VkEvent object)
diff --git a/src/Vulkan/VkFence.hpp b/src/Vulkan/VkFence.hpp
index ba2e764..e86ac67 100644
--- a/src/Vulkan/VkFence.hpp
+++ b/src/Vulkan/VkFence.hpp
@@ -16,42 +16,94 @@
 #define VK_FENCE_HPP_
 
 #include "VkObject.hpp"
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
 
 namespace vk
 {
 
+using time_point = std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
+
 class Fence : public Object<Fence, VkFence>
 {
 public:
+	Fence() : status(VK_NOT_READY) {}
+
 	Fence(const VkFenceCreateInfo* pCreateInfo, void* mem) :
 		status((pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) ? VK_SUCCESS : VK_NOT_READY)
 	{
 	}
 
-	~Fence() = delete;
-
 	static size_t ComputeRequiredAllocationSize(const VkFenceCreateInfo* pCreateInfo)
 	{
 		return 0;
 	}
 
+	void add()
+	{
+		std::unique_lock<std::mutex> lock(mutex);
+		++count;
+	}
+
+	void done()
+	{
+		std::unique_lock<std::mutex> lock(mutex);
+		ASSERT(count > 0);
+		--count;
+		if(count == 0)
+		{
+			// signal the fence, without the unlock/lock required to call signal() here
+			status = VK_SUCCESS;
+			lock.unlock();
+			condition.notify_all();
+		}
+	}
+
 	void signal()
 	{
+		std::unique_lock<std::mutex> lock(mutex);
 		status = VK_SUCCESS;
+		lock.unlock();
+		condition.notify_all();
 	}
 
 	void reset()
 	{
+		std::unique_lock<std::mutex> lock(mutex);
+		ASSERT(count == 0);
 		status = VK_NOT_READY;
 	}
 
-	VkResult getStatus() const
+	VkResult getStatus()
 	{
-		return status;
+		std::unique_lock<std::mutex> lock(mutex);
+		auto out = status;
+		lock.unlock();
+		return out;
+	}
+
+	VkResult wait()
+	{
+		std::unique_lock<std::mutex> lock(mutex);
+		condition.wait(lock, [this] { return status == VK_SUCCESS; });
+		auto out = status;
+		lock.unlock();
+		return out;
+	}
+
+	VkResult waitUntil(const time_point& timeout_ns)
+	{
+		std::unique_lock<std::mutex> lock(mutex);
+		return condition.wait_until(lock, timeout_ns, [this] { return status == VK_SUCCESS; }) ?
+		       VK_SUCCESS : VK_TIMEOUT;
 	}
 
 private:
-	VkResult status = VK_NOT_READY;
+	VkResult status = VK_NOT_READY; // guarded by mutex
+	int32_t count = 0; // guarded by mutex
+	std::mutex mutex;
+	std::condition_variable condition;
 };
 
 static inline Fence* Cast(VkFence object)
diff --git a/src/Vulkan/VkQueue.cpp b/src/Vulkan/VkQueue.cpp
index 4239350..db545d2 100644
--- a/src/Vulkan/VkQueue.cpp
+++ b/src/Vulkan/VkQueue.cpp
@@ -16,30 +16,115 @@
 #include "VkFence.hpp"
 #include "VkQueue.hpp"
 #include "VkSemaphore.hpp"
-#include "Device/Renderer.hpp"
 #include "WSI/VkSwapchainKHR.hpp"
 
+#include <cstring>
+
+namespace
+{
+
+VkSubmitInfo* DeepCopySubmitInfo(uint32_t submitCount, const VkSubmitInfo* pSubmits)
+{
+	size_t submitSize = sizeof(VkSubmitInfo) * submitCount;
+	size_t totalSize = submitSize;
+	for(uint32_t i = 0; i < submitCount; i++)
+	{
+		totalSize += pSubmits[i].waitSemaphoreCount * sizeof(VkSemaphore);
+		totalSize += pSubmits[i].waitSemaphoreCount * sizeof(VkPipelineStageFlags);
+		totalSize += pSubmits[i].signalSemaphoreCount * sizeof(VkSemaphore);
+		totalSize += pSubmits[i].commandBufferCount * sizeof(VkCommandBuffer);
+	}
+
+	uint8_t* mem = static_cast<uint8_t*>(
+		vk::allocate(totalSize, vk::REQUIRED_MEMORY_ALIGNMENT, vk::DEVICE_MEMORY, vk::Fence::GetAllocationScope()));
+
+	auto submits = new (mem) VkSubmitInfo[submitCount];
+	memcpy(mem, pSubmits, submitSize);
+	mem += submitSize;
+
+	for(uint32_t i = 0; i < submitCount; i++)
+	{
+		size_t size = pSubmits[i].waitSemaphoreCount * sizeof(VkSemaphore);
+		submits[i].pWaitSemaphores = new (mem) VkSemaphore[pSubmits[i].waitSemaphoreCount];
+		memcpy(mem, pSubmits[i].pWaitSemaphores, size);
+		mem += size;
+
+		size = pSubmits[i].waitSemaphoreCount * sizeof(VkPipelineStageFlags);
+		submits[i].pWaitDstStageMask = new (mem) VkPipelineStageFlags[pSubmits[i].waitSemaphoreCount];
+		memcpy(mem, pSubmits[i].pWaitDstStageMask, size);
+		mem += size;
+
+		size = pSubmits[i].signalSemaphoreCount * sizeof(VkSemaphore);
+		submits[i].pSignalSemaphores = new (mem) VkSemaphore[pSubmits[i].signalSemaphoreCount];
+		memcpy(mem, pSubmits[i].pSignalSemaphores, size);
+		mem += size;
+
+		size = pSubmits[i].commandBufferCount * sizeof(VkCommandBuffer);
+		submits[i].pCommandBuffers = new (mem) VkCommandBuffer[pSubmits[i].commandBufferCount];
+		memcpy(mem, pSubmits[i].pCommandBuffers, size);
+		mem += size;
+	}
+
+	return submits;
+}
+
+} // anonymous namespace
+
 namespace vk
 {
 
-Queue::Queue(uint32_t pFamilyIndex, float pPriority) : familyIndex(pFamilyIndex), priority(pPriority)
+Queue::Queue()
 {
-	// FIXME (b/119409619): use an allocator here so we can control all memory allocations
-	context = new sw::Context();
-	renderer = new sw::Renderer(context, sw::OpenGL, true);
+	context.reset(new sw::Context());
+	renderer.reset(new sw::Renderer(context.get(), sw::OpenGL, true));
+
+	queueThread = std::thread(TaskLoop, this);
 }
 
 void Queue::destroy()
 {
-	delete context;
-	delete renderer;
+	Task task;
+	task.type = Task::KILL_THREAD;
+	pending.put(task);
+
+	queueThread.join();
+	ASSERT_MSG(pending.count() == 0, "queue has work after worker thread shutdown");
+
+	garbageCollect();
+
+	renderer.reset(nullptr);
+	context.reset(nullptr);
 }
 
-void Queue::submit(uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence)
+VkResult Queue::submit(uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence)
 {
-	for(uint32_t i = 0; i < submitCount; i++)
+	garbageCollect();
+
+	Task task;
+	task.submitCount = submitCount;
+	task.pSubmits = DeepCopySubmitInfo(submitCount, pSubmits);
+	task.fence = (fence != VK_NULL_HANDLE) ? vk::Cast(fence) : nullptr;
+
+	if(task.fence)
 	{
-		auto& submitInfo = pSubmits[i];
+		task.fence->add();
+	}
+
+	pending.put(task);
+
+	return VK_SUCCESS;
+}
+
+void Queue::TaskLoop(vk::Queue* queue)
+{
+	queue->taskLoop();
+}
+
+void Queue::submitQueue(const Task& task)
+{
+	for(uint32_t i = 0; i < task.submitCount; i++)
+	{
+		auto& submitInfo = task.pSubmits[i];
 		for(uint32_t j = 0; j < submitInfo.waitSemaphoreCount; j++)
 		{
 			vk::Cast(submitInfo.pWaitSemaphores[j])->wait(submitInfo.pWaitDstStageMask[j]);
@@ -47,7 +132,8 @@
 
 		{
 			CommandBuffer::ExecutionState executionState;
-			executionState.renderer = renderer;
+			executionState.renderer = renderer.get();
+			executionState.fence = task.fence;
 			for(uint32_t j = 0; j < submitInfo.commandBufferCount; j++)
 			{
 				vk::Cast(submitInfo.pCommandBuffers[j])->submit(executionState);
@@ -60,21 +146,66 @@
 		}
 	}
 
-	// FIXME (b/117835459): signal the fence only once the work is completed
-	if(fence != VK_NULL_HANDLE)
+	if (task.pSubmits)
 	{
-		vk::Cast(fence)->signal();
+		toDelete.put(task.pSubmits);
+	}
+
+	if(task.fence)
+	{
+		task.fence->done();
 	}
 }
 
-void Queue::waitIdle()
+void Queue::taskLoop()
 {
-	// equivalent to submitting a fence to a queue and waiting
-	// with an infinite timeout for that fence to signal
+	while(true)
+	{
+		Task task = pending.take();
 
-	// FIXME (b/117835459): implement once we have working fences
+		switch(task.type)
+		{
+		case Task::KILL_THREAD:
+			ASSERT_MSG(pending.count() == 0, "queue has remaining work!");
+			return;
+		case Task::SUBMIT_QUEUE:
+			submitQueue(task);
+			break;
+		default:
+			UNIMPLEMENTED("task.type %d", static_cast<int>(task.type));
+			break;
+		}
+	}
+}
 
+VkResult Queue::waitIdle()
+{
+	// Wait for task queue to flush.
+	vk::Fence fence;
+	fence.add();
+
+	Task task;
+	task.fence = &fence;
+	pending.put(task);
+
+	fence.wait();
+
+	// Wait for all draw operations to complete, if any
 	renderer->synchronize();
+
+	garbageCollect();
+
+	return VK_SUCCESS;
+}
+
+void Queue::garbageCollect()
+{
+	while (true)
+	{
+		auto v = toDelete.tryTake();
+		if (!v.second) { break; }
+		vk::deallocate(v.first, DEVICE_MEMORY);
+	}
 }
 
 #ifndef __ANDROID__
diff --git a/src/Vulkan/VkQueue.hpp b/src/Vulkan/VkQueue.hpp
index 95ff39c..f9664b7 100644
--- a/src/Vulkan/VkQueue.hpp
+++ b/src/Vulkan/VkQueue.hpp
@@ -16,6 +16,9 @@
 #define VK_QUEUE_HPP_
 
 #include "VkObject.hpp"
+#include "Device/Renderer.hpp"
+#include <queue>
+#include <thread>
 #include <vulkan/vk_icd.h>
 
 namespace sw
@@ -27,12 +30,97 @@
 namespace vk
 {
 
+// Chan is a thread-safe FIFO queue of type T.
+// Chan takes its name after Golang's chan.
+template <typename T>
+class Chan
+{
+public:
+	Chan();
+
+	// take returns the next item in the chan, blocking until an item is
+	// available.
+	T take();
+
+	// tryTake returns a <T, bool> pair.
+	// If the chan is not empty, then the next item and true are returned.
+	// If the chan is empty, then a default-initialized T and false are returned.
+	std::pair<T, bool> tryTake();
+
+	// put places an item into the chan, blocking if the chan is bounded and
+	// full.
+	void put(const T &v);
+
+	// Returns the number of items in the chan.
+	// Note: that this may change as soon as the function returns, so should
+	// only be used for debugging.
+	size_t count();
+
+private:
+	std::queue<T> queue;
+	std::mutex mutex;
+	std::condition_variable added;
+	std::condition_variable removed;
+};
+
+template <typename T>
+Chan<T>::Chan() {}
+
+template <typename T>
+T Chan<T>::take()
+{
+	std::unique_lock<std::mutex> lock(mutex);
+	if (queue.size() == 0)
+	{
+		// Chan empty. Wait for item to be added.
+		added.wait(lock, [this] { return queue.size() > 0; });
+	}
+	T out = queue.front();
+	queue.pop();
+	lock.unlock();
+	removed.notify_one();
+	return out;
+}
+
+template <typename T>
+std::pair<T, bool> Chan<T>::tryTake()
+{
+	std::unique_lock<std::mutex> lock(mutex);
+	if (queue.size() == 0)
+	{
+		return std::make_pair(T{}, false);
+	}
+	T out = queue.front();
+	queue.pop();
+	lock.unlock();
+	removed.notify_one();
+	return std::make_pair(out, true);
+}
+
+template <typename T>
+void Chan<T>::put(const T &item)
+{
+	std::unique_lock<std::mutex> lock(mutex);
+	queue.push(item);
+	lock.unlock();
+	added.notify_one();
+}
+
+template <typename T>
+size_t Chan<T>::count()
+{
+	std::unique_lock<std::mutex> lock(mutex);
+	auto out = queue.size();
+	lock.unlock();
+	return out;
+}
+
 class Queue
 {
 	VK_LOADER_DATA loaderData = { ICD_LOADER_MAGIC };
 
 public:
-	Queue(uint32_t pFamilyIndex, float pPriority);
+	Queue();
 	~Queue() = delete;
 
 	operator VkQueue()
@@ -41,17 +129,33 @@
 	}
 
 	void destroy();
-	void submit(uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence);
-	void waitIdle();
+	VkResult submit(uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence);
+	VkResult waitIdle();
 #ifndef __ANDROID__
 	void present(const VkPresentInfoKHR* presentInfo);
 #endif
 
 private:
-	sw::Context* context = nullptr;
-	sw::Renderer* renderer = nullptr;
-	uint32_t familyIndex = 0;
-	float    priority = 0.0f;
+	struct Task
+	{
+		uint32_t submitCount = 0;
+		VkSubmitInfo* pSubmits = nullptr;
+		Fence* fence = nullptr;
+
+		enum Type { KILL_THREAD, SUBMIT_QUEUE };
+		Type type = SUBMIT_QUEUE;
+	};
+
+	static void TaskLoop(vk::Queue* queue);
+	void taskLoop();
+	void garbageCollect();
+	void submitQueue(const Task& task);
+
+	std::unique_ptr<sw::Context> context;
+	std::unique_ptr<sw::Renderer> renderer;
+	Chan<Task> pending;
+	Chan<VkSubmitInfo*> toDelete;
+	std::thread queueThread;
 };
 
 static inline Queue* Cast(VkQueue object)
diff --git a/src/Vulkan/libVulkan.cpp b/src/Vulkan/libVulkan.cpp
index d28188e..e6116f6 100644
--- a/src/Vulkan/libVulkan.cpp
+++ b/src/Vulkan/libVulkan.cpp
@@ -522,27 +522,21 @@
 	TRACE("(VkQueue queue = %p, uint32_t submitCount = %d, const VkSubmitInfo* pSubmits = %p, VkFence fence = %p)",
 	      queue, submitCount, pSubmits, fence);
 
-	vk::Cast(queue)->submit(submitCount, pSubmits, fence);
-
-	return VK_SUCCESS;
+	return vk::Cast(queue)->submit(submitCount, pSubmits, fence);
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL vkQueueWaitIdle(VkQueue queue)
 {
 	TRACE("(VkQueue queue = %p)", queue);
 
-	vk::Cast(queue)->waitIdle();
-
-	return VK_SUCCESS;
+	return vk::Cast(queue)->waitIdle();
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL vkDeviceWaitIdle(VkDevice device)
 {
 	TRACE("(VkDevice device = %p)", device);
 
-	vk::Cast(device)->waitIdle();
-
-	return VK_SUCCESS;
+	return vk::Cast(device)->waitIdle();
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL vkAllocateMemory(VkDevice device, const VkMemoryAllocateInfo* pAllocateInfo, const VkAllocationCallbacks* pAllocator, VkDeviceMemory* pMemory)
@@ -725,7 +719,6 @@
 	TRACE("(VkDevice device = %p, VkFence fence = %p, const VkAllocationCallbacks* pAllocator = %p)",
 		    device, fence, pAllocator);
 
-
 	vk::destroy(fence, pAllocator);
 }
 
@@ -754,9 +747,7 @@
 	TRACE("(VkDevice device = %p, uint32_t fenceCount = %d, const VkFence* pFences = %p, VkBool32 waitAll = %d, uint64_t timeout = %d)",
 		device, int(fenceCount), pFences, int(waitAll), int(timeout));
 
-	vk::Cast(device)->waitForFences(fenceCount, pFences, waitAll, timeout);
-
-	return VK_SUCCESS;
+	return vk::Cast(device)->waitForFences(fenceCount, pFences, waitAll, timeout);
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL vkCreateSemaphore(VkDevice device, const VkSemaphoreCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSemaphore* pSemaphore)