Query implementation

Re-enabled the Renderer object's occlusion queries and linked them
to the queries in the query pool. Removed the old Query structure.

Passes all tests in:
Tests: dEQP-VK.query_pool.*

Note: The dEQP-VK.query_pool.*_discard tests currently fail because discard
      appears to disable the occlusion queries. This will be fixed in the next CL.

Bug b/129706526

Change-Id: I937dcf64d2990758d31a1ed6a13af5cf9f0a627b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/28288
Tested-by: Alexis Hétu <sugoi@google.com>
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 4311ebb..983faae 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -30,6 +30,7 @@
 #include "Vulkan/VkConfig.h"
 #include "Vulkan/VkDebug.hpp"
 #include "Vulkan/VkImageView.hpp"
+#include "Vulkan/VkQueryPool.hpp"
 #include "Pipeline/SpirvShader.hpp"
 #include "Vertex.hpp"
 
@@ -317,6 +318,16 @@
 			return;
 		}
 
+		context->occlusionEnabled = false;
+		for(auto query : queries)
+		{
+			if(query->type == VK_QUERY_TYPE_OCCLUSION)
+			{
+				context->occlusionEnabled = true;
+				break;
+			}
+		}
+
 		sync->lock(sw::PRIVATE);
 
 		if(update || oldMultiSampleMask != context->multiSampleMask)
@@ -373,7 +384,7 @@
 
 		if(queries.size() != 0)
 		{
-			draw->queries = new std::list<Query*>();
+			draw->queries = new std::list<vk::Query*>();
 			for(auto &query : queries)
 			{
 				++query->reference; // Atomic
@@ -850,22 +861,30 @@
 				{
 					for(auto &query : *(draw.queries))
 					{
+						std::unique_lock<std::mutex> mutexLock(query->mutex);
+
 						switch(query->type)
 						{
-						case Query::FRAGMENTS_PASSED:
+						case VK_QUERY_TYPE_OCCLUSION:
 							for(int cluster = 0; cluster < clusterCount; cluster++)
 							{
 								query->data += data.occlusion[cluster];
 							}
 							break;
-						case Query::TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN:
-							query->data += processedPrimitives;
-							break;
 						default:
 							break;
 						}
 
-						--query->reference; // Atomic
+						int queryRef = --query->reference; // Atomic
+						if(queryRef == 0)
+						{
+							query->state = vk::Query::FINISHED;
+						}
+
+						// Manual unlocking is done before notifying, to avoid
+						// waking up the waiting thread only to block again
+						mutexLock.unlock();
+						query->condition.notify_one();
 					}
 
 					delete draw.queries;
@@ -1408,12 +1427,12 @@
 		context->vertexShader = shader;
 	}
 
-	void Renderer::addQuery(Query *query)
+	void Renderer::addQuery(vk::Query *query)
 	{
 		queries.push_back(query);
 	}
 
-	void Renderer::removeQuery(Query *query)
+	void Renderer::removeQuery(vk::Query *query)
 	{
 		queries.remove(query);
 	}
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index 2f1fa61..4a594ad 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -30,6 +30,7 @@
 namespace vk
 {
 	class DescriptorSet;
+	struct Query;
 }
 
 namespace sw
@@ -85,32 +86,6 @@
 		false,   // colorsDefaultToZero
 	};
 
-	struct Query
-	{
-		enum Type { FRAGMENTS_PASSED, TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN };
-
-		Query(Type type) : building(false), reference(0), data(0), type(type)
-		{
-		}
-
-		void begin()
-		{
-			building = true;
-			data = 0;
-		}
-
-		void end()
-		{
-			building = false;
-		}
-
-		bool building;
-		AtomicInt reference;
-		AtomicInt data;
-
-		const Type type;
-	};
-
 	struct DrawData
 	{
 		const Constants *constants;
@@ -249,8 +224,8 @@
 		void setViewport(const VkViewport &viewport);
 		void setScissor(const VkRect2D &scissor);
 
-		void addQuery(Query *query);
-		void removeQuery(Query *query);
+		void addQuery(vk::Query *query);
+		void removeQuery(vk::Query *query);
 
 		void advanceInstanceAttributes();
 
@@ -343,7 +318,7 @@
 
 		SwiftConfig *swiftConfig;
 
-		std::list<Query*> queries;
+		std::list<vk::Query*> queries;
 		Resource *sync;
 
 		VertexProcessor::State vertexState;
@@ -380,7 +355,7 @@
 		vk::ImageView *depthBuffer;
 		vk::ImageView *stencilBuffer;
 
-		std::list<Query*> *queries;
+		std::list<vk::Query*> *queries;
 
 		AtomicInt primitive;    // Current primitive to enter pipeline
 		AtomicInt count;        // Number of primitives to render
diff --git a/src/Vulkan/VkBuffer.hpp b/src/Vulkan/VkBuffer.hpp
index 1338854..bbd0fdc 100644
--- a/src/Vulkan/VkBuffer.hpp
+++ b/src/Vulkan/VkBuffer.hpp
@@ -37,6 +37,7 @@
 	void fill(VkDeviceSize dstOffset, VkDeviceSize fillSize, uint32_t data);
 	void update(VkDeviceSize dstOffset, VkDeviceSize dataSize, const void* pData);
 	void* getOffsetPointer(VkDeviceSize offset) const;
+	inline VkDeviceSize getSize() const { return size; } // Accessor for the 'size' member (presumably the buffer size in bytes — TODO confirm)
 	uint8_t* end() const;
 
 	// DataOffset is the offset in bytes from the Buffer to the pointer to the
diff --git a/src/Vulkan/VkCommandBuffer.cpp b/src/Vulkan/VkCommandBuffer.cpp
index 75a76cd..bd3b6f5 100644
--- a/src/Vulkan/VkCommandBuffer.cpp
+++ b/src/Vulkan/VkCommandBuffer.cpp
@@ -20,6 +20,7 @@
 #include "VkImageView.hpp"
 #include "VkPipeline.hpp"
 #include "VkPipelineLayout.hpp"
+#include "VkQueryPool.hpp"
 #include "VkRenderPass.hpp"
 #include "Device/Renderer.hpp"
 
@@ -738,6 +739,104 @@
 	unsigned char data[MAX_PUSH_CONSTANT_SIZE];
 };
 
+struct BeginQuery : public CommandBuffer::Command // Command added by CommandBuffer::beginQuery() via addCommand<BeginQuery>()
+{
+	BeginQuery(VkQueryPool queryPool, uint32_t query, VkQueryControlFlags flags) // Only stores its arguments; the work happens in play()
+		: queryPool(queryPool), query(query), flags(flags)
+	{
+	}
+
+	void play(CommandBuffer::ExecutionState& executionState) // Registers the query with the renderer, then begins it in its pool
+	{
+		executionState.renderer->addQuery(Cast(queryPool)->getQuery(query)); // Appends the vk::Query* to the renderer's 'queries' list
+		Cast(queryPool)->begin(query, flags); // Zeroes the query's data and marks it ACTIVE
+	}
+
+private:
+	VkQueryPool queryPool; // Pool handle, resolved with Cast() when played
+	uint32_t query; // Index of the query within the pool
+	VkQueryControlFlags flags; // Forwarded to QueryPool::begin(), which reports non-zero values as UNIMPLEMENTED
+};
+
+struct EndQuery : public CommandBuffer::Command // Command added by CommandBuffer::endQuery() via addCommand<EndQuery>()
+{
+	EndQuery(VkQueryPool queryPool, uint32_t query) // Only stores its arguments; the work happens in play()
+		: queryPool(queryPool), query(query)
+	{
+	}
+
+	void play(CommandBuffer::ExecutionState& executionState) // Unregisters the query from the renderer, then ends it in its pool
+	{
+		executionState.renderer->removeQuery(Cast(queryPool)->getQuery(query)); // Removes the vk::Query* from the renderer's 'queries' list
+		Cast(queryPool)->end(query); // Drops the reference set by begin(); the query is marked FINISHED once the count reaches zero
+	}
+
+private:
+	VkQueryPool queryPool; // Pool handle, resolved with Cast() when played
+	uint32_t query; // Index of the query within the pool
+};
+
+struct ResetQueryPool : public CommandBuffer::Command // Command added by CommandBuffer::resetQueryPool() via addCommand<ResetQueryPool>()
+{
+	ResetQueryPool(VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount) // Only stores its arguments; the work happens in play()
+		: queryPool(queryPool), firstQuery(firstQuery), queryCount(queryCount)
+	{
+	}
+
+	void play(CommandBuffer::ExecutionState& executionState) // executionState is not used by this command
+	{
+		Cast(queryPool)->reset(firstQuery, queryCount); // Sets queries [firstQuery, firstQuery + queryCount) to UNAVAILABLE with data 0
+	}
+
+private:
+	VkQueryPool queryPool; // Pool handle, resolved with Cast() when played
+	uint32_t firstQuery; // Index of the first query to reset
+	uint32_t queryCount; // Number of consecutive queries to reset
+};
+
+struct WriteTimeStamp : public CommandBuffer::Command // Command added by CommandBuffer::writeTimestamp(), which does not forward its pipelineStage argument
+{
+	WriteTimeStamp(VkQueryPool queryPool, uint32_t query) // Only stores its arguments; the work happens in play()
+		: queryPool(queryPool), query(query)
+	{
+	}
+
+	void play(CommandBuffer::ExecutionState& executionState) // executionState is not used by this command
+	{
+		Cast(queryPool)->writeTimestamp(query); // Stores the current system_clock time, in nanoseconds, as the query's data
+	}
+
+private:
+	VkQueryPool queryPool; // Pool handle, resolved with Cast() when played
+	uint32_t query; // Index of the query within the pool
+};
+
+struct CopyQueryPoolResults : public CommandBuffer::Command // Command added by CommandBuffer::copyQueryPoolResults() via addCommand<CopyQueryPoolResults>()
+{
+	CopyQueryPoolResults(VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount, // Only stores its arguments; the work happens in play()
+		VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags)
+		: queryPool(queryPool), firstQuery(firstQuery), queryCount(queryCount), dstBuffer(dstBuffer),
+		  dstOffset(dstOffset), stride(stride), flags(flags)
+	{
+	}
+
+	void play(CommandBuffer::ExecutionState& executionState) // Writes the query results into dstBuffer starting at dstOffset
+	{
+		vk::Buffer* buffer = Cast(dstBuffer);
+		Cast(queryPool)->getResults(firstQuery, queryCount, buffer->getSize() - dstOffset, // dataSize argument: what remains of the buffer after dstOffset
+		                            buffer->getOffsetPointer(dstOffset), stride, flags); // NOTE(review): the VkResult returned by getResults() is discarded here
+	}
+
+private:
+	VkQueryPool queryPool; // Pool handle, resolved with Cast() when played
+	uint32_t firstQuery; // Index of the first query to copy
+	uint32_t queryCount; // Number of consecutive queries to copy
+	VkBuffer dstBuffer; // Destination buffer handle, resolved with Cast() when played
+	VkDeviceSize dstOffset; // Offset into dstBuffer where the first result is written
+	VkDeviceSize stride; // Distance in bytes between consecutive query results
+	VkQueryResultFlags flags; // Forwarded unchanged to QueryPool::getResults()
+};
+
 CommandBuffer::CommandBuffer(VkCommandBufferLevel pLevel) : level(pLevel)
 {
 	// FIXME (b/119409619): replace this vector by an allocator so we can control all memory allocations
@@ -877,28 +976,28 @@
 
 void CommandBuffer::beginQuery(VkQueryPool queryPool, uint32_t query, VkQueryControlFlags flags)
 {
-	UNIMPLEMENTED("beginQuery");
+	addCommand<BeginQuery>(queryPool, query, flags);
 }
 
 void CommandBuffer::endQuery(VkQueryPool queryPool, uint32_t query)
 {
-	UNIMPLEMENTED("endQuery");
+	addCommand<EndQuery>(queryPool, query);
 }
 
 void CommandBuffer::resetQueryPool(VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount)
 {
-	UNIMPLEMENTED("resetQueryPool");
+	addCommand<ResetQueryPool>(queryPool, firstQuery, queryCount);
 }
 
 void CommandBuffer::writeTimestamp(VkPipelineStageFlagBits pipelineStage, VkQueryPool queryPool, uint32_t query)
 {
-	UNIMPLEMENTED("writeTimestamp");
+	addCommand<WriteTimeStamp>(queryPool, query);
 }
 
 void CommandBuffer::copyQueryPoolResults(VkQueryPool queryPool, uint32_t firstQuery, uint32_t queryCount,
 	VkBuffer dstBuffer, VkDeviceSize dstOffset, VkDeviceSize stride, VkQueryResultFlags flags)
 {
-	UNIMPLEMENTED("copyQueryPoolResults");
+	addCommand<CopyQueryPoolResults>(queryPool, firstQuery, queryCount, dstBuffer, dstOffset, stride, flags);
 }
 
 void CommandBuffer::pushConstants(VkPipelineLayout layout, VkShaderStageFlags stageFlags,
diff --git a/src/Vulkan/VkQueryPool.cpp b/src/Vulkan/VkQueryPool.cpp
index da4627a..864ddb2 100644
--- a/src/Vulkan/VkQueryPool.cpp
+++ b/src/Vulkan/VkQueryPool.cpp
@@ -13,11 +13,17 @@
 // limitations under the License.
 
 #include "VkQueryPool.hpp"
+#include "Common/Thread.hpp"
+
+#include <chrono>
+#include <cstring>
+#include <new>
 
 namespace vk
 {
 	QueryPool::QueryPool(const VkQueryPoolCreateInfo* pCreateInfo, void* mem) :
-		queryCount(pCreateInfo->queryCount)
+		pool(reinterpret_cast<Query*>(mem)), type(pCreateInfo->queryType),
+		count(pCreateInfo->queryCount)
 	{
 		// According to the Vulkan spec, section 34.1. Features:
 		// "pipelineStatisticsQuery specifies whether the pipeline statistics
@@ -25,30 +31,147 @@
 		//  type VK_QUERY_TYPE_PIPELINE_STATISTICS cannot be created, and
 		//  none of the VkQueryPipelineStatisticFlagBits bits can be set in the
 		//  pipelineStatistics member of the VkQueryPoolCreateInfo structure."
-		if(pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
+		if(type == VK_QUERY_TYPE_PIPELINE_STATISTICS)
 		{
 			UNIMPLEMENTED("pCreateInfo->queryType");
 		}
+
+		// Construct all queries
+		for(uint32_t i = 0; i < count; i++)
+		{
+			new (&pool[i]) Query();
+		}
+	}
+
+	void QueryPool::destroy(const VkAllocationCallbacks* pAllocator) // Releases the memory backing the Query array
+	{
+		vk::deallocate(pool, pAllocator); // NOTE(review): the Query objects placement-constructed in the constructor are not destroyed before this — confirm that is intended
 	}
 
 	size_t QueryPool::ComputeRequiredAllocationSize(const VkQueryPoolCreateInfo* pCreateInfo)
 	{
-		return 0;
+		return sizeof(Query) * pCreateInfo->queryCount;
 	}
 
-	void QueryPool::getResults(uint32_t pFirstQuery, uint32_t pQueryCount, size_t pDataSize,
-	                           void* pData, VkDeviceSize pStride, VkQueryResultFlags pFlags) const
+	VkResult QueryPool::getResults(uint32_t firstQuery, uint32_t queryCount, size_t dataSize,
+	                               void* pData, VkDeviceSize stride, VkQueryResultFlags flags) const // Returns VK_NOT_READY if any requested query is still ACTIVE, VK_SUCCESS otherwise
 	{
 		// dataSize must be large enough to contain the result of each query
-		ASSERT(static_cast<size_t>(pStride * pQueryCount) <= pDataSize);
+		ASSERT(static_cast<size_t>(stride * queryCount) <= dataSize);
 
 		// The sum of firstQuery and queryCount must be less than or equal to the number of queries
-		ASSERT((pFirstQuery + pQueryCount) <= queryCount);
-
-		char* data = static_cast<char*>(pData);
-		for(uint32_t i = 0; i < pQueryCount; i++, data += pStride)
+		ASSERT((firstQuery + queryCount) <= count);
+		
+		VkResult result = VK_SUCCESS; // Changed to VK_NOT_READY below when a query is still ACTIVE
+		uint8_t* data = static_cast<uint8_t*>(pData); // Write cursor, advanced by 'stride' bytes per query
+		for(uint32_t i = firstQuery; i < (firstQuery + queryCount); i++, data += stride)
 		{
-			UNIMPLEMENTED("queries");
+			// If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are both not set
+			// then no result values are written to pData for queries that are in the
+			// unavailable state at the time of the call, and vkGetQueryPoolResults returns
+			// VK_NOT_READY. However, availability state is still written to pData for those
+			// queries if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
+			auto &query = pool[i];
+			std::unique_lock<std::mutex> mutexLock(query.mutex); // Held for the rest of this iteration; also used by the wait below
+			if(flags & VK_QUERY_RESULT_WAIT_BIT) // Must wait for query to finish
+			{
+				query.condition.wait(mutexLock, [&query] { return query.state != Query::ACTIVE; }); // Blocks until the query leaves the ACTIVE state
+			}
+
+			bool writeResult = true;
+			if(pool[i].state == Query::ACTIVE) // NOTE(review): only ACTIVE counts as "not ready"; UNAVAILABLE queries still get a result written — confirm
+			{
+				result = VK_NOT_READY;
+				writeResult = (flags & VK_QUERY_RESULT_PARTIAL_BIT); // Allow writing partial results
+			}
+
+			if(flags & VK_QUERY_RESULT_64_BIT)
+			{
+				uint64_t* result64 = reinterpret_cast<uint64_t*>(data); // Element 0 holds the result, element 1 the availability value
+				if(writeResult)
+				{
+					result64[0] = pool[i].data;
+				}
+				if(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) // Output query availability
+				{
+					result64[1] = pool[i].state; // NOTE(review): writes the raw State enum value, so ACTIVE (1) is non-zero — confirm this is the intended availability encoding
+				}
+			}
+			else
+			{
+				uint32_t* result32 = reinterpret_cast<uint32_t*>(data); // Element 0 holds the result, element 1 the availability value
+				if(writeResult)
+				{
+					result32[0] = static_cast<uint32_t>(pool[i].data); // The 64-bit data value is narrowed to 32 bits
+				}
+				if(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) // Output query availability
+				{
+					result32[1] = pool[i].state; // NOTE(review): writes the raw State enum value, so ACTIVE (1) is non-zero — confirm this is the intended availability encoding
+				}
+			}
 		}
+
+		return result;
+	}
+
+	void QueryPool::begin(uint32_t query, VkQueryControlFlags flags) // Puts query 'query' into the ACTIVE state with a zeroed result
+	{
+		ASSERT(query < count);
+
+		if(flags != 0) // No query control flags are handled yet
+		{
+			UNIMPLEMENTED("flags");
+		}
+
+		ASSERT(pool[query].state == Query::UNAVAILABLE); // The query must be in the state that reset() leaves it in
+		pool[query].state = Query::ACTIVE; // NOTE(review): written without holding pool[query].mutex, unlike end() and reset()
+		pool[query].data = 0;
+		pool[query].reference = 1; // This reference is released by end()
+		pool[query].type = type; // Copies the pool's query type into the query; the renderer switches on query->type
+	}
+
+	void QueryPool::end(uint32_t query) // Releases the reference set by begin() and marks the query FINISHED if none remain
+	{
+		ASSERT(query < count);
+
+		#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
+		{ // Scoped so the lock taken for the state check is released before the decrement below
+			std::unique_lock<std::mutex> mutexLock(pool[query].mutex);
+			ASSERT(pool[query].state == Query::ACTIVE);
+		}
+		#endif
+
+		int ref = --pool[query].reference; // Atomic decrement (reference is std::atomic<int>)
+		if(ref == 0) // Nothing else holds a reference (the renderer takes one per draw and drops it when the draw finishes)
+		{
+			std::unique_lock<std::mutex> mutexLock(pool[query].mutex);
+			pool[query].state = Query::FINISHED; // NOTE(review): no condition.notify_*() here, unlike the renderer's completion path — confirm a waiter in getResults() cannot be missed
+		}
+	}
+
+	void QueryPool::reset(uint32_t firstQuery, uint32_t queryCount) // Returns queries [firstQuery, firstQuery + queryCount) to UNAVAILABLE with a zero result
+	{
+		// The sum of firstQuery and queryCount must be less than or equal to the number of queries
+		ASSERT((firstQuery + queryCount) <= count);
+
+		for(uint32_t i = firstQuery; i < (firstQuery + queryCount); i++)
+		{
+			std::unique_lock<std::mutex> mutexLock(pool[i].mutex); // state and data are guarded by the per-query mutex
+
+			ASSERT(pool[i].state != Query::ACTIVE); // Resetting a query that is still active is treated as a usage error
+
+			pool[i].state = Query::UNAVAILABLE;
+			pool[i].data = 0;
+		}
+	}
+
+	void QueryPool::writeTimestamp(uint32_t query) // Stores the current time as the data of timestamp query 'query'
+	{
+		ASSERT(query < count);
+		ASSERT(type == VK_QUERY_TYPE_TIMESTAMP);
+
+		std::unique_lock<std::mutex> mutexLock(pool[query].mutex);
+		pool[query].data = std::chrono::time_point_cast<std::chrono::nanoseconds>( // Nanoseconds since the system_clock epoch
+			std::chrono::system_clock::now()).time_since_epoch().count(); // NOTE(review): the query's state is left unchanged here — confirm how getResults() should report a written timestamp
 	}
 } // namespace vk
diff --git a/src/Vulkan/VkQueryPool.hpp b/src/Vulkan/VkQueryPool.hpp
index 9f8072e..06161a9 100644
--- a/src/Vulkan/VkQueryPool.hpp
+++ b/src/Vulkan/VkQueryPool.hpp
@@ -16,23 +16,53 @@
 #define VK_QUERY_POOL_HPP_
 
 #include "VkObject.hpp"
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
 
 namespace vk
 {
 
+struct Query
+{
+	enum State
+	{
+		UNAVAILABLE,
+		ACTIVE,
+		FINISHED
+	};
+
+	std::mutex mutex;
+	std::condition_variable condition;
+	State state;  // guarded by mutex
+	int64_t data; // guarded by mutex
+	std::atomic<int> reference;
+	VkQueryType type;
+};
+
 class QueryPool : public Object<QueryPool, VkQueryPool>
 {
 public:
 	QueryPool(const VkQueryPoolCreateInfo* pCreateInfo, void* mem);
 	~QueryPool() = delete;
+	void destroy(const VkAllocationCallbacks* pAllocator);
 
 	static size_t ComputeRequiredAllocationSize(const VkQueryPoolCreateInfo* pCreateInfo);
 
-	void getResults(uint32_t pFirstQuery, uint32_t pQueryCount, size_t pDataSize,
-		            void* pData, VkDeviceSize pStride, VkQueryResultFlags pFlags) const;
+	VkResult getResults(uint32_t firstQuery, uint32_t queryCount, size_t dataSize,
+		                void* pData, VkDeviceSize stride, VkQueryResultFlags flags) const;
+	void begin(uint32_t query, VkQueryControlFlags flags);
+	void end(uint32_t query);
+	void reset(uint32_t firstQuery, uint32_t queryCount);
+	
+	void writeTimestamp(uint32_t query);
+
+	inline Query* getQuery(uint32_t query) const { return &(pool[query]); }
 
 private:
-	uint32_t queryCount;
+	Query* pool;
+	VkQueryType type;
+	uint32_t count;
 };
 
 static inline QueryPool* Cast(VkQueryPool object)
diff --git a/src/Vulkan/VkQueue.cpp b/src/Vulkan/VkQueue.cpp
index f6e24aa..1b3ec53 100644
--- a/src/Vulkan/VkQueue.cpp
+++ b/src/Vulkan/VkQueue.cpp
@@ -72,6 +72,8 @@
 	// with an infinite timeout for that fence to signal
 
 	// FIXME (b/117835459): implement once we have working fences
+
+	renderer->synchronize();
 }
 
 #ifndef __ANDROID__
diff --git a/src/Vulkan/libVulkan.cpp b/src/Vulkan/libVulkan.cpp
index c5b9939..d71db4d 100644
--- a/src/Vulkan/libVulkan.cpp
+++ b/src/Vulkan/libVulkan.cpp
@@ -845,9 +845,7 @@
 	TRACE("(VkDevice device = 0x%X, VkQueryPool queryPool = 0x%X, uint32_t firstQuery = %d, uint32_t queryCount = %d, size_t dataSize = %d, void* pData = 0x%X, VkDeviceSize stride = 0x%X, VkQueryResultFlags flags = %d)",
 	      device, queryPool, firstQuery, queryCount, dataSize, pData, stride, flags);
 
-	vk::Cast(queryPool)->getResults(firstQuery, queryCount, dataSize, pData, stride, flags);
-
-	return VK_SUCCESS;
+	return vk::Cast(queryPool)->getResults(firstQuery, queryCount, dataSize, pData, stride, flags);
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL vkCreateBuffer(VkDevice device, const VkBufferCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkBuffer* pBuffer)