Faster image sampler fetch from key. This CL trades a one-time conversion of the LRUCache to a constant unordered map for time saved on a costly per-pixel imageSampler fetch operation. When the renderer is idle, the device copies the LRUCache to an unordered map, which has faster fetch times. This const cache is never modified during a rendering operation, so it can be queried without a mutex. The copy happens only if the LRUCache was modified since the last copy occurred, so if all sampling variations happened on the first frame, all subsequent frames can render much faster. On macOS, the Glass demo goes from 2.6 FPS to 20 FPS. Bug: b/129523279, b/137649247 Change-Id: I195ca8b2ead59eb5cc9e75e8b0dc5119c794d717 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/34348 Presubmit-Ready: Alexis Hétu <sugoi@google.com> Kokoro-Presubmit: kokoro <noreply+kokoro@google.com> Tested-by: Alexis Hétu <sugoi@google.com> Reviewed-by: Ben Clayton <bclayton@google.com>

commit: 3575550a711be52a32201e3ddcf605ac61aa56f0 [log] [tgz]
author: Alexis Hetu <sugoi@google.com> Mon Jul 22 13:51:49 2019 -0400
committer: Alexis Hétu <sugoi@google.com> Fri Jul 26 14:30:24 2019 +0000
tree: 83baf1c0de5ff3a2b89dd1a8cb3e2a789fdf195b
parent: ec3039253ab805b040ac3a5a42eeec7e84d0377b [diff]
diff --git a/src/Device/LRUCache.hpp b/src/Device/LRUCache.hpp
index 180b5b9..1e3b2d9 100644
--- a/src/Device/LRUCache.hpp
+++ b/src/Device/LRUCache.hpp

@@ -19,6 +19,7 @@
 
 #include <cstring>
 #include <type_traits>
+#include <unordered_map>
 
 namespace sw
 {
@@ -28,15 +29,15 @@
 	public:
 		LRUCache(int n);
 
-		~LRUCache();
+		virtual ~LRUCache();
 
 		Data *query(const Key &key) const;
-		Data *add(const Key &key, Data *data);
+		virtual Data *add(const Key &key, Data *data);
 
 		int getSize() {return size;}
 		Key &getKey(int i) {return key[i];}
 
-	private:
+	protected:
 		int size;
 		int mask;
 		int top;
@@ -47,6 +48,29 @@
 		Data **data;
 	};
 
+	template<class Key, class Data>
+	class LRUConstCache : public LRUCache<Key, Data>
+	{
+		using LRUBase = LRUCache<Key, Data>;
+	public:
+		LRUConstCache(int n) : LRUBase(n) {}
+		~LRUConstCache() { clearConstCache(); }
+
+		Data *add(const Key &key, Data *data) override
+		{
+			constCacheNeedsUpdate = true;
+			return LRUBase::add(key, data);
+		}
+
+		void updateConstCache();
+		Data *queryConstCache(const Key &key) const;
+
+	private:
+		void clearConstCache();
+		bool constCacheNeedsUpdate = false;
+		std::unordered_map<Key, Data*> constCache;
+	};
+
 	// Helper class for clearing the memory of objects at construction.
 	// Useful as the first base class of cache keys which may contain padding bytes or bits otherwise left uninitialized.
 	template<class T>
@@ -183,6 +207,45 @@
 
 		return data;
 	}
+
+	template<class Key, class Data>
+	void LRUConstCache<Key, Data>::clearConstCache()
+	{
+		auto it = constCache.begin();
+		auto itEnd = constCache.end();
+		for(; it != itEnd; ++it)
+		{
+			it->second->unbind();
+		}
+		constCache.clear();
+	}
+
+	template<class Key, class Data>
+	void LRUConstCache<Key, Data>::updateConstCache()
+	{
+		if(constCacheNeedsUpdate)
+		{
+			clearConstCache();
+
+			for(int i = 0; i < LRUBase::size; i++)
+			{
+				if(LRUBase::data[i])
+				{
+					LRUBase::data[i]->bind();
+					constCache[*LRUBase::ref[i]] = LRUBase::data[i];
+				}
+			}
+
+			constCacheNeedsUpdate = false;
+		}
+	}
+
+	template<class Key, class Data>
+	Data *LRUConstCache<Key, Data>::queryConstCache(const Key &key) const
+	{
+		auto it = constCache.find(key);
+		return (it != constCache.end()) ? it->second : nullptr;
+	}
 }
 
 #endif   // sw_LRUCache_hpp

diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index f6f7d1f..ec6c43b 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp

@@ -26,6 +26,7 @@
 #include "System/Timer.hpp"
 #include "Vulkan/VkConfig.h"
 #include "Vulkan/VkDebug.hpp"
+#include "Vulkan/VkDevice.hpp"
 #include "Vulkan/VkFence.hpp"
 #include "Vulkan/VkImageView.hpp"
 #include "Vulkan/VkQueryPool.hpp"
@@ -162,7 +163,7 @@
 		deallocate(data);
 	}
 
-	Renderer::Renderer()
+	Renderer::Renderer(vk::Device* device) : device(device)
 	{
 		for(int i = 0; i < 16; i++)
 		{
@@ -733,6 +734,7 @@
 	void Renderer::synchronize()
 	{
 		sync.wait();
+		device->updateSamplingRoutineConstCache();
 	}
 
 	void Renderer::finishRendering(Task &pixelTask)

diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index d666339..c19010b 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp

@@ -32,6 +32,7 @@
 namespace vk
 {
 	class DescriptorSet;
+	class Device;
 	class Query;
 }
 
@@ -156,7 +157,7 @@
 		};
 
 	public:
-		Renderer();
+		Renderer(vk::Device* device);
 
 		virtual ~Renderer();
 
@@ -254,6 +255,8 @@
 		Routine *vertexRoutine;
 		Routine *setupRoutine;
 		Routine *pixelRoutine;
+
+		vk::Device* device;
 	};
 
 	struct DrawCall

diff --git a/src/Pipeline/SpirvShaderSampling.cpp b/src/Pipeline/SpirvShaderSampling.cpp
index ba6f2ac..400bb55 100644
--- a/src/Pipeline/SpirvShaderSampling.cpp
+++ b/src/Pipeline/SpirvShaderSampling.cpp

@@ -40,10 +40,16 @@
 
 	ASSERT(imageDescriptor->device);
 
+	rr::Routine* routine = imageDescriptor->device->findInConstCache(key);
+	if(routine)
+	{
+		return (ImageSampler*)(routine->getEntry());
+	}
+
 	std::unique_lock<std::mutex> lock(imageDescriptor->device->getSamplingRoutineCacheMutex());
 	vk::Device::SamplingRoutineCache* cache = imageDescriptor->device->getSamplingRoutineCache();
 
-	rr::Routine* routine = cache->query(key);
+	routine = cache->query(key);
 	if(routine)
 	{
 		return (ImageSampler*)(routine->getEntry());

diff --git a/src/Vulkan/VkDevice.cpp b/src/Vulkan/VkDevice.cpp
index 435985c..a260882 100644
--- a/src/Vulkan/VkDevice.cpp
+++ b/src/Vulkan/VkDevice.cpp

@@ -47,7 +47,17 @@
 	cache.add(hash(key), routine);
 }
 
-std::size_t Device::SamplingRoutineCache::hash(const vk::Device::SamplingRoutineCache::Key &key) const
+rr::Routine* Device::SamplingRoutineCache::queryConst(const vk::Device::SamplingRoutineCache::Key& key) const
+{
+	return cache.queryConstCache(hash(key));
+}
+
+void Device::SamplingRoutineCache::updateConstCache()
+{
+	cache.updateConstCache();
+}
+
+std::size_t Device::SamplingRoutineCache::hash(const vk::Device::SamplingRoutineCache::Key &key)
 {
 	return (key.instruction << 16) ^ (key.sampler << 8) ^ key.imageView;
 }
@@ -71,7 +81,7 @@
 
 		for(uint32_t j = 0; j < queueCreateInfo.queueCount; j++, queueID++)
 		{
-			new (&queues[queueID]) Queue();
+			new (&queues[queueID]) Queue(this);
 		}
 	}
 
@@ -89,6 +99,7 @@
 
 	// FIXME (b/119409619): use an allocator here so we can control all memory allocations
 	blitter.reset(new sw::Blitter());
+	samplingRoutineCache.reset(new SamplingRoutineCache());
 }
 
 void Device::destroy(const VkAllocationCallbacks* pAllocator)
@@ -235,15 +246,22 @@
 	}
 }
 
-Device::SamplingRoutineCache* Device::getSamplingRoutineCache()
+Device::SamplingRoutineCache* Device::getSamplingRoutineCache() const
 {
-	if(!samplingRoutineCache.get())
-	{
-		samplingRoutineCache.reset(new SamplingRoutineCache());
-	}
 	return samplingRoutineCache.get();
 }
 
+rr::Routine* Device::findInConstCache(const SamplingRoutineCache::Key& key) const
+{
+	return samplingRoutineCache->queryConst(key);
+}
+
+void Device::updateSamplingRoutineConstCache()
+{
+	std::unique_lock<std::mutex> lock(samplingRoutineCacheMutex);
+	samplingRoutineCache->updateConstCache();
+}
+
 std::mutex& Device::getSamplingRoutineCacheMutex()
 {
 	return samplingRoutineCacheMutex;

diff --git a/src/Vulkan/VkDevice.hpp b/src/Vulkan/VkDevice.hpp
index 52212f3..9bba54e 100644
--- a/src/Vulkan/VkDevice.hpp
+++ b/src/Vulkan/VkDevice.hpp

@@ -24,7 +24,6 @@
 namespace sw
 {
 	class Blitter;
-	class SamplingRoutineCache;
 }
 
 namespace vk
@@ -71,13 +70,19 @@
 		rr::Routine* query(const Key& key) const;
 		void add(const Key& key, rr::Routine* routine);
 
+		rr::Routine* queryConst(const Key& key) const;
+		void updateConstCache();
+
+		static std::size_t hash(const Key &key);
+
 	private:
-		std::size_t hash(const Key &key) const;
-		sw::LRUCache<std::size_t, rr::Routine> cache;
+		sw::LRUConstCache<std::size_t, rr::Routine> cache;
 	};
 
-	SamplingRoutineCache* getSamplingRoutineCache();
+	SamplingRoutineCache* getSamplingRoutineCache() const;
 	std::mutex& getSamplingRoutineCacheMutex();
+	rr::Routine* findInConstCache(const SamplingRoutineCache::Key& key) const;
+	void updateSamplingRoutineConstCache();
 
 private:
 	PhysicalDevice *const physicalDevice = nullptr;

diff --git a/src/Vulkan/VkQueue.cpp b/src/Vulkan/VkQueue.cpp
index 4c03198..3aee60a 100644
--- a/src/Vulkan/VkQueue.cpp
+++ b/src/Vulkan/VkQueue.cpp

@@ -74,7 +74,7 @@
 namespace vk
 {
 
-Queue::Queue() : renderer()
+Queue::Queue(Device* device) : renderer(device)
 {
 	queueThread = std::thread(TaskLoop, this);
 }

diff --git a/src/Vulkan/VkQueue.hpp b/src/Vulkan/VkQueue.hpp
index cfa462b..2926e10 100644
--- a/src/Vulkan/VkQueue.hpp
+++ b/src/Vulkan/VkQueue.hpp

@@ -31,6 +31,7 @@
 namespace vk
 {
 
+class Device;
 class Fence;
 
 class Queue
@@ -38,7 +39,7 @@
 	VK_LOADER_DATA loaderData = { ICD_LOADER_MAGIC };
 
 public:
-	Queue();
+	Queue(Device* device);
 	~Queue();
 
 	operator VkQueue()
commit	3575550a711be52a32201e3ddcf605ac61aa56f0	[log] [tgz]
author	Alexis Hetu <sugoi@google.com>	Mon Jul 22 13:51:49 2019 -0400
committer	Alexis Hétu <sugoi@google.com>	Fri Jul 26 14:30:24 2019 +0000
tree	83baf1c0de5ff3a2b89dd1a8cb3e2a789fdf195b
parent	ec3039253ab805b040ac3a5a42eeec7e84d0377b [diff]