Faster image sampler fetch from key

This CL trades a one-time conversion of the LRUCache into a
constant unordered map for faster lookups in the costly
per-pixel imageSampler fetch operation.

When the renderer is idle, the device copies the LRUCache
to an unordered map, which has faster fetch times. This
map stays constant throughout any rendering operation, so
it can be queried without a mutex. The copy happens only
if the LRUCache was modified since the last copy occurred.
So if all sampling variations happened on the first frame,
all subsequent frames can render much faster.

On macOS, the Glass demo goes from 2.6 FPS to 20 FPS.

Bug b/129523279 b/137649247

Change-Id: I195ca8b2ead59eb5cc9e75e8b0dc5119c794d717
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/34348
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
diff --git a/src/Device/LRUCache.hpp b/src/Device/LRUCache.hpp
index 180b5b9..1e3b2d9 100644
--- a/src/Device/LRUCache.hpp
+++ b/src/Device/LRUCache.hpp
@@ -19,6 +19,7 @@
 
 #include <cstring>
 #include <type_traits>
+#include <unordered_map>
 
 namespace sw
 {
@@ -28,15 +29,15 @@
 	public:
 		LRUCache(int n);
 
-		~LRUCache();
+		virtual ~LRUCache();
 
 		Data *query(const Key &key) const;
-		Data *add(const Key &key, Data *data);
+		virtual Data *add(const Key &key, Data *data);
 
 		int getSize() {return size;}
 		Key &getKey(int i) {return key[i];}
 
-	private:
+	protected:
 		int size;
 		int mask;
 		int top;
@@ -47,6 +48,29 @@
 		Data **data;
 	};
 
+	template<class Key, class Data>
+	class LRUConstCache : public LRUCache<Key, Data>  // LRUCache plus a read-only unordered_map snapshot of its entries
+	{
+		using LRUBase = LRUCache<Key, Data>;
+	public:
+		LRUConstCache(int n) : LRUBase(n) {}
+		~LRUConstCache() { clearConstCache(); }  // unbind() every snapshot entry on destruction
+
+		Data *add(const Key &key, Data *data) override  // forwards to LRUCache::add and marks the snapshot stale
+		{
+			constCacheNeedsUpdate = true;
+			return LRUBase::add(key, data);
+		}
+
+		void updateConstCache();  // rebuilds the snapshot from the LRU entries, only if add() ran since the last rebuild
+		Data *queryConstCache(const Key &key) const;  // snapshot lookup; returns nullptr when the key is absent
+
+	private:
+		void clearConstCache();  // unbind()s and removes every snapshot entry
+		bool constCacheNeedsUpdate = false;  // set by add(), cleared by updateConstCache()
+		std::unordered_map<Key, Data*> constCache;  // the snapshot; bind() is called on each Data stored here
+	};
+
 	// Helper class for clearing the memory of objects at construction.
 	// Useful as the first base class of cache keys which may contain padding bytes or bits otherwise left uninitialized.
 	template<class T>
@@ -183,6 +207,45 @@
 
 		return data;
 	}
+
+	template<class Key, class Data>
+	void LRUConstCache<Key, Data>::clearConstCache()  // empties the snapshot, releasing the references it holds
+	{
+		auto it = constCache.begin();
+		auto itEnd = constCache.end();
+		for(; it != itEnd; ++it)
+		{
+			it->second->unbind();  // pairs with the bind() made when updateConstCache() inserted this entry
+		}
+		constCache.clear();
+	}
+
+	template<class Key, class Data>
+	void LRUConstCache<Key, Data>::updateConstCache()  // rebuilds the unordered_map snapshot from the LRU arrays when stale
+	{
+		if(constCacheNeedsUpdate)  // nothing to do unless add() ran since the last rebuild
+		{
+			clearConstCache();  // drop the old snapshot (and its references) before refilling
+
+			for(int i = 0; i < LRUBase::size; i++)  // visit every slot of the base cache
+			{
+				if(LRUBase::data[i])  // skip slots that hold no data
+				{
+					LRUBase::data[i]->bind();  // the snapshot takes its own reference; released in clearConstCache()
+					constCache[*LRUBase::ref[i]] = LRUBase::data[i];  // NOTE(review): if two slots ever share a key, the earlier entry is overwritten and its bind() is never matched by unbind() - confirm keys are unique
+				}
+			}
+
+			constCacheNeedsUpdate = false;
+		}
+	}
+
+	template<class Key, class Data>
+	Data *LRUConstCache<Key, Data>::queryConstCache(const Key &key) const  // looks key up in the snapshot only; the LRU arrays are not consulted
+	{
+		auto it = constCache.find(key);
+		return (it != constCache.end()) ? it->second : nullptr;  // nullptr signals a miss
+	}
 }
 
 #endif   // sw_LRUCache_hpp
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index f6f7d1f..ec6c43b 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -26,6 +26,7 @@
 #include "System/Timer.hpp"
 #include "Vulkan/VkConfig.h"
 #include "Vulkan/VkDebug.hpp"
+#include "Vulkan/VkDevice.hpp"
 #include "Vulkan/VkFence.hpp"
 #include "Vulkan/VkImageView.hpp"
 #include "Vulkan/VkQueryPool.hpp"
@@ -162,7 +163,7 @@
 		deallocate(data);
 	}
 
-	Renderer::Renderer()
+	Renderer::Renderer(vk::Device* device) : device(device)
 	{
 		for(int i = 0; i < 16; i++)
 		{
@@ -733,6 +734,7 @@
 	void Renderer::synchronize()
 	{
 		sync.wait();
+		device->updateSamplingRoutineConstCache();  // refresh the device's sampling-routine snapshot once sync.wait() has returned
 	}
 
 	void Renderer::finishRendering(Task &pixelTask)
diff --git a/src/Device/Renderer.hpp b/src/Device/Renderer.hpp
index d666339..c19010b 100644
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -32,6 +32,7 @@
 namespace vk
 {
 	class DescriptorSet;
+	class Device;
 	class Query;
 }
 
@@ -156,7 +157,7 @@
 		};
 
 	public:
-		Renderer();
+		Renderer(vk::Device* device);
 
 		virtual ~Renderer();
 
@@ -254,6 +255,8 @@
 		Routine *vertexRoutine;
 		Routine *setupRoutine;
 		Routine *pixelRoutine;
+
+		vk::Device* device;
 	};
 
 	struct DrawCall
diff --git a/src/Pipeline/SpirvShaderSampling.cpp b/src/Pipeline/SpirvShaderSampling.cpp
index ba6f2ac..400bb55 100644
--- a/src/Pipeline/SpirvShaderSampling.cpp
+++ b/src/Pipeline/SpirvShaderSampling.cpp
@@ -40,10 +40,16 @@
 
 	ASSERT(imageDescriptor->device);
 
+	rr::Routine* routine = imageDescriptor->device->findInConstCache(key);
+	if(routine)
+	{
+		return (ImageSampler*)(routine->getEntry());
+	}
+
 	std::unique_lock<std::mutex> lock(imageDescriptor->device->getSamplingRoutineCacheMutex());
 	vk::Device::SamplingRoutineCache* cache = imageDescriptor->device->getSamplingRoutineCache();
 
-	rr::Routine* routine = cache->query(key);
+	routine = cache->query(key);
 	if(routine)
 	{
 		return (ImageSampler*)(routine->getEntry());
diff --git a/src/Vulkan/VkDevice.cpp b/src/Vulkan/VkDevice.cpp
index 435985c..a260882 100644
--- a/src/Vulkan/VkDevice.cpp
+++ b/src/Vulkan/VkDevice.cpp
@@ -47,7 +47,17 @@
 	cache.add(hash(key), routine);
 }
 
-std::size_t Device::SamplingRoutineCache::hash(const vk::Device::SamplingRoutineCache::Key &key) const
+rr::Routine* Device::SamplingRoutineCache::queryConst(const vk::Device::SamplingRoutineCache::Key& key) const
+{
+	return cache.queryConstCache(hash(key));  // lookup is by hash(key), not the full Key - NOTE(review): distinct keys with equal hashes would resolve to the same routine; confirm this cannot occur
+}
+
+void Device::SamplingRoutineCache::updateConstCache()  // takes no lock itself; Device::updateSamplingRoutineConstCache() holds samplingRoutineCacheMutex around this call
+{
+	cache.updateConstCache();  // rebuilds the snapshot only if routines were added since the last rebuild
+}
+
+std::size_t Device::SamplingRoutineCache::hash(const vk::Device::SamplingRoutineCache::Key &key)
 {
 	return (key.instruction << 16) ^ (key.sampler << 8) ^ key.imageView;
 }
@@ -71,7 +81,7 @@
 
 		for(uint32_t j = 0; j < queueCreateInfo.queueCount; j++, queueID++)
 		{
-			new (&queues[queueID]) Queue();
+			new (&queues[queueID]) Queue(this);
 		}
 	}
 
@@ -89,6 +99,7 @@
 
 	// FIXME (b/119409619): use an allocator here so we can control all memory allocations
 	blitter.reset(new sw::Blitter());
+	samplingRoutineCache.reset(new SamplingRoutineCache());
 }
 
 void Device::destroy(const VkAllocationCallbacks* pAllocator)
@@ -235,15 +246,22 @@
 	}
 }
 
-Device::SamplingRoutineCache* Device::getSamplingRoutineCache()
+Device::SamplingRoutineCache* Device::getSamplingRoutineCache() const
 {
-	if(!samplingRoutineCache.get())
-	{
-		samplingRoutineCache.reset(new SamplingRoutineCache());
-	}
 	return samplingRoutineCache.get();
 }
 
+rr::Routine* Device::findInConstCache(const SamplingRoutineCache::Key& key) const  // snapshot lookup that does not take samplingRoutineCacheMutex; returns nullptr on a miss
+{
+	return samplingRoutineCache->queryConst(key);  // NOTE(review): safe only while no thread is rebuilding the snapshot - confirm this holds when several queues share one device
+}
+
+void Device::updateSamplingRoutineConstCache()  // refreshes the lock-free snapshot of the sampling routine cache
+{
+	std::unique_lock<std::mutex> lock(samplingRoutineCacheMutex);  // same mutex callers obtain through getSamplingRoutineCacheMutex()
+	samplingRoutineCache->updateConstCache();
+}
+
 std::mutex& Device::getSamplingRoutineCacheMutex()
 {
 	return samplingRoutineCacheMutex;
diff --git a/src/Vulkan/VkDevice.hpp b/src/Vulkan/VkDevice.hpp
index 52212f3..9bba54e 100644
--- a/src/Vulkan/VkDevice.hpp
+++ b/src/Vulkan/VkDevice.hpp
@@ -24,7 +24,6 @@
 namespace sw
 {
 	class Blitter;
-	class SamplingRoutineCache;
 }
 
 namespace vk
@@ -71,13 +70,19 @@
 		rr::Routine* query(const Key& key) const;
 		void add(const Key& key, rr::Routine* routine);
 
+		rr::Routine* queryConst(const Key& key) const;
+		void updateConstCache();
+
+		static std::size_t hash(const Key &key);
+
 	private:
-		std::size_t hash(const Key &key) const;
-		sw::LRUCache<std::size_t, rr::Routine> cache;
+		sw::LRUConstCache<std::size_t, rr::Routine> cache;
 	};
 
-	SamplingRoutineCache* getSamplingRoutineCache();
+	SamplingRoutineCache* getSamplingRoutineCache() const;
 	std::mutex& getSamplingRoutineCacheMutex();
+	rr::Routine* findInConstCache(const SamplingRoutineCache::Key& key) const;
+	void updateSamplingRoutineConstCache();
 
 private:
 	PhysicalDevice *const physicalDevice = nullptr;
diff --git a/src/Vulkan/VkQueue.cpp b/src/Vulkan/VkQueue.cpp
index 4c03198..3aee60a 100644
--- a/src/Vulkan/VkQueue.cpp
+++ b/src/Vulkan/VkQueue.cpp
@@ -74,7 +74,7 @@
 namespace vk
 {
 
-Queue::Queue() : renderer()
+Queue::Queue(Device* device) : renderer(device)
 {
 	queueThread = std::thread(TaskLoop, this);
 }
diff --git a/src/Vulkan/VkQueue.hpp b/src/Vulkan/VkQueue.hpp
index cfa462b..2926e10 100644
--- a/src/Vulkan/VkQueue.hpp
+++ b/src/Vulkan/VkQueue.hpp
@@ -31,6 +31,7 @@
 namespace vk
 {
 
+class Device;
 class Fence;
 
 class Queue
@@ -38,7 +39,7 @@
 	VK_LOADER_DATA loaderData = { ICD_LOADER_MAGIC };
 
 public:
-	Queue();
+	Queue(Device* device);
 	~Queue();
 
 	operator VkQueue()