Add a fast multisample resolve code path

For whole-image 4x8-bit normalized format multisample resolves, use a
specialized code path instead of a generic blit routine.

Benchmark results:

Run on (48 X 2594 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x24)
  L1 Instruction 32 KiB (x24)
  L2 Unified 256 KiB (x24)
  L3 Unified 30720 KiB (x2)
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
(LLVM, before)
Triangle/Hello             1.02 ms        0.500 ms         1000
Triangle/Multisample       19.3 ms        0.984 ms         1000

(LLVM, after)
Triangle/Hello            0.845 ms        0.439 ms         1673
Triangle/Multisample       6.95 ms        0.781 ms         1000

(Subzero, before)
Triangle/Hello             1.15 ms        0.516 ms         1120
Triangle/Multisample       40.3 ms        0.469 ms          100

(Subzero, after)
Triangle/Hello             1.19 ms        0.474 ms         1120
Triangle/Multisample       11.8 ms        0.920 ms          747

Bug: b/147802090
Change-Id: I15729552f01a509a5cfce20cd7de06d0b764cf0a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/47969
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 6d4b1ee..fca6cbd 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -1876,6 +1876,12 @@
 
 void Blitter::resolve(const vk::Image *src, vk::Image *dst, VkImageResolve region)
 {
+	if(fastResolve(src, dst, region))
+	{
+		return;
+	}
+
+	// Fall back to a generic blit which performs the resolve.
 	VkImageBlit blitRegion;
 
 	blitRegion.srcOffsets[0] = blitRegion.srcOffsets[1] = region.srcOffset;
@@ -1894,6 +1900,117 @@
 	blit(src, dst, blitRegion, VK_FILTER_NEAREST);
 }
 
+static inline uint32_t averageByte4(uint32_t x, uint32_t y)  // Averages the four 8-bit lanes of x and y independently.
+{
+	return (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F) + ((x ^ y) & 0x01010101);  // Floor average per lane, plus one where the lane sum is odd (rounds up).
+}
+
+// Fast path for resolving a whole, single-layer 2D multisample image. Handles only
+// 4-sample images in the 4x8-bit UNORM formats tested below. Returns true if the
+// resolve was performed; returns false without writing anything otherwise, so that
+// the caller can fall back to the generic blit.
+bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve region)
+{
+	// "The aspectMask member of srcSubresource and dstSubresource must only contain VK_IMAGE_ASPECT_COLOR_BIT"
+	ASSERT(region.srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+	ASSERT(region.dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
+
+	// Only whole-image resolves of a single layer are handled here.
+	if(region.srcOffset != VkOffset3D{ 0, 0, 0 } ||
+	   region.dstOffset != VkOffset3D{ 0, 0, 0 } ||
+	   region.srcSubresource.layerCount != 1)
+	{
+		return false;
+	}
+
+	if(region.extent != src->getExtent() ||
+	   region.extent != dst->getExtent() ||
+	   region.extent.depth != 1)
+	{
+		return false;
+	}
+
+	auto format = src->getFormat();
+	auto samples = src->getSampleCountFlagBits();
+
+	bool supportedFormat = format == VK_FORMAT_R8G8B8A8_UNORM ||
+	                       format == VK_FORMAT_B8G8R8A8_UNORM ||
+	                       format == VK_FORMAT_A8B8G8R8_UNORM_PACK32;
+
+	// Decline unsupported cases before touching the destination. Reporting success
+	// for a sample count this path cannot resolve would leave the destination
+	// unwritten while marking its contents as changed.
+	if(!supportedFormat || samples != 4)
+	{
+		return false;
+	}
+
+	VkImageSubresource srcSubresource = {
+		region.srcSubresource.aspectMask,
+		region.srcSubresource.mipLevel,
+		region.srcSubresource.baseArrayLayer
+	};
+
+	VkImageSubresource dstSubresource = {
+		region.dstSubresource.aspectMask,
+		region.dstSubresource.mipLevel,
+		region.dstSubresource.baseArrayLayer
+	};
+
+	VkImageSubresourceRange dstSubresourceRange = {
+		region.dstSubresource.aspectMask,
+		region.dstSubresource.mipLevel,
+		1,  // levelCount
+		region.dstSubresource.baseArrayLayer,
+		region.dstSubresource.layerCount
+	};
+
+	int width = region.extent.width;
+	int height = region.extent.height;
+
+	// Rows are stepped with each image's own pitch; the destination is a different
+	// image and may not share the source's row pitch.
+	int srcPitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
+	int dstPitch = dst->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.dstSubresource.mipLevel);
+	int slice = src->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
+
+	// Each of the four samples is read from its own plane, one slice pitch apart.
+	const uint8_t *source0 = reinterpret_cast<const uint8_t *>(src->getTexelPointer({ 0, 0, 0 }, srcSubresource));
+	const uint8_t *source1 = source0 + slice;
+	const uint8_t *source2 = source1 + slice;
+	const uint8_t *source3 = source2 + slice;
+	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getTexelPointer({ 0, 0, 0 }, dstSubresource));
+
+	for(int y = 0; y < height; y++)
+	{
+		for(int x = 0; x < width; x++)
+		{
+			uint32_t c0 = *reinterpret_cast<const uint32_t *>(source0 + 4 * x);
+			uint32_t c1 = *reinterpret_cast<const uint32_t *>(source1 + 4 * x);
+			uint32_t c2 = *reinterpret_cast<const uint32_t *>(source2 + 4 * x);
+			uint32_t c3 = *reinterpret_cast<const uint32_t *>(source3 + 4 * x);
+
+			// Pairwise averaging of the four samples, one byte lane at a time.
+			uint32_t c01 = averageByte4(c0, c1);
+			uint32_t c23 = averageByte4(c2, c3);
+			uint32_t c03 = averageByte4(c01, c23);
+
+			*reinterpret_cast<uint32_t *>(dest + 4 * x) = c03;
+		}
+
+		source0 += srcPitch;
+		source1 += srcPitch;
+		source2 += srcPitch;
+		source3 += srcPitch;
+		dest += dstPitch;
+	}
+
+	dst->contentsChanged(dstSubresourceRange);
+
+	return true;
+}
+
 void Blitter::copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch)
 {
 	VkExtent3D extent = src->getExtent();
diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
index b114d73..51b280d 100644
--- a/src/Device/Blitter.hpp
+++ b/src/Device/Blitter.hpp
@@ -159,6 +159,7 @@
 	};
 
 	bool fastClear(void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea);
+	bool fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve region);
 
 	Float4 readFloat4(Pointer<Byte> element, const State &state);
 	void write(Float4 &color, Pointer<Byte> element, const State &state);
diff --git a/src/Vulkan/VkImage.hpp b/src/Vulkan/VkImage.hpp
index 5af8790..92979d7 100644
--- a/src/Vulkan/VkImage.hpp
+++ b/src/Vulkan/VkImage.hpp
@@ -194,4 +194,28 @@
 
 }  // namespace vk
 
+// Equality of two VkExtent3D values: true if width, height and depth all match.
+inline bool operator==(const VkExtent3D &lhs, const VkExtent3D &rhs)
+{
+	return lhs.width == rhs.width && lhs.height == rhs.height && lhs.depth == rhs.depth;
+}
+
+// Inequality of two VkExtent3D values: true if any of the three components differs.
+inline bool operator!=(const VkExtent3D &lhs, const VkExtent3D &rhs)
+{
+	return lhs.width != rhs.width || lhs.height != rhs.height || lhs.depth != rhs.depth;
+}
+
+// Equality of two VkOffset3D values: true if x, y and z all match.
+inline bool operator==(const VkOffset3D &lhs, const VkOffset3D &rhs)
+{
+	return lhs.x == rhs.x && lhs.y == rhs.y && lhs.z == rhs.z;
+}
+
+// Inequality of two VkOffset3D values: true if any of the three components differs.
+inline bool operator!=(const VkOffset3D &lhs, const VkOffset3D &rhs)
+{
+	return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z;
+}
+
 #endif  // VK_IMAGE_HPP_