Add a fast multisample resolve code path
For whole-image 4x8-bit normalized format multisample resolves, use a
specialized code path instead of a generic blit routine.
Benchmark results:
Run on (48 X 2594 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x24)
L1 Instruction 32 KiB (x24)
L2 Unified 256 KiB (x24)
L3 Unified 30720 KiB (x2)
---------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------
(LLVM, before)
Triangle/Hello 1.02 ms 0.500 ms 1000
Triangle/Multisample 19.3 ms 0.984 ms 1000
(LLVM, after)
Triangle/Hello 0.845 ms 0.439 ms 1673
Triangle/Multisample 6.95 ms 0.781 ms 1000
(Subzero, before)
Triangle/Hello 1.15 ms 0.516 ms 1120
Triangle/Multisample 40.3 ms 0.469 ms 100
(Subzero, after)
Triangle/Hello 1.19 ms 0.474 ms 1120
Triangle/Multisample 11.8 ms 0.920 ms 747
Bug: b/147802090
Change-Id: I15729552f01a509a5cfce20cd7de06d0b764cf0a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/47969
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 6d4b1ee..fca6cbd 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -1876,6 +1876,12 @@
void Blitter::resolve(const vk::Image *src, vk::Image *dst, VkImageResolve region)
{
+ if(fastResolve(src, dst, region))
+ {
+ return;
+ }
+
+ // Fall back to a generic blit which performs the resolve.
VkImageBlit blitRegion;
blitRegion.srcOffsets[0] = blitRegion.srcOffsets[1] = region.srcOffset;
@@ -1894,6 +1900,117 @@
blit(src, dst, blitRegion, VK_FILTER_NEAREST);
}
+// Returns the per-byte average of two packed 4x8-bit values, rounded up,
+// without carries leaking between bytes. (x & y) keeps the bits common to
+// both inputs, ((x ^ y) >> 1) & 0x7F7F7F7F adds half of each differing bit
+// (the mask strips bits shifted in from the neighboring byte), and
+// (x ^ y) & 0x01010101 re-adds the dropped low bit so the result rounds
+// up instead of down.
+static inline uint32_t averageByte4(uint32_t x, uint32_t y)
+{
+ return (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F) + ((x ^ y) & 0x01010101);
+}
+
+// Specialized multisample resolve for whole-image, single-layer resolves of
+// 4x8-bit normalized color formats. Averages the samples directly with
+// integer arithmetic instead of going through the generic blit path.
+// Returns true if the resolve was performed, false if the caller must fall
+// back to the generic blit.
+bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve region)
+{
+ // "The aspectMask member of srcSubresource and dstSubresource must only contain VK_IMAGE_ASPECT_COLOR_BIT"
+ ASSERT(region.srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+ ASSERT(region.dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
+ ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
+
+ // The fast path only handles resolves covering the entire image: no
+ // offsets, a single layer, and an extent equal to both images' full
+ // extent with depth 1.
+ if(region.dstOffset != VkOffset3D{ 0, 0, 0 })
+ {
+ return false;
+ }
+
+ if(region.srcOffset != VkOffset3D{ 0, 0, 0 })
+ {
+ return false;
+ }
+
+ if(region.srcSubresource.layerCount != 1)
+ {
+ return false;
+ }
+
+ if(region.extent != src->getExtent() ||
+ region.extent != dst->getExtent() ||
+ region.extent.depth != 1)
+ {
+ return false;
+ }
+
+ VkImageSubresource srcSubresource = {
+ region.srcSubresource.aspectMask,
+ region.srcSubresource.mipLevel,
+ region.srcSubresource.baseArrayLayer
+ };
+
+ VkImageSubresource dstSubresource = {
+ region.dstSubresource.aspectMask,
+ region.dstSubresource.mipLevel,
+ region.dstSubresource.baseArrayLayer
+ };
+
+ // Range used to notify the destination image of modified contents below.
+ VkImageSubresourceRange dstSubresourceRange = {
+ region.dstSubresource.aspectMask,
+ region.dstSubresource.mipLevel,
+ 1, // levelCount
+ region.dstSubresource.baseArrayLayer,
+ region.dstSubresource.layerCount
+ };
+
+ void *source = src->getTexelPointer({ 0, 0, 0 }, srcSubresource);
+ uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getTexelPointer({ 0, 0, 0 }, dstSubresource));
+
+ auto format = src->getFormat();
+ auto samples = src->getSampleCountFlagBits();
+ auto extent = src->getExtent();
+
+ int width = extent.width;
+ int height = extent.height;
+ int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
+ int slice = src->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
+
+ // Multisampled images store each sample as a separate slice, so the four
+ // sample planes are found at successive slice-pitch offsets.
+ uint8_t *source0 = (uint8_t *)source;
+ uint8_t *source1 = source0 + slice;
+ uint8_t *source2 = source1 + slice;
+ uint8_t *source3 = source2 + slice;
+
+ // Only 4x8-bit normalized color formats are handled; channel order is
+ // irrelevant since the averaging operates on all four bytes uniformly.
+ if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
+ {
+ if(samples == 4)
+ {
+ for(int y = 0; y < height; y++)
+ {
+ for(int x = 0; x < width; x++)
+ {
+ uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
+ uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
+ uint32_t c2 = *(uint32_t *)(source2 + 4 * x);
+ uint32_t c3 = *(uint32_t *)(source3 + 4 * x);
+
+ // Pairwise rounded-up byte averages of the four samples.
+ uint32_t c01 = averageByte4(c0, c1);
+ uint32_t c23 = averageByte4(c2, c3);
+ uint32_t c03 = averageByte4(c01, c23);
+
+ *(uint32_t *)(dest + 4 * x) = c03;
+ }
+
+ source0 += pitch;
+ source1 += pitch;
+ source2 += pitch;
+ source3 += pitch;
+ // NOTE(review): the destination is advanced by the *source* row
+ // pitch — presumably valid because both images share the same
+ // format and extent, but confirm dst->rowPitchBytes() matches.
+ dest += pitch;
+ }
+ }
+ else
+ UNSUPPORTED("Samples: %d", samples);
+ }
+ else
+ {
+ return false;
+ }
+
+ dst->contentsChanged(dstSubresourceRange);
+
+ return true;
+}
+
void Blitter::copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch)
{
VkExtent3D extent = src->getExtent();
diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
index b114d73..51b280d 100644
--- a/src/Device/Blitter.hpp
+++ b/src/Device/Blitter.hpp
@@ -159,6 +159,7 @@
};
bool fastClear(void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea);
+ bool fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve region);
Float4 readFloat4(Pointer<Byte> element, const State &state);
void write(Float4 &color, Pointer<Byte> element, const State &state);
diff --git a/src/Vulkan/VkImage.hpp b/src/Vulkan/VkImage.hpp
index 5af8790..92979d7 100644
--- a/src/Vulkan/VkImage.hpp
+++ b/src/Vulkan/VkImage.hpp
@@ -194,4 +194,28 @@
} // namespace vk
+// Member-wise comparison operators for the Vulkan C structs used above.
+// Deliberately declared in the global namespace (after the vk namespace is
+// closed) because VkExtent3D and VkOffset3D are global C API types.
+inline bool operator==(const VkExtent3D &lhs, const VkExtent3D &rhs)
+{
+ return lhs.width == rhs.width &&
+ lhs.height == rhs.height &&
+ lhs.depth == rhs.depth;
+}
+
+inline bool operator!=(const VkExtent3D &lhs, const VkExtent3D &rhs)
+{
+ return !(lhs == rhs);
+}
+
+inline bool operator==(const VkOffset3D &lhs, const VkOffset3D &rhs)
+{
+ return lhs.x == rhs.x &&
+ lhs.y == rhs.y &&
+ lhs.z == rhs.z;
+}
+
+inline bool operator!=(const VkOffset3D &lhs, const VkOffset3D &rhs)
+{
+ return !(lhs == rhs);
+}
+
#endif // VK_IMAGE_HPP_