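Optimize multisample resolve with SSE2 instructions

When resolving 4x multisampled R8G8B8A8_UNORM, B8G8R8A8_UNORM, or
A8B8G8R8_UNORM_PACK32 images on x86 CPUs that report SSE2 support, average
four pixels per iteration with _mm_avg_epu8. Pixels left over at the end of
a row (width not a multiple of four) still go through the scalar loop.

Benchmark results: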
Run on (48 X 2594 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x24)
L1 Instruction 32 KiB (x24)
L2 Unified 256 KiB (x24)
L3 Unified 30720 KiB (x2)
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
(LLVM, before)
Triangle/Hello            0.845 ms        0.439 ms         1673
Triangle/Multisample       6.95 ms        0.781 ms         1000
(LLVM, after)
Triangle/Hello            0.861 ms        0.450 ms         1493
Triangle/Multisample       4.03 ms        0.753 ms          747
(Subzero, before)
Triangle/Hello             1.19 ms        0.474 ms         1120
Triangle/Multisample       11.8 ms        0.920 ms          747
(Subzero, after)
Triangle/Hello            0.907 ms        0.486 ms         1673
Triangle/Multisample       4.62 ms        0.781 ms         1000
Bug: b/147802090
Change-Id: Iea8498f2b745c86cf578db5c0f7ef2329b73c736
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/47970
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index fca6cbd..29767d1 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -16,6 +16,7 @@
#include "Pipeline/ShaderCore.hpp"
#include "Reactor/Reactor.hpp"
+#include "System/CPUID.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "System/Memory.hpp"
@@ -24,6 +25,11 @@
#include <utility>
+#if defined(__i386__) || defined(__x86_64__)
+# include <xmmintrin.h>
+# include <emmintrin.h>
+#endif
+
namespace {
rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)
{
@@ -1971,13 +1977,36 @@
uint8_t *source2 = source1 + slice;
uint8_t *source3 = source2 + slice;
+ const bool SSE2 = CPUID::supportsSSE2();
+
if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
{
if(samples == 4)
{
for(int y = 0; y < height; y++)
{
- for(int x = 0; x < width; x++)
+ int x = 0;
+
+#if defined(__i386__) || defined(__x86_64__)
+ if(SSE2)
+ {
+ for(; (x + 3) < width; x += 4)
+ {
+ __m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
+ __m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
+ __m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
+ __m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));
+
+ c0 = _mm_avg_epu8(c0, c1);
+ c2 = _mm_avg_epu8(c2, c3);
+ c0 = _mm_avg_epu8(c0, c2);
+
+ _mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
+ }
+ }
+#endif
+
+ for(; x < width; x++)
{
uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
@@ -1996,6 +2025,10 @@
source2 += pitch;
source3 += pitch;
dest += pitch;
+
+ ASSERT(source0 < src->end());
+ ASSERT(source3 < src->end());
+ ASSERT(dest < dst->end());
}
}
else