Optimize multisample resolve with SSE2 instructions

Benchmark results:

Run on (48 X 2594 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x24)
  L1 Instruction 32 KiB (x24)
  L2 Unified 256 KiB (x24)
  L3 Unified 30720 KiB (x2)
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
(LLVM, before)
Triangle/Hello            0.845 ms        0.439 ms         1673
Triangle/Multisample       6.95 ms        0.781 ms         1000

(LLVM, after)
Triangle/Hello            0.861 ms        0.450 ms         1493
Triangle/Multisample       4.03 ms        0.753 ms          747

(Subzero, before)
Triangle/Hello             1.19 ms        0.474 ms         1120
Triangle/Multisample       11.8 ms        0.920 ms          747

(Subzero, after)
Triangle/Hello            0.907 ms        0.486 ms         1673
Triangle/Multisample       4.62 ms        0.781 ms         1000

Bug: b/147802090
Change-Id: Iea8498f2b745c86cf578db5c0f7ef2329b73c736
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/47970
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>

commit: a2e6c1a149f4ee03f46e11afbb72cfa10e5526e3 [log] [tgz]
author: Nicolas Capens <capn@google.com> Wed Aug 26 15:44:50 2020 -0400
committer: Nicolas Capens <nicolascapens@google.com> Fri Aug 28 20:46:40 2020 +0000
tree: c7683f80b91df8450ee06da1fa17f37235147034
parent: 4487e589eb749c70ceffdfdbfd433cc1628a735b [diff]
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index fca6cbd..29767d1 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp

@@ -16,6 +16,7 @@
 
 #include "Pipeline/ShaderCore.hpp"
 #include "Reactor/Reactor.hpp"
+#include "System/CPUID.hpp"
 #include "System/Debug.hpp"
 #include "System/Half.hpp"
 #include "System/Memory.hpp"
@@ -24,6 +25,11 @@
 
 #include <utility>
 
+#if defined(__i386__) || defined(__x86_64__)
+#	include <xmmintrin.h>
+#	include <emmintrin.h>
+#endif
+
 namespace {
 rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)
 {
@@ -1971,13 +1977,36 @@
 	uint8_t *source2 = source1 + slice;
 	uint8_t *source3 = source2 + slice;
 
+	const bool SSE2 = CPUID::supportsSSE2();
+
 	if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
 	{
 		if(samples == 4)
 		{
 			for(int y = 0; y < height; y++)
 			{
-				for(int x = 0; x < width; x++)
+				int x = 0;
+
+#if defined(__i386__) || defined(__x86_64__)
+				if(SSE2)
+				{
+					for(; (x + 3) < width; x += 4)
+					{
+						__m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
+						__m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
+						__m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
+						__m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));
+
+						c0 = _mm_avg_epu8(c0, c1);
+						c2 = _mm_avg_epu8(c2, c3);
+						c0 = _mm_avg_epu8(c0, c2);
+
+						_mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
+					}
+				}
+#endif
+
+				for(; x < width; x++)
 				{
 					uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
 					uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
@@ -1996,6 +2025,10 @@
 				source2 += pitch;
 				source3 += pitch;
 				dest += pitch;
+
+				ASSERT(source0 < src->end());
+				ASSERT(source3 < src->end());
+				ASSERT(dest < dst->end());
 			}
 		}
 		else
commit	a2e6c1a149f4ee03f46e11afbb72cfa10e5526e3	[log] [tgz]
author	Nicolas Capens <capn@google.com>	Wed Aug 26 15:44:50 2020 -0400
committer	Nicolas Capens <nicolascapens@google.com>	Fri Aug 28 20:46:40 2020 +0000
tree	c7683f80b91df8450ee06da1fa17f37235147034
parent	4487e589eb749c70ceffdfdbfd433cc1628a735b [diff]