Optimize multisample resolve with SSE2 instructions

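For 4-sample resolves of R8G8B8A8_UNORM, B8G8R8A8_UNORM and
A8B8G8R8_UNORM_PACK32 images, process four pixels per iteration on x86
when SSE2 is reported as supported at run time, averaging the samples
with _mm_avg_epu8. The existing scalar loop handles the remaining pixels
of each row, and is used on its own when the SSE2 path is unavailable.

Benchmark results: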

Run on (48 X 2594 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x24)
  L1 Instruction 32 KiB (x24)
  L2 Unified 256 KiB (x24)
  L3 Unified 30720 KiB (x2)
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
(LLVM, before)
Triangle/Hello            0.845 ms        0.439 ms         1673
Triangle/Multisample       6.95 ms        0.781 ms         1000

(LLVM, after)
Triangle/Hello            0.861 ms        0.450 ms         1493
Triangle/Multisample       4.03 ms        0.753 ms          747

(Subzero, before)
Triangle/Hello             1.19 ms        0.474 ms         1120
Triangle/Multisample       11.8 ms        0.920 ms          747

(Subzero, after)
Triangle/Hello            0.907 ms        0.486 ms         1673
Triangle/Multisample       4.62 ms        0.781 ms         1000

Bug: b/147802090
Change-Id: Iea8498f2b745c86cf578db5c0f7ef2329b73c736
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/47970
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index fca6cbd..29767d1 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -16,6 +16,7 @@
 
 #include "Pipeline/ShaderCore.hpp"
 #include "Reactor/Reactor.hpp"
+#include "System/CPUID.hpp"
 #include "System/Debug.hpp"
 #include "System/Half.hpp"
 #include "System/Memory.hpp"
@@ -24,6 +25,11 @@
 
 #include <utility>
 
+#if defined(__i386__) || defined(__x86_64__)
+#	include <xmmintrin.h>
+#	include <emmintrin.h>
+#endif
+
 namespace {
 rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)
 {
@@ -1971,13 +1977,36 @@
 	uint8_t *source2 = source1 + slice;
 	uint8_t *source3 = source2 + slice;
 
+	const bool SSE2 = CPUID::supportsSSE2();
+
 	if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
 	{
 		if(samples == 4)
 		{
 			for(int y = 0; y < height; y++)
 			{
-				for(int x = 0; x < width; x++)
+				int x = 0;
+
+#if defined(__i386__) || defined(__x86_64__)
+				if(SSE2)
+				{
+					for(; (x + 3) < width; x += 4)
+					{
+						__m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
+						__m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
+						__m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
+						__m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));
+
+						c0 = _mm_avg_epu8(c0, c1);
+						c2 = _mm_avg_epu8(c2, c3);
+						c0 = _mm_avg_epu8(c0, c2);
+
+						_mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
+					}
+				}
+#endif
+
+				for(; x < width; x++)
 				{
 					uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
 					uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
@@ -1996,6 +2025,10 @@
 				source2 += pitch;
 				source3 += pitch;
 				dest += pitch;
+
+				ASSERT(source0 < src->end());
+				ASSERT(source3 < src->end());
+				ASSERT(dest < dst->end());
 			}
 		}
 		else