Use rep stos x86 assembly for fast clears.

Change-Id: I20b385c316f24b703da1d3071e393b4cde450173
Reviewed-on: https://swiftshader-review.googlesource.com/8811
Tested-by: Nicolas Capens <capn@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Common/Memory.cpp b/src/Common/Memory.cpp
index 832382c..79b27ce 100644
--- a/src/Common/Memory.cpp
+++ b/src/Common/Memory.cpp
@@ -22,6 +22,7 @@
 		#define WIN32_LEAN_AND_MEAN
 	#endif
 	#include <windows.h>
+	#include <intrin.h>
 #else
 	#include <sys/mman.h>
 	#include <unistd.h>
@@ -34,6 +35,10 @@
 #undef allocateZero
 #undef deallocateZero
 
+#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined (_M_X64)) && !defined(__x86__)
+#define __x86__   // Single marker for 32-bit and 64-bit x86 targets, whichever of the macros above the compiler predefines
+#endif
+
 namespace sw
 {
 size_t memoryPageSize()
@@ -128,4 +133,42 @@
 
 	deallocate(memory);
 }
+
+// Stores 'element' into each of the 'count' 16-bit elements starting at 'memory'.
+// Uses the x86 'rep stosw' string instruction where available, and a plain loop elsewhere.
+void clear(uint16_t *memory, uint16_t element, size_t count)
+{
+	#if defined(_MSC_VER) && defined(__x86__)
+		__stosw(memory, element, count);
+	#elif defined(__GNUC__) && defined(__x86__)
+		// 'rep stos' advances the destination register and counts the count register down, so both
+		// are read-write operands. Listing them as inputs and as clobbers is not allowed by GCC.
+		// volatile and the "memory" clobber tell the compiler that the buffer gets written.
+		__asm__ __volatile__("rep stosw" : "+D"(memory), "+c"(count) : "a"(element) : "memory");
+	#else
+		for(size_t i = 0; i < count; i++)
+		{
+			memory[i] = element;
+		}
+	#endif
+}
+
+// Stores 'element' into each of the 'count' 32-bit elements starting at 'memory'.
+// Uses the x86 'rep stosl' string instruction where available, and a plain loop elsewhere.
+void clear(uint32_t *memory, uint32_t element, size_t count)
+{
+	#if defined(_MSC_VER) && defined(__x86__)
+		__stosd((unsigned long*)memory, element, count);
+	#elif defined(__GNUC__) && defined(__x86__)
+		// 'rep stos' advances the destination register and counts the count register down, so both
+		// are read-write operands. Listing them as inputs and as clobbers is not allowed by GCC.
+		// volatile and the "memory" clobber tell the compiler that the buffer gets written.
+		__asm__ __volatile__("rep stosl" : "+D"(memory), "+c"(count) : "a"(element) : "memory");
+	#else
+		for(size_t i = 0; i < count; i++)
+		{
+			memory[i] = element;
+		}
+	#endif
+}
 }
diff --git a/src/Common/Memory.hpp b/src/Common/Memory.hpp
index 3cf4be1..fc3fd55 100644
--- a/src/Common/Memory.hpp
+++ b/src/Common/Memory.hpp
@@ -16,6 +16,7 @@
 #define Memory_hpp
 
 #include <stddef.h>
+#include <stdint.h>
 
 namespace sw
 {
@@ -28,6 +29,9 @@
 void *allocateExecutable(size_t bytes);   // Allocates memory that can be made executable using markExecutable()
 void markExecutable(void *memory, size_t bytes);
 void deallocateExecutable(void *memory, size_t bytes);
+
+void clear(uint16_t *memory, uint16_t element, size_t count);   // Stores element into count consecutive 16-bit elements starting at memory
+void clear(uint32_t *memory, uint32_t element, size_t count);   // Stores element into count consecutive 32-bit elements starting at memory
 }
 
 #endif   // Memory_hpp
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index ced6432..4eea852 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -14,8 +14,9 @@
 
 #include "Blitter.hpp"
 
-#include "Common/Debug.hpp"
 #include "Reactor/Reactor.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Debug.hpp"
 
 namespace sw
 {
@@ -33,6 +34,11 @@
 
 	void Blitter::clear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
 	{
+		if(fastClear(pixel, format, dest, dRect, rgbaMask))
+		{
+			return;
+		}
+
 		sw::Surface color(1, 1, 1, format, pixel, sw::Surface::bytes(format), sw::Surface::bytes(format));
 		Blitter::Options clearOptions = static_cast<sw::Blitter::Options>((rgbaMask & 0xF) | CLEAR_OPERATION);
 		SliceRect sRect(dRect);
@@ -40,6 +46,107 @@
 		blit(&color, sRect, dest, dRect, clearOptions);
 	}
 
+	// Attempts to clear dRect of dest by storing one pre-packed pixel value across each row instead
+	// of going through the generic blit routine. pixel must point to four floats in R, G, B, A order
+	// (FORMAT_A32B32G32R32F). Returns true if the clear was performed. Returns false, leaving dest
+	// untouched, if the color format, destination format, write mask or color value is unsupported,
+	// in which case the caller falls back to the generic path.
+	bool Blitter::fastClear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
+	{
+		if(format != FORMAT_A32B32G32R32F)
+		{
+			return false;
+		}
+
+		float *color = (float*)pixel;
+		float r = color[0];
+		float g = color[1];
+		float b = color[2];
+		float a = color[3];
+
+		// The packing below assumes normalized components. Anything above 1 would spill into the
+		// neighboring bit fields, and converting a negative or NaN float to an unsigned integer is
+		// undefined, so such colors are left to the generic path. Written so that NaN fails the test.
+		if(!(r >= 0.0f && r <= 1.0f) ||
+		   !(g >= 0.0f && g <= 1.0f) ||
+		   !(b >= 0.0f && b <= 1.0f) ||
+		   !(a >= 0.0f && a <= 1.0f))
+		{
+			return false;
+		}
+
+		uint32_t packed;
+
+		// A row fill overwrites whole pixels, so each case requires all the mask bits it checks
+		// (0x7 for the formats without alpha, 0xF for the formats with alpha) to be set.
+		switch(dest->getFormat())
+		{
+		case FORMAT_R5G6B5:
+			if((rgbaMask & 0x7) != 0x7) return false;
+			packed = ((uint16_t)(31 * b + 0.5f) << 0) |
+			         ((uint16_t)(63 * g + 0.5f) << 5) |
+			         ((uint16_t)(31 * r + 0.5f) << 11);
+			break;
+		case FORMAT_X8B8G8R8:
+			if((rgbaMask & 0x7) != 0x7) return false;
+			packed = ((uint32_t)(255) << 24) |
+			         ((uint32_t)(255 * b + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * r + 0.5f) << 0);
+			break;
+		case FORMAT_A8B8G8R8:
+			if((rgbaMask & 0xF) != 0xF) return false;
+			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+			         ((uint32_t)(255 * b + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * r + 0.5f) << 0);
+			break;
+		case FORMAT_X8R8G8B8:
+			if((rgbaMask & 0x7) != 0x7) return false;
+			packed = ((uint32_t)(255) << 24) |
+			         ((uint32_t)(255 * r + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * b + 0.5f) << 0);
+			break;
+		case FORMAT_A8R8G8B8:
+			if((rgbaMask & 0xF) != 0xF) return false;
+			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+			         ((uint32_t)(255 * r + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * b + 0.5f) << 0);
+			break;
+		default:
+			return false;
+		}
+
+		// d walks down the rectangle one row at a time; consecutive rows are getInternalPitchB() bytes apart.
+		uint8_t *d = (uint8_t*)dest->lockInternal(dRect.x0, dRect.y0, dRect.slice, sw::LOCK_WRITEONLY, sw::PUBLIC);
+
+		switch(Surface::bytes(dest->getFormat()))
+		{
+		case 2:
+			for(int i = dRect.y0; i < dRect.y1; i++)
+			{
+				sw::clear((uint16_t*)d, packed, dRect.x1 - dRect.x0);
+				d += dest->getInternalPitchB();
+			}
+			break;
+		case 4:
+			for(int i = dRect.y0; i < dRect.y1; i++)
+			{
+				sw::clear((uint32_t*)d, packed, dRect.x1 - dRect.x0);
+				d += dest->getInternalPitchB();
+			}
+			break;
+		default:
+			assert(false);
+		}
+
+		dest->unlockInternal();
+
+		return true;
+	}
+
 	void Blitter::blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil)
 	{
 		Blitter::Options options = WRITE_RGBA;
diff --git a/src/Renderer/Blitter.hpp b/src/Renderer/Blitter.hpp
index 02c0535..c72b21a 100644
--- a/src/Renderer/Blitter.hpp
+++ b/src/Renderer/Blitter.hpp
@@ -81,6 +81,8 @@
 		void blit3D(Surface *source, Surface *dest);
 
 	private:
+		bool fastClear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);   // Returns false when no fast path applies and the regular clear must be used
+
 		bool read(Float4 &color, Pointer<Byte> element, Format format);
 		bool write(Float4 &color, Pointer<Byte> element, Format format, const Blitter::Options& options);
 		bool read(Int4 &color, Pointer<Byte> element, Format format);