Use rep stos x86 assembly for fast clears.
Change-Id: I20b385c316f24b703da1d3071e393b4cde450173
Reviewed-on: https://swiftshader-review.googlesource.com/8811
Tested-by: Nicolas Capens <capn@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
diff --git a/src/Common/Memory.cpp b/src/Common/Memory.cpp
index 832382c..79b27ce 100644
--- a/src/Common/Memory.cpp
+++ b/src/Common/Memory.cpp
@@ -22,6 +22,7 @@
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
+ #include <intrin.h>
#else
#include <sys/mman.h>
#include <unistd.h>
@@ -34,6 +35,10 @@
#undef allocateZero
#undef deallocateZero
+// Define a single __x86__ macro when any of the GCC/Clang (__i386__, __x86_64__)
+// or MSVC (_M_IX86, _M_X64) x86 target macros is present, unless __x86__ is
+// already defined. Code in this file tests __x86__ to select x86-only paths.
+#if (defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined (_M_X64)) && !defined(__x86__)
+#define __x86__
+#endif
+
namespace sw
{
size_t memoryPageSize()
@@ -128,4 +133,32 @@
deallocate(memory);
}
+
+// Fills 'count' consecutive 16-bit elements starting at 'memory' with 'element'.
+// Uses the x86 'rep stosw' string store when building for x86 with MSVC or a
+// GCC-compatible compiler, and a plain loop on every other target.
+//
+// memory  - first element to overwrite.
+// element - value stored into each element.
+// count   - number of elements (not bytes) to store; 0 stores nothing.
+void clear(uint16_t *memory, uint16_t element, size_t count)
+{
+	#if defined(_MSC_VER) && defined(__x86__)
+		__stosw(memory, element, count);
+	#elif defined(__GNUC__) && defined(__x86__)
+		// 'rep stosw' advances the destination register and counts the count
+		// register down to zero, so both are read-write operands ("+D", "+c").
+		// Listing them as clobbers while also binding them as inputs is not
+		// permitted by GCC. The "memory" clobber and volatile qualifier tell the
+		// compiler that the statement writes memory and must not be removed or
+		// reordered around accesses to the cleared buffer.
+		__asm__ __volatile__("rep stosw" : "+D"(memory), "+c"(count) : "a"(element) : "memory");
+	#else
+		for(size_t i = 0; i < count; i++)
+		{
+			memory[i] = element;
+		}
+	#endif
+}
+
+// Fills 'count' consecutive 32-bit elements starting at 'memory' with 'element'.
+// Uses the x86 'rep stosl' string store when building for x86 with MSVC or a
+// GCC-compatible compiler, and a plain loop on every other target.
+//
+// memory  - first element to overwrite.
+// element - value stored into each element.
+// count   - number of elements (not bytes) to store; 0 stores nothing.
+void clear(uint32_t *memory, uint32_t element, size_t count)
+{
+	#if defined(_MSC_VER) && defined(__x86__)
+		// __stosd is declared in terms of unsigned long, hence the pointer cast.
+		__stosd((unsigned long*)memory, element, count);
+	#elif defined(__GNUC__) && defined(__x86__)
+		// 'rep stosl' advances the destination register and counts the count
+		// register down to zero, so both are read-write operands ("+D", "+c").
+		// Listing them as clobbers while also binding them as inputs is not
+		// permitted by GCC. The "memory" clobber and volatile qualifier tell the
+		// compiler that the statement writes memory and must not be removed or
+		// reordered around accesses to the cleared buffer.
+		__asm__ __volatile__("rep stosl" : "+D"(memory), "+c"(count) : "a"(element) : "memory");
+	#else
+		for(size_t i = 0; i < count; i++)
+		{
+			memory[i] = element;
+		}
+	#endif
+}
}
diff --git a/src/Common/Memory.hpp b/src/Common/Memory.hpp
index 3cf4be1..fc3fd55 100644
--- a/src/Common/Memory.hpp
+++ b/src/Common/Memory.hpp
@@ -16,6 +16,7 @@
#define Memory_hpp
#include <stddef.h>
+#include <stdint.h>
namespace sw
{
@@ -28,6 +29,9 @@
void *allocateExecutable(size_t bytes); // Allocates memory that can be made executable using markExecutable()
void markExecutable(void *memory, size_t bytes);
void deallocateExecutable(void *memory, size_t bytes);
+
+// Fill 'count' consecutive elements (not bytes) starting at 'memory' with
+// 'element'. Overloads are provided for 16-bit and 32-bit elements.
+void clear(uint16_t *memory, uint16_t element, size_t count);
+void clear(uint32_t *memory, uint32_t element, size_t count);
}
#endif // Memory_hpp
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index ced6432..4eea852 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -14,8 +14,9 @@
#include "Blitter.hpp"
-#include "Common/Debug.hpp"
#include "Reactor/Reactor.hpp"
+#include "Common/Memory.hpp"
+#include "Common/Debug.hpp"
namespace sw
{
@@ -33,6 +34,11 @@
void Blitter::clear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
{
+ if(fastClear(pixel, format, dest, dRect, rgbaMask))
+ {
+ return;
+ }
+
sw::Surface color(1, 1, 1, format, pixel, sw::Surface::bytes(format), sw::Surface::bytes(format));
Blitter::Options clearOptions = static_cast<sw::Blitter::Options>((rgbaMask & 0xF) | CLEAR_OPERATION);
SliceRect sRect(dRect);
@@ -40,6 +46,88 @@
blit(&color, sRect, dest, dRect, clearOptions);
}
+	// Fast path for clearing a rectangle of 'dest': packs the clear color into
+	// the destination's pixel layout once and stores it row by row with sw::clear.
+	//
+	// pixel    - clear color; only read when 'format' is FORMAT_A32B32G32R32F, in
+	//            which case it is read as four floats in r, g, b, a order.
+	// format   - format of the data 'pixel' points to.
+	// dest     - surface to clear.
+	// dRect    - rectangle (x0..x1, y0..y1) and slice of 'dest' to clear.
+	// rgbaMask - channel write mask; the low three bits must all be set for the
+	//            RGB-only destinations and the low four bits for the
+	//            destinations with alpha, otherwise the fast path is declined.
+	//
+	// Returns true when the clear was carried out here, and false when this
+	// combination is not handled and the caller must use the general path.
+	bool Blitter::fastClear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask)
+	{
+		if(format != FORMAT_A32B32G32R32F)
+		{
+			return false;
+		}
+
+		const float *color = (const float*)pixel;
+		const float r = color[0];
+		const float g = color[1];
+		const float b = color[2];
+		const float a = color[3];
+
+		// The packing below is only valid for components in [0, 1]: larger values
+		// would spill into the neighbouring channel's bits, and negative or NaN
+		// values make the float-to-unsigned conversion undefined. The comparisons
+		// are written so that NaN also fails them. Such colors are left to the
+		// general path.
+		if(!(r >= 0.0f && r <= 1.0f) ||
+		   !(g >= 0.0f && g <= 1.0f) ||
+		   !(b >= 0.0f && b <= 1.0f) ||
+		   !(a >= 0.0f && a <= 1.0f))
+		{
+			return false;
+		}
+
+		// A negative width would turn into an enormous unsigned store count.
+		const int width = dRect.x1 - dRect.x0;
+		if(width < 0)
+		{
+			return false;
+		}
+
+		uint32_t packed;
+
+		switch(dest->getFormat())
+		{
+		case FORMAT_R5G6B5:
+			if((rgbaMask & 0x7) != 0x7) return false;
+			packed = ((uint16_t)(31 * b + 0.5f) << 0) |
+			         ((uint16_t)(63 * g + 0.5f) << 5) |
+			         ((uint16_t)(31 * r + 0.5f) << 11);
+			break;
+		case FORMAT_X8B8G8R8:
+			if((rgbaMask & 0x7) != 0x7) return false;
+			packed = ((uint32_t)(255) << 24) |
+			         ((uint32_t)(255 * b + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * r + 0.5f) << 0);
+			break;
+		case FORMAT_A8B8G8R8:
+			if((rgbaMask & 0xF) != 0xF) return false;
+			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+			         ((uint32_t)(255 * b + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * r + 0.5f) << 0);
+			break;
+		case FORMAT_X8R8G8B8:
+			if((rgbaMask & 0x7) != 0x7) return false;
+			packed = ((uint32_t)(255) << 24) |
+			         ((uint32_t)(255 * r + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * b + 0.5f) << 0);
+			break;
+		case FORMAT_A8R8G8B8:
+			if((rgbaMask & 0xF) != 0xF) return false;
+			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
+			         ((uint32_t)(255 * r + 0.5f) << 16) |
+			         ((uint32_t)(255 * g + 0.5f) << 8) |
+			         ((uint32_t)(255 * b + 0.5f) << 0);
+			break;
+		default:
+			return false;
+		}
+
+		// 'd' points at the first pixel of the current row of the rectangle.
+		uint8_t *d = (uint8_t*)dest->lockInternal(dRect.x0, dRect.y0, dRect.slice, sw::LOCK_WRITEONLY, sw::PUBLIC);
+
+		switch(Surface::bytes(dest->getFormat()))
+		{
+		case 2:
+			for(int i = dRect.y0; i < dRect.y1; i++)
+			{
+				// Only the low 16 bits of 'packed' are populated for 2-byte formats.
+				sw::clear((uint16_t*)d, (uint16_t)packed, width);
+				d += dest->getInternalPitchB();
+			}
+			break;
+		case 4:
+			for(int i = dRect.y0; i < dRect.y1; i++)
+			{
+				sw::clear((uint32_t*)d, packed, width);
+				d += dest->getInternalPitchB();
+			}
+			break;
+		default:
+			// Every format accepted above is expected to be 2 or 4 bytes per pixel.
+			assert(false);
+		}
+
+		dest->unlockInternal();
+
+		return true;
+	}
+
void Blitter::blit(Surface *source, const SliceRect &sRect, Surface *dest, const SliceRect &dRect, bool filter, bool isStencil)
{
Blitter::Options options = WRITE_RGBA;
diff --git a/src/Renderer/Blitter.hpp b/src/Renderer/Blitter.hpp
index 02c0535..c72b21a 100644
--- a/src/Renderer/Blitter.hpp
+++ b/src/Renderer/Blitter.hpp
@@ -81,6 +81,8 @@
void blit3D(Surface *source, Surface *dest);
private:
+ // Tries to clear 'dRect' of 'dest' by storing a packed pixel value directly.
+ // Returns true if the clear was performed, false if the pixel format,
+ // destination format or mask is not handled and clear() must use blit().
+ bool fastClear(void* pixel, sw::Format format, Surface *dest, const SliceRect &dRect, unsigned int rgbaMask);
+
bool read(Float4 &color, Pointer<Byte> element, Format format);
bool write(Float4 &color, Pointer<Byte> element, Format format, const Blitter::Options& options);
bool read(Int4 &color, Pointer<Byte> element, Format format);