Optimize clearing of depth and stencil images

Extend Blitter::fastClear(), which is based on memset() instead of using
Reactor routines, to also handle D32_SFLOAT and S8_UINT formats.

Benchmark results:

Run on (48 X 2594 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x24)
  L1 Instruction 32 KiB (x24)
  L2 Unified 256 KiB (x24)
  L3 Unified 30720 KiB (x2)
--------------------------------------------------------------------------
Benchmark                                Time             CPU   Iterations
--------------------------------------------------------------------------
(LLVM, before)
ClearImage/VK_FORMAT_D32_SFLOAT       3.74 ms        0.016 ms         1000

(LLVM, after)
ClearImage/VK_FORMAT_D32_SFLOAT       1.08 ms        0.044 ms        10000

(Subzero, before)
ClearImage/VK_FORMAT_D32_SFLOAT       4.51 ms        0.063 ms         1000

(Subzero, after)
ClearImage/VK_FORMAT_D32_SFLOAT      0.963 ms        0.040 ms         7467

This change re-implements https://swiftshader-review.googlesource.com/c/SwiftShader/+/45888
which read 'clearValue' out of bounds: it accessed color[1] even when the
clear value holds only a single depth or stencil value.

Bug: b/159455503
Bug: chromium:1097740
Change-Id: Id3e74b4fa28ee0422540a8480814f8c9988f402a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/45949
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index a860aae..7ad251a 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -154,53 +154,77 @@
 	}
 }
 
-bool Blitter::fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
+bool Blitter::fastClear(void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
 {
-	if(format != VK_FORMAT_R32G32B32A32_SFLOAT)
+	if(clearFormat != VK_FORMAT_R32G32B32A32_SFLOAT &&
+	   clearFormat != VK_FORMAT_D32_SFLOAT &&
+	   clearFormat != VK_FORMAT_S8_UINT)
 	{
 		return false;
 	}
 
-	float *color = (float *)pixel;
-	float r = color[0];
-	float g = color[1];
-	float b = color[2];
-	float a = color[3];
+	union ClearValue
+	{
+		struct
+		{
+			float r;
+			float g;
+			float b;
+			float a;
+		};
 
-	uint32_t packed;
+		float rgb[3];
+
+		float d;
+		uint32_t d_as_u32;
+
+		uint32_t s;
+	};
+
+	ClearValue &c = *reinterpret_cast<ClearValue *>(clearValue);
+
+	uint32_t packed = 0;
 
 	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
 	switch(viewFormat)
 	{
 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
-			packed = ((uint16_t)(31 * b + 0.5f) << 0) |
-			         ((uint16_t)(63 * g + 0.5f) << 5) |
-			         ((uint16_t)(31 * r + 0.5f) << 11);
+			packed = ((uint16_t)(31 * c.b + 0.5f) << 0) |
+			         ((uint16_t)(63 * c.g + 0.5f) << 5) |
+			         ((uint16_t)(31 * c.r + 0.5f) << 11);
 			break;
 		case VK_FORMAT_B5G6R5_UNORM_PACK16:
-			packed = ((uint16_t)(31 * r + 0.5f) << 0) |
-			         ((uint16_t)(63 * g + 0.5f) << 5) |
-			         ((uint16_t)(31 * b + 0.5f) << 11);
+			packed = ((uint16_t)(31 * c.r + 0.5f) << 0) |
+			         ((uint16_t)(63 * c.g + 0.5f) << 5) |
+			         ((uint16_t)(31 * c.b + 0.5f) << 11);
 			break;
 		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
 		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
 		case VK_FORMAT_R8G8B8A8_UNORM:
-			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
-			         ((uint32_t)(255 * b + 0.5f) << 16) |
-			         ((uint32_t)(255 * g + 0.5f) << 8) |
-			         ((uint32_t)(255 * r + 0.5f) << 0);
+			packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
+			         ((uint32_t)(255 * c.b + 0.5f) << 16) |
+			         ((uint32_t)(255 * c.g + 0.5f) << 8) |
+			         ((uint32_t)(255 * c.r + 0.5f) << 0);
 			break;
 		case VK_FORMAT_B8G8R8A8_UNORM:
-			packed = ((uint32_t)(255 * a + 0.5f) << 24) |
-			         ((uint32_t)(255 * r + 0.5f) << 16) |
-			         ((uint32_t)(255 * g + 0.5f) << 8) |
-			         ((uint32_t)(255 * b + 0.5f) << 0);
+			packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
+			         ((uint32_t)(255 * c.r + 0.5f) << 16) |
+			         ((uint32_t)(255 * c.g + 0.5f) << 8) |
+			         ((uint32_t)(255 * c.b + 0.5f) << 0);
 			break;
 		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
-			packed = R11G11B10F(color);
+			packed = R11G11B10F(c.rgb);
 			break;
 		case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
-			packed = RGB9E5(color);
+			packed = RGB9E5(c.rgb);
+			break;
+		case VK_FORMAT_D32_SFLOAT:
+			ASSERT(clearFormat == VK_FORMAT_D32_SFLOAT);
+			packed = c.d_as_u32;  // float reinterpreted as uint32
+			break;
+		case VK_FORMAT_S8_UINT:
+			ASSERT(clearFormat == VK_FORMAT_S8_UINT);
+			packed = static_cast<uint8_t>(c.s);
 			break;
 		default:
 			return false;
@@ -249,6 +273,14 @@
 
 					switch(viewFormat.bytes())
 					{
+						case 4:
+							for(uint32_t i = 0; i < area.extent.height; i++)
+							{
+								ASSERT(d < dest->end());
+								sw::clear((uint32_t *)d, packed, area.extent.width);
+								d += rowPitchBytes;
+							}
+							break;
 						case 2:
 							for(uint32_t i = 0; i < area.extent.height; i++)
 							{
@@ -257,11 +289,11 @@
 								d += rowPitchBytes;
 							}
 							break;
-						case 4:
+						case 1:
 							for(uint32_t i = 0; i < area.extent.height; i++)
 							{
 								ASSERT(d < dest->end());
-								sw::clear((uint32_t *)d, packed, area.extent.width);
+								memset(d, packed, area.extent.width);
 								d += rowPitchBytes;
 							}
 							break;
diff --git a/src/Device/Blitter.hpp b/src/Device/Blitter.hpp
index 2205c2b..d9cb93a 100644
--- a/src/Device/Blitter.hpp
+++ b/src/Device/Blitter.hpp
@@ -141,7 +141,7 @@
 	Blitter();
 	virtual ~Blitter();
 
-	void clear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea = nullptr);
+	void clear(void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea = nullptr);
 
 	void blit(const vk::Image *src, vk::Image *dst, VkImageBlit region, VkFilter filter);
 	void copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch);
@@ -157,7 +157,7 @@
 		LEFT
 	};
 
-	bool fastClear(void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea);
+	bool fastClear(void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea);
 
 	Float4 readFloat4(Pointer<Byte> element, const State &state);
 	void write(Float4 &color, Pointer<Byte> element, const State &state);