Fix unaligned access on depth values

This was tested by using __writeeflags(__readeflags() | 0x40000) to set the
AC (alignment check) flag in EFLAGS, which enables alignment checks on x86.

Bug: b/169957911
Change-Id: Ie97b2fda281548fac94b13abe93213a9a1495b0c
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/48929
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Sean Risser <srisser@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 9fbfe80..d9e50c3 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -410,9 +410,7 @@
 
 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
 	{
-		// FIXME: Properly optimizes?
-		zValue.xy = *Pointer<Float4>(buffer);
-		zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
+		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
 	}
 
 	Int4 zTest;
@@ -489,9 +487,8 @@
 
 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
 	{
-		// FIXME: Properly optimizes?
-		zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
-		zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
+		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
+		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
 	}
 
 	Int4 zTest;
@@ -559,9 +556,13 @@
 	}
 
 	if(state.depthFormat == VK_FORMAT_D16_UNORM)
+	{
 		return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
+	}
 	else
+	{
 		return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
+	}
 }
 
 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha)
@@ -603,16 +604,13 @@
 
 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
 	{
-		// FIXME: Properly optimizes?
-		zValue.xy = *Pointer<Float4>(buffer);
-		zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
+		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
 	}
 
 	Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
 	zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
 	Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
 
-	// FIXME: Properly optimizes?
 	*Pointer<Float2>(buffer) = Float2(Z.xy);
 	*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
 }
@@ -638,20 +636,16 @@
 
 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
 	{
-		// FIXME: Properly optimizes?
-		zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
-		zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
+		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
+		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
 	}
 
 	Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
 	zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
 	Z = Z | zValue;
 
-	// FIXME: Properly optimizes?
-	*Pointer<Short>(buffer) = Extract(Z, 0);
-	*Pointer<Short>(buffer + 2) = Extract(Z, 1);
-	*Pointer<Short>(buffer + pitch) = Extract(Z, 2);
-	*Pointer<Short>(buffer + pitch + 2) = Extract(Z, 3);
+	*Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
+	*Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
 }
 
 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
@@ -662,9 +656,13 @@
 	}
 
 	if(state.depthFormat == VK_FORMAT_D16_UNORM)
+	{
 		writeDepth16(zBuffer, q, x, z, zMask);
+	}
 	else
+	{
 		writeDepth32F(zBuffer, q, x, z, zMask);
+	}
 }
 
 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask)
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp
index 97053dc..d342595 100644
--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -4083,6 +4083,15 @@
 	*this = RValue<Float>(rhs.loadValue());
 }
 
+Float4::Float4(RValue<Float2> lo, RValue<Float2> hi)
+    : XYZW(this)
+{
+	int shuffle[4] = { 0, 1, 4, 5 };  // Result = { lo[0], lo[1], hi[0], hi[1] }; indices 4-5 address hi because operands are really 4 lanes wide (noted here as v4i32; presumably v4f32 for Float2 - TODO confirm)
+	Value *packed = Nucleus::createShuffleVector(lo.value(), hi.value(), shuffle);
+
+	storeValue(packed);
+}
+
 RValue<Float4> Float4::operator=(float x)
 {
 	return *this = Float4(x, x, x, x);
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 1e08698..e301976 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2254,6 +2254,7 @@
 	Float4(const Swizzle2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
 	template<int X, int Y>
 	Float4(const SwizzleMask2<Float4, X> &x, const SwizzleMask2<Float4, Y> &y);
+	Float4(RValue<Float2> lo, RValue<Float2> hi);
 
 	RValue<Float4> operator=(float replicate);
 	RValue<Float4> operator=(RValue<Float4> rhs);