Fix back-face culling for vertices near w=0

When the homogeneous w coordinate is near 0, the projected x or y
coordinates (after viewport and subpixel scaling) may be larger than
what's representable by 32-bit signed integers. On x86 this produces
the "indefinite" integer value 0x80000000. This negative value is
returned even for positive inputs. This causes the back-face culling
calculations to reject triangles which are in fact visible.

This change addresses that by introducing the rr::RoundIntClamped()
function, which guarantees that very large positive floating-point
values are converted to a positive integer value near INT_MAX.

Note that the Vulkan spec states that "[the determination whether the
triangle is back-facing or front-facing] is made based on the sign of
the (clipped or unclipped) polygon’s area computed in framebuffer
coordinates." Thus the alternative of performing culling using the
unprojected coordinates would have to take negative viewport dimensions
into account, and may not produce the same results as sub-pixel
snapped coordinates.

Adjust back-face culling of wireframe triangles to use the same
calculations as solid triangles.

Bug: b/177382194
Change-Id: Ice8493d4cfd164638f091bf5a39b5396d65458e2
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51648
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 3cec3b2..064df1f 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -712,11 +712,18 @@
 		const Vertex &v1 = triangles[i].v1;
 		const Vertex &v2 = triangles[i].v2;
 
-		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
-		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
-		          (v2.x * v1.y - v1.x * v2.y) * v0.w;
+		float A = ((float)v0.projected.y - (float)v2.projected.y) * (float)v1.projected.x +
+		          ((float)v2.projected.y - (float)v1.projected.y) * (float)v0.projected.x +
+		          ((float)v1.projected.y - (float)v0.projected.y) * (float)v2.projected.x;  // Area
 
-		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+		int w0w1w2 = bit_cast<int>(v0.w) ^
+		             bit_cast<int>(v1.w) ^
+		             bit_cast<int>(v2.w);
+
+		A = w0w1w2 < 0 ? -A : A;
+
+		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (A >= 0.0f) : (A <= 0.0f);
+
 		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 		{
 			if(frontFacing) continue;
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index a360184..f6c659a 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -81,18 +81,13 @@
 
 			Float A = (y0 - y2) * x1 + (y2 - y1) * x0 + (y1 - y0) * x2;  // Area
 
-			If(A == 0.0f)
-			{
-				Return(0);
-			}
-
 			Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, w)) ^
 			             *Pointer<Int>(v1 + OFFSET(Vertex, w)) ^
 			             *Pointer<Int>(v2 + OFFSET(Vertex, w));
 
 			A = IfThenElse(w0w1w2 < 0, -A, A);
 
-			Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? A > 0.0f : A < 0.0f;
+			Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (A >= 0.0f) : (A <= 0.0f);
 
 			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 			{
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 48928e2..cc095b7 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -546,8 +546,8 @@
 	Float4 rhw = Float4(1.0f) / w;
 
 	Vector4f proj;
-	proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
-	proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
+	proj.x = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
+	proj.y = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
 	proj.z = pos.z * rhw;
 	proj.w = rhw;
 
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 7d49146..bcb5416 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2674,6 +2674,21 @@
 #endif
 }
 
+RValue<Int4> RoundIntClamped(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	// cvtps2dq produces 0x80000000, a negative value, for input larger than
+	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
+	// saturate to 0x80000000.
+	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
+#else
+	// ARM saturates to the largest positive or negative integer. Unit tests
+	// verify that lowerRoundInt() behaves as desired.
+	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
+#endif
+}
+
 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index fbaceaa..d3e1d5a 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2007,7 +2007,14 @@
 }
 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
+// Convert to nearest integer. If a converted value is outside of the integer
+// range, the returned result is undefined.
 RValue<Int4> RoundInt(RValue<Float4> cast);
+// Rounds to the nearest integer, but clamps very large values to an
+// implementation-dependent range.
+// Specifically, on x86, values larger than 2147483583.0 are converted to
+// 2147483583 (0x7FFFFFBF) instead of producing 0x80000000.
+RValue<Int4> RoundIntClamped(RValue<Float4> cast);
 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
 RValue<Int> Extract(RValue<Int4> val, int i);
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index bcc359f..ce4475e 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3589,6 +3589,33 @@
 	}
 }
 
+RValue<Int4> RoundIntClamped(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	// cvtps2dq produces 0x80000000, a negative value, for input larger than
+	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
+	// saturate to 0x80000000.
+	RValue<Float4> clamped = Min(cast, Float4(0x7FFFFF80));
+
+	if(emulateIntrinsics || CPUID::ARM)
+	{
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return Int4((clamped + Float4(0x00C00000)) - Float4(0x00C00000));
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		nearbyint->addArg(clamped.value());
+		::basicBlock->appendInst(nearbyint);
+
+		return RValue<Int4>(V(result));
+	}
+}
+
 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
diff --git a/tests/ReactorUnitTests/ReactorUnitTests.cpp b/tests/ReactorUnitTests/ReactorUnitTests.cpp
index 7b5bb1b..3a42695 100644
--- a/tests/ReactorUnitTests/ReactorUnitTests.cpp
+++ b/tests/ReactorUnitTests/ReactorUnitTests.cpp
@@ -780,6 +780,40 @@
 	EXPECT_EQ(out[8][3], 0x00000000u);
 }
 
+TEST(ReactorUnitTests, RoundInt)
+{
+	FunctionT<int(void *)> function;
+	{
+		Pointer<Byte> out = function.Arg<0>();
+
+		*Pointer<Int4>(out + 0) = RoundInt(Float4(3.1f, 3.6f, -3.1f, -3.6f));
+		*Pointer<Int4>(out + 16) = RoundIntClamped(Float4(2147483648.0f, -2147483648.0f, 2147483520, -2147483520));
+
+		Return(0);
+	}
+
+	auto routine = function(testName().c_str());
+
+	int out[2][4];
+
+	memset(&out, 0, sizeof(out));
+
+	routine(&out);
+
+	EXPECT_EQ(out[0][0], 3);
+	EXPECT_EQ(out[0][1], 4);
+	EXPECT_EQ(out[0][2], -3);
+	EXPECT_EQ(out[0][3], -4);
+
+	// x86 returns 0x80000000 for values which cannot be represented in a 32-bit
+	// integer, but RoundIntClamped() clamps to ensure a positive value for
+	// positive input. ARM saturates to the largest representable integers.
+	EXPECT_GE(out[1][0], 2147483520);
+	EXPECT_LT(out[1][1], -2147483647);
+	EXPECT_EQ(out[1][2], 2147483520);
+	EXPECT_EQ(out[1][3], -2147483520);
+}
+
 TEST(ReactorUnitTests, FPtoUI)
 {
 	FunctionT<int(void *)> function;