Fix back-face culling for vertices near w=0 When the homogeneous w coordinate is near 0, the projected x or y coordinates (after viewport and subpixel scaling) may be larger than what's representable by 32-bit signed integers. On x86 this produces the "indefinite" integer value 0x80000000. This negative value is returned even for positive inputs. This causes the back-face culling calculations to reject triangles which are in fact visible. This change addresses that by introducing the rr::RoundIntClamped() function, which guarantees that very large positive floating-point values are converted to a positive integer value near INT_MAX. Note that the Vulkan spec states that "[the determination whether the triangle is back-facing or front-facing] is made based on the sign of the (clipped or unclipped) polygon’s area computed in framebuffer coordinates." Thus the alternative of performing culling using the unprojected coordinates would have to take negative viewport dimensions into account, and may not produce the same results as sub-pixel snapped coordinates. Adjust back-face culling of wireframe triangles to use the same calculations as solid triangles. Bug: b/177382194 Change-Id: Ice8493d4cfd164638f091bf5a39b5396d65458e2 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51648 Tested-by: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Alexis Hétu <sugoi@google.com>

commit: eeb81843c74adbbb81d9b282a8233a5907e7955c [log] [tgz]
author: Nicolas Capens <capn@google.com> Tue Jan 12 17:44:40 2021 -0500
committer: Nicolas Capens <nicolascapens@google.com> Wed Jan 13 18:09:24 2021 +0000
tree: 1a338ed7c85f29df419eb647b366fd288c7820a4
parent: 3549479dc4ccd92ef3b9e179080dcec1a430b01d [diff]
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 3cec3b2..064df1f 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp

@@ -712,11 +712,18 @@
 		const Vertex &v1 = triangles[i].v1;
 		const Vertex &v2 = triangles[i].v2;
 
-		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
-		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
-		          (v2.x * v1.y - v1.x * v2.y) * v0.w;
+		float A = ((float)v0.projected.y - (float)v2.projected.y) * (float)v1.projected.x +
+		          ((float)v2.projected.y - (float)v1.projected.y) * (float)v0.projected.x +
+		          ((float)v1.projected.y - (float)v0.projected.y) * (float)v2.projected.x;  // Area
 
-		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
+		int w0w1w2 = bit_cast<int>(v0.w) ^
+		             bit_cast<int>(v1.w) ^
+		             bit_cast<int>(v2.w);
+
+		A = w0w1w2 < 0 ? -A : A;
+
+		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (A >= 0.0f) : (A <= 0.0f);
+
 		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 		{
 			if(frontFacing) continue;

diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index a360184..f6c659a 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp

@@ -81,18 +81,13 @@
 
 			Float A = (y0 - y2) * x1 + (y2 - y1) * x0 + (y1 - y0) * x2;  // Area
 
-			If(A == 0.0f)
-			{
-				Return(0);
-			}
-
 			Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, w)) ^
 			             *Pointer<Int>(v1 + OFFSET(Vertex, w)) ^
 			             *Pointer<Int>(v2 + OFFSET(Vertex, w));
 
 			A = IfThenElse(w0w1w2 < 0, -A, A);
 
-			Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? A > 0.0f : A < 0.0f;
+			Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (A >= 0.0f) : (A <= 0.0f);
 
 			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 			{

diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 48928e2..cc095b7 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp

@@ -546,8 +546,8 @@
 	Float4 rhw = Float4(1.0f) / w;
 
 	Vector4f proj;
-	proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
-	proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
+	proj.x = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
+	proj.y = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
 	proj.z = pos.z * rhw;
 	proj.w = rhw;
 

diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 7d49146..bcb5416 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp

@@ -2674,6 +2674,21 @@
 #endif
 }
 
+RValue<Int4> RoundIntClamped(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	// cvtps2dq produces 0x80000000, a negative value, for input larger than
+	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
+	// saturate to 0x80000000.
+	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
+#else
+	// ARM saturates to the largest positive or negative integer. Unit tests
+	// verify that lowerRoundInt() behaves as desired.
+	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
+#endif
+}
+
 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();

diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index fbaceaa..d3e1d5a 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp

@@ -2007,7 +2007,14 @@
 }
 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
+// Convert to nearest integer. If a converted value is outside of the integer
+// range, the returned result is undefined.
 RValue<Int4> RoundInt(RValue<Float4> cast);
+// Rounds to the nearest integer, but clamps very large values to an
+// implementation-dependent range.
+// Specifically, on x86, values larger than 2147483583.0 are converted to
+// 2147483583 (0x7FFFFFBF) instead of producing 0x80000000.
+RValue<Int4> RoundIntClamped(RValue<Float4> cast);
 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
 RValue<Int> Extract(RValue<Int4> val, int i);

diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index bcc359f..ce4475e 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp

@@ -3589,6 +3589,33 @@
 	}
 }
 
+RValue<Int4> RoundIntClamped(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+
+	// cvtps2dq produces 0x80000000, a negative value, for input larger than
+	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
+	// saturate to 0x80000000.
+	RValue<Float4> clamped = Min(cast, Float4(0x7FFFFF80));
+
+	if(emulateIntrinsics || CPUID::ARM)
+	{
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return Int4((clamped + Float4(0x00C00000)) - Float4(0x00C00000));
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		nearbyint->addArg(clamped.value());
+		::basicBlock->appendInst(nearbyint);
+
+		return RValue<Int4>(V(result));
+	}
+}
+
 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();

diff --git a/tests/ReactorUnitTests/ReactorUnitTests.cpp b/tests/ReactorUnitTests/ReactorUnitTests.cpp
index 7b5bb1b..3a42695 100644
--- a/tests/ReactorUnitTests/ReactorUnitTests.cpp
+++ b/tests/ReactorUnitTests/ReactorUnitTests.cpp

@@ -780,6 +780,40 @@
 	EXPECT_EQ(out[8][3], 0x00000000u);
 }
 
+TEST(ReactorUnitTests, RoundInt)
+{
+	FunctionT<int(void *)> function;
+	{
+		Pointer<Byte> out = function.Arg<0>();
+
+		*Pointer<Int4>(out + 0) = RoundInt(Float4(3.1f, 3.6f, -3.1f, -3.6f));
+		*Pointer<Int4>(out + 16) = RoundIntClamped(Float4(2147483648.0f, -2147483648.0f, 2147483520, -2147483520));
+
+		Return(0);
+	}
+
+	auto routine = function(testName().c_str());
+
+	int out[2][4];
+
+	memset(&out, 0, sizeof(out));
+
+	routine(&out);
+
+	EXPECT_EQ(out[0][0], 3);
+	EXPECT_EQ(out[0][1], 4);
+	EXPECT_EQ(out[0][2], -3);
+	EXPECT_EQ(out[0][3], -4);
+
+	// x86 returns 0x80000000 for values which cannot be represented in a 32-bit
+	// integer, but RoundIntClamped() clamps to ensure a positive value for
+	// positive input. ARM saturates to the largest representable integers.
+	EXPECT_GE(out[1][0], 2147483520);
+	EXPECT_LT(out[1][1], -2147483647);
+	EXPECT_EQ(out[1][2], 2147483520);
+	EXPECT_EQ(out[1][3], -2147483520);
+}
+
 TEST(ReactorUnitTests, FPtoUI)
 {
 	FunctionT<int(void *)> function;
commit	eeb81843c74adbbb81d9b282a8233a5907e7955c	[log] [tgz]
author	Nicolas Capens <capn@google.com>	Tue Jan 12 17:44:40 2021 -0500
committer	Nicolas Capens <nicolascapens@google.com>	Wed Jan 13 18:09:24 2021 +0000
tree	1a338ed7c85f29df419eb647b366fd288c7820a4
parent	3549479dc4ccd92ef3b9e179080dcec1a430b01d [diff]