Vulkan: Implement shaderClipDistance and shaderCullDistance

The clip distance is used to perform per-fragment clipping: any fragment whose linearly interpolated distance is negative is discarded.

The cull distance is used to perform per-primitive culling: any primitive whose vertices all have a negative distance is discarded.

Bug: b/139207336
Tests: dEQP-VK.clipping.*
Change-Id: Ia6680601b27599152f68410df47aaaa726d0b349
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/34915
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Device/Config.hpp b/src/Device/Config.hpp
index 76d0077..e1e0235 100644
--- a/src/Device/Config.hpp
+++ b/src/Device/Config.hpp
@@ -50,7 +50,8 @@
 		OUTLINE_RESOLUTION = 8192,   // Maximum vertical resolution of the render target
 		MIPMAP_LEVELS = 14,
 		MAX_UNIFORM_BLOCK_SIZE = 16384,
-		MAX_CLIP_PLANES = 6,
+		MAX_CLIP_DISTANCES = 8,
+		MAX_CULL_DISTANCES = 8,
 		MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS = 64,
 		MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS = 64,
 		MIN_TEXEL_OFFSET = -8,
diff --git a/src/Device/Primitive.hpp b/src/Device/Primitive.hpp
index 0d23ec7..45b2e42 100644
--- a/src/Device/Primitive.hpp
+++ b/src/Device/Primitive.hpp
@@ -65,6 +65,9 @@
 		PlaneEquation w;
 		PlaneEquation V[MAX_INTERFACE_COMPONENTS];
 
+		PlaneEquation clipDistance[MAX_CLIP_DISTANCES];
+		PlaneEquation cullDistance[MAX_CULL_DISTANCES];
+
 		// Masks for two-sided stencil
 		int64_t clockwiseMask;
 		int64_t invClockwiseMask;
diff --git a/src/Device/QuadRasterizer.cpp b/src/Device/QuadRasterizer.cpp
index b070d97..ca4dc2d 100644
--- a/src/Device/QuadRasterizer.cpp
+++ b/src/Device/QuadRasterizer.cpp
@@ -157,6 +157,18 @@
 									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, V[interpolant].B), 16);
 						}
 					}
+
+					for (int i = 0; i < MAX_CLIP_DISTANCES; i++)
+					{
+						DclipDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].C), 16) +
+									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, clipDistance[i].B), 16);
+					}
+
+					for (int i = 0; i < MAX_CULL_DISTANCES; i++)
+					{
+						DcullDistance[i] = *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].C), 16) +
+									yyyy * *Pointer<Float4>(primitive + OFFSET(Primitive, cullDistance[i].B), 16);
+					}
 				}
 
 				Short4 xLeft[4];
diff --git a/src/Device/QuadRasterizer.hpp b/src/Device/QuadRasterizer.hpp
index 2dcff55..6d349e7 100644
--- a/src/Device/QuadRasterizer.hpp
+++ b/src/Device/QuadRasterizer.hpp
@@ -37,6 +37,8 @@
 		Float4 Dw;
 		Float4 Dv[MAX_INTERFACE_COMPONENTS];
 		Float4 Df;
+		Float4 DclipDistance[MAX_CLIP_DISTANCES];
+		Float4 DcullDistance[MAX_CULL_DISTANCES];
 
 		UInt occlusion;
 
diff --git a/src/Device/Renderer.cpp b/src/Device/Renderer.cpp
index 23946bb..46bed2a 100644
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -624,25 +624,32 @@
 			Vertex &v1 = triangles->v1;
 			Vertex &v2 = triangles->v2;
 
-			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
+			Polygon polygon(&v0.position, &v1.position, &v2.position);
+
+
+			if((v0.cullMask | v1.cullMask | v2.cullMask) == 0)
 			{
-				Polygon polygon(&v0.position, &v1.position, &v2.position);
+				continue;
+			}
 
-				int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags;
+			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) != Clipper::CLIP_FINITE)
+			{
+				continue;
+			}
 
-				if(clipFlagsOr != Clipper::CLIP_FINITE)
+			int clipFlagsOr = v0.clipFlags | v1.clipFlags | v2.clipFlags;
+			if(clipFlagsOr != Clipper::CLIP_FINITE)
+			{
+				if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
 				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
-					{
-						continue;
-					}
+					continue;
 				}
+			}
 
-				if(drawCall->setupRoutine(primitives, triangles, &polygon, data))
-				{
-					primitives += ms;
-					visible++;
-				}
+			if(drawCall->setupRoutine(primitives, triangles, &polygon, data))
+			{
+				primitives += ms;
+				visible++;
 			}
 		}
 
@@ -793,6 +800,11 @@
 		Vertex &v0 = triangle.v0;
 		Vertex &v1 = triangle.v1;
 
+		if((v0.cullMask | v1.cullMask) == 0)
+		{
+			return false;
+		}
+
 		const float4 &P0 = v0.position;
 		const float4 &P1 = v1.position;
 
@@ -1082,6 +1094,11 @@
 
 		Vertex &v = triangle.v0;
 
+		if(v.cullMask == 0)
+		{
+			return false;
+		}
+
 		float pSize = v.pointSize;
 
 		pSize = clamp(pSize, 1.0f, static_cast<float>(vk::MAX_POINT_SIZE));
diff --git a/src/Device/Vertex.hpp b/src/Device/Vertex.hpp
index 611416b..050a925 100644
--- a/src/Device/Vertex.hpp
+++ b/src/Device/Vertex.hpp
@@ -39,6 +39,9 @@
 		float pointSize;
 
 		int clipFlags;
+		int cullMask;  // 0 or -1 per vertex; a primitive whose vertices are all 0 is skipped by the renderer
+		float clipDistance[MAX_CLIP_DISTANCES];  // Shader ClipDistance outputs; unused entries are written as 0
+		float cullDistance[MAX_CULL_DISTANCES];  // Shader CullDistance outputs; unused entries are written as 0
 
 		alignas(16) struct
 		{
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 6908aa0..eb60628 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -162,6 +162,50 @@
 				}
 
 				setBuiltins(x, y, z, w, cMask);
+
+				for (uint32_t i = 0; i < MAX_CLIP_DISTANCES; i++)
+				{
+					auto distance = interpolate(xxxx, DclipDistance[i], rhw,
+												primitive + OFFSET(Primitive, clipDistance[i]),
+												false, true, false);
+
+					auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
+					for (auto ms = 0u; ms < state.multiSample; ms++)
+					{
+						// TODO: Fragments discarded by clipping do not exist at
+						// all -- they should not be counted in queries or have
+						// their Z/S effects performed when early fragment tests
+						// are enabled.
+						cMask[ms] &= clipMask;
+					}
+
+					if (spirvShader->getUsedCapabilities().ClipDistance)
+					{
+						auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
+						if(it != spirvShader->inputBuiltins.end())
+						{
+							if (i < it->second.SizeInComponents)
+							{
+								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
+							}
+						}
+					}
+				}
+
+				if (spirvShader->getUsedCapabilities().CullDistance)
+				{
+					auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
+					if(it != spirvShader->inputBuiltins.end())
+					{
+						for (uint32_t i = 0; i < it->second.SizeInComponents; i++)
+						{
+							routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
+									interpolate(xxxx, DcullDistance[i], rhw,
+												primitive + OFFSET(Primitive, cullDistance[i]),
+												false, true, false);
+						}
+					}
+				}
 			}
 
 			Bool alphaPass = true;
diff --git a/src/Pipeline/SetupRoutine.cpp b/src/Pipeline/SetupRoutine.cpp
index 2f7aab8..273f02c 100644
--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -452,17 +452,33 @@
 							OFFSET(Vertex, v[interpolant]),
 							OFFSET(Primitive, V[interpolant]),
 							state.gradient[interpolant].Flat,
-							!state.gradient[interpolant].NoPerspective, 0);
+							!state.gradient[interpolant].NoPerspective);
 				}
 			}
 
+			for (int i = 0; i < MAX_CLIP_DISTANCES; i++)
+			{
+				setupGradient(primitive, tri, w012, M, v0, v1, v2,
+						OFFSET(Vertex, clipDistance[i]),
+						OFFSET(Primitive, clipDistance[i]),
+						false, true);
+			}
+
+			for (int i = 0; i < MAX_CULL_DISTANCES; i++)
+			{
+				setupGradient(primitive, tri, w012, M, v0, v1, v2,
+						OFFSET(Vertex, cullDistance[i]),
+						OFFSET(Primitive, cullDistance[i]),
+						false, true);
+			}
+
 			Return(1);
 		}
 
 		routine = function("SetupRoutine");
 	}
 
-	void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool perspective, int component)
+	void SetupRoutine::setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flat, bool perspective)
 	{
 		if(!flat)
 		{
diff --git a/src/Pipeline/SetupRoutine.hpp b/src/Pipeline/SetupRoutine.hpp
index 953bf42..c871aff 100644
--- a/src/Pipeline/SetupRoutine.hpp
+++ b/src/Pipeline/SetupRoutine.hpp
@@ -33,7 +33,7 @@
 		SetupFunction::RoutineType getRoutine();
 
 	private:
-		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool perspective, int component);
+		void setupGradient(Pointer<Byte> &primitive, Pointer<Byte> &triangle, Float4 &w012, Float4 (&m)[3], Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2, int attribute, int planeEquation, bool flatShading, bool perspective);
 		void edge(Pointer<Byte> &primitive, Pointer<Byte> &data, const Int &Xa, const Int &Ya, const Int &Xb, const Int &Yb, Int &q);
 		void conditionalRotate1(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
 		void conditionalRotate2(Bool condition, Pointer<Byte> &v0, Pointer<Byte> &v1, Pointer<Byte> &v2);
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 8621985..bb118b0 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -351,21 +351,23 @@
 				{
 				case spv::CapabilityMatrix: capabilities.Matrix = true; break;
 				case spv::CapabilityShader: capabilities.Shader = true; break;
+				case spv::CapabilityClipDistance: capabilities.ClipDistance = true; break;
+				case spv::CapabilityCullDistance: capabilities.CullDistance = true; break;
 				case spv::CapabilityInputAttachment: capabilities.InputAttachment = true; break;
 				case spv::CapabilitySampled1D: capabilities.Sampled1D = true; break;
 				case spv::CapabilityImage1D: capabilities.Image1D = true; break;
 				case spv::CapabilitySampledBuffer: capabilities.SampledBuffer = true; break;
 				case spv::CapabilityImageBuffer: capabilities.ImageBuffer = true; break;
+				case spv::CapabilityStorageImageExtendedFormats: capabilities.StorageImageExtendedFormats = true; break;
 				case spv::CapabilityImageQuery: capabilities.ImageQuery = true; break;
 				case spv::CapabilityDerivativeControl: capabilities.DerivativeControl = true; break;
 				case spv::CapabilityGroupNonUniform: capabilities.GroupNonUniform = true; break;
-				case spv::CapabilityMultiView: capabilities.MultiView = true; break;
-				case spv::CapabilityDeviceGroup: capabilities.DeviceGroup = true; break;
 				case spv::CapabilityGroupNonUniformVote: capabilities.GroupNonUniformVote = true; break;
 				case spv::CapabilityGroupNonUniformBallot: capabilities.GroupNonUniformBallot = true; break;
 				case spv::CapabilityGroupNonUniformShuffle: capabilities.GroupNonUniformShuffle = true; break;
 				case spv::CapabilityGroupNonUniformShuffleRelative: capabilities.GroupNonUniformShuffleRelative = true; break;
-				case spv::CapabilityStorageImageExtendedFormats: capabilities.StorageImageExtendedFormats = true; break;
+				case spv::CapabilityDeviceGroup: capabilities.DeviceGroup = true; break;
+				case spv::CapabilityMultiView: capabilities.MultiView = true; break;
 				default:
 					UNSUPPORTED("Unsupported capability %u", insn.word(1));
 				}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index d453153..a99fc6f 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -484,21 +484,23 @@
 		{
 			bool Matrix : 1;
 			bool Shader : 1;
+			bool ClipDistance : 1;
+			bool CullDistance : 1;
 			bool InputAttachment : 1;
 			bool Sampled1D : 1;
 			bool Image1D : 1;
 			bool SampledBuffer : 1;
 			bool ImageBuffer : 1;
+			bool StorageImageExtendedFormats : 1;
 			bool ImageQuery : 1;
 			bool DerivativeControl : 1;
 			bool GroupNonUniform : 1;
-			bool MultiView : 1;
-			bool DeviceGroup : 1;
 			bool GroupNonUniformVote : 1;
 			bool GroupNonUniformBallot : 1;
 			bool GroupNonUniformShuffle : 1;
 			bool GroupNonUniformShuffleRelative : 1;
-			bool StorageImageExtendedFormats : 1;
+			bool DeviceGroup : 1;
+			bool MultiView : 1;
 		};
 
 		Capabilities const &getUsedCapabilities() const
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 1c90a6e..317c4c6 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -62,6 +62,7 @@
 				readInput(batch);
 				program(batch, vertexCount);
 				computeClipFlags();
+				computeCullMask();
 
 				writeCache(vertexCache, tagCache, batch);
 			}
@@ -143,6 +144,26 @@
 		clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
 	}
 
+	void VertexRoutine::computeCullMask()  // Builds a 4-bit mask, one bit per vertex lane; a bit stays set only if every cull distance of that lane is >= 0.
+	{
+		cullMask = Int(15);  // Start with all four lane bits set, i.e. no vertex marked as culled.
+
+		if (spirvShader->getUsedCapabilities().CullDistance)  // Without the CullDistance capability the mask stays at 15.
+		{
+			auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
+			if (it != spirvShader->outputBuiltins.end())  // Only shaders that actually write the builtin can clear bits.
+			{
+				auto &var = routine.getVariable(it->second.Id);
+				for (uint32_t i = 0; i < it->second.SizeInComponents; i++)  // One iteration per cull distance component.
+				{
+					auto const &distance = var[it->second.FirstComponent + i];
+					auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));  // Presumably one bit per lane, set where distance >= 0; bits 0..3 are what writeCache reads back.
+					cullMask &= mask;  // NOTE(review): any negative distance clears the lane, and the renderer culls when all vertices are cleared - not only when one common distance is negative on all vertices; confirm against the Vulkan culling rule.
+				}
+			}
+		}
+	}
+
 	Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
 	                                   bool robustBufferAccess, UInt & robustnessSize, Int baseVertex)
 	{
@@ -154,9 +175,9 @@
 		//    bytes of memory past the end of the buffer, up to the end of the bound range)."
 		UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);
 
-		Pointer<Byte> source0 = buffer + offsets.x;

-		Pointer<Byte> source1 = buffer + offsets.y;

-		Pointer<Byte> source2 = buffer + offsets.z;

+		Pointer<Byte> source0 = buffer + offsets.x;
+		Pointer<Byte> source1 = buffer + offsets.y;
+		Pointer<Byte> source2 = buffer + offsets.z;
 		Pointer<Byte> source3 = buffer + offsets.w;
 
 		UInt4 zero(0);
@@ -564,7 +585,7 @@
 		it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
 		if(it != spirvShader->outputBuiltins.end())
 		{
-			assert(it->second.SizeInComponents == 1);
+			ASSERT(it->second.SizeInComponents == 1);
 			auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
 
 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,pointSize)) = Extract(psize, 3);
@@ -573,11 +594,66 @@
 			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0);
 		}
 
+		uint32_t clipIndex = 0;
+		if (spirvShader->getUsedCapabilities().ClipDistance)
+		{
+			it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
+			if(it != spirvShader->outputBuiltins.end())
+			{
+				ASSERT(it->second.SizeInComponents <= MAX_CLIP_DISTANCES);
+				for(; clipIndex < it->second.SizeInComponents; clipIndex++)
+				{
+					auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + clipIndex];
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 3);
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 2);
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 1);
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipDistance[clipIndex])) = Extract(dist, 0);
+				}
+			}
+		}
+		for(; clipIndex < MAX_CLIP_DISTANCES; clipIndex++)
+		{
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipDistance[clipIndex])) = Float(0);
+		}
+
+		uint32_t cullIndex = 0;
+		if (spirvShader->getUsedCapabilities().CullDistance)
+		{
+			it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
+			if(it != spirvShader->outputBuiltins.end())
+			{
+				ASSERT(it->second.SizeInComponents <= MAX_CULL_DISTANCES);
+				for(; cullIndex < it->second.SizeInComponents; cullIndex++)
+				{
+					auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + cullIndex];
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 3);
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 2);
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 1);
+					*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullDistance[cullIndex])) = Extract(dist, 0);
+				}
+			}
+		}
+		for(; cullIndex < MAX_CULL_DISTANCES; cullIndex++)
+		{
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
+			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullDistance[cullIndex])) = Float(0);
+		}
+
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 8)  & 0x0000000FF;
 		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 0)  & 0x0000000FF;
 
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,cullMask)) = -((cullMask >> 3) & 1);
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,cullMask)) = -((cullMask >> 2) & 1);
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,cullMask)) = -((cullMask >> 1) & 1);
+		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,cullMask)) = -((cullMask >> 0) & 1);
+
 		transpose4x4(proj.x, proj.y, proj.z, proj.w);
 
 		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,projected), 16) = proj.w;
@@ -614,6 +690,7 @@
 		*Pointer<Int>(vertex + OFFSET(Vertex,pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,pointSize));
 
 		*Pointer<Int>(vertex + OFFSET(Vertex,clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,clipFlags));
+		*Pointer<Int>(vertex + OFFSET(Vertex,cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex,cullMask));
 		*Pointer<Int4>(vertex + OFFSET(Vertex,projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex,projected));
 
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
@@ -623,5 +700,13 @@
 				*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
 			}
 		}
+		for(int i = 0; i < MAX_CLIP_DISTANCES; i++)
+		{
+			*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
+		}
+		for(int i = 0; i < MAX_CULL_DISTANCES; i++)
+		{
+			*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
+		}
 	}
 }
diff --git a/src/Pipeline/VertexRoutine.hpp b/src/Pipeline/VertexRoutine.hpp
index 69dfa7c..99e7191 100644
--- a/src/Pipeline/VertexRoutine.hpp
+++ b/src/Pipeline/VertexRoutine.hpp
@@ -55,6 +55,7 @@
 		Pointer<Byte> constants;
 
 		Int clipFlags;
+		Int cullMask;
 
 		SpirvRoutine routine;
 
@@ -70,6 +71,7 @@
 		                    bool robustBufferAccess, UInt& robustnessSize, Int baseVertex);
 		void readInput(Pointer<UInt> &batch);
 		void computeClipFlags();
+		void computeCullMask();
 		void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch);
 		void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry);
 	};
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 862bc20..c86853f 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -84,8 +84,8 @@
 		VK_FALSE,  // shaderSampledImageArrayDynamicIndexing
 		VK_FALSE,  // shaderStorageBufferArrayDynamicIndexing
 		VK_FALSE,  // shaderStorageImageArrayDynamicIndexing
-		VK_FALSE,  // shaderClipDistance
-		VK_FALSE,  // shaderCullDistance
+		VK_TRUE,   // shaderClipDistance
+		VK_TRUE,   // shaderCullDistance
 		VK_FALSE,  // shaderFloat64
 		VK_FALSE,  // shaderInt64
 		VK_FALSE,  // shaderInt16
@@ -269,9 +269,9 @@
 		1, // maxSampleMaskWords
 		VK_FALSE, // timestampComputeAndGraphics
 		60, // timestampPeriod
-		8, // maxClipDistances
-		8, // maxCullDistances
-		8, // maxCombinedClipAndCullDistances
+		sw::MAX_CLIP_DISTANCES, // maxClipDistances
+		sw::MAX_CULL_DISTANCES, // maxCullDistances
+		sw::MAX_CLIP_DISTANCES + sw::MAX_CULL_DISTANCES, // maxCombinedClipAndCullDistances
 		2, // discreteQueuePriorities
 		{ 1.0, vk::MAX_POINT_SIZE }, // pointSizeRange[2]
 		{ 1.0, 1.0 }, // lineWidthRange[2] (unsupported)