SpirvShader: Implement GroupNonUniformBallot capability

Implements the following opcodes:
• OpGroupNonUniformBroadcast
• OpGroupNonUniformBroadcastFirst
• OpGroupNonUniformBallot
• OpGroupNonUniformInverseBallot
• OpGroupNonUniformBallotBitExtract
• OpGroupNonUniformBallotBitCount
• OpGroupNonUniformBallotFindLSB
• OpGroupNonUniformBallotFindMSB

Also stop yielding at control barriers whose execution scope is Subgroup: the yield is pointless there, and yielding does not currently work for graphics shaders.

Bug: b/133510501
Tests: dEQP-VK.subgroups.*
Change-Id: I39470bfa9f2184344d1c22e36975db0e23e48cc9
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35033
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index 3f4d442..01608fe 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -97,6 +97,51 @@
 			value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(0, 1, 2, 3));
 		});
 
+		setInputBuiltin(routine, spv::BuiltInSubgroupEqMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		{
+			ASSERT(builtin.SizeInComponents == 4);
+			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 2, 4, 8));
+			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		});
+
+		setInputBuiltin(routine, spv::BuiltInSubgroupGeMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		{
+			ASSERT(builtin.SizeInComponents == 4);
+			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(15, 14, 12, 8));
+			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		});
+
+		setInputBuiltin(routine, spv::BuiltInSubgroupGtMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		{
+			ASSERT(builtin.SizeInComponents == 4);
+			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(14, 12, 8, 0));
+			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		});
+
+		setInputBuiltin(routine, spv::BuiltInSubgroupLeMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		{
+			ASSERT(builtin.SizeInComponents == 4);
+			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 3, 7, 15));
+			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		});
+
+		setInputBuiltin(routine, spv::BuiltInSubgroupLtMask, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
+		{
+			ASSERT(builtin.SizeInComponents == 4);
+			value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(0, 1, 3, 7));
+			value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			value[builtin.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		});
+
 		setInputBuiltin(routine, spv::BuiltInDeviceIndex, [&](const SpirvShader::BuiltinMapping& builtin, Array<SIMD::Float>& value)
 		{
 			ASSERT(builtin.SizeInComponents == 1);
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index f245ea8..8780c39 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -59,6 +59,56 @@
 			routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<SIMD::Float>(SIMD::Int(0, 1, 2, 3));
 		}
 
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupEqMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 2, 4, 8));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupGeMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(15, 14, 12, 8));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupGtMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(14, 12, 8, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupLeMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 3, 7, 15));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupLtMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(0, 1, 3, 7));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
 		it = spirvShader->inputBuiltins.find(spv::BuiltInDeviceIndex);
 		if (it != spirvShader->inputBuiltins.end())
 		{
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index adfd1c3..c15cd79 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -62,6 +62,21 @@
 		return std::make_pair(whole, frac);
 	}
 
+	// Returns the population count (number of bits set to 1) of each lane of bits.
+	sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits)
+	{
+		// TODO: Add an intrinsic to reactor. Even if there isn't a
+		// single vector instruction, there may be target-dependent
+		// ways to make this faster.
+		// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+		sw::SIMD::UInt c = bits - ((bits >> 1) & sw::SIMD::UInt(0x55555555)); // each 2-bit field now holds the count of its two bits
+		c = ((c >> 2) & sw::SIMD::UInt(0x33333333)) + (c & sw::SIMD::UInt(0x33333333)); // sum adjacent pairs: per-4-bit counts
+		c = ((c >> 4) + c) & sw::SIMD::UInt(0x0F0F0F0F); // sum adjacent nibbles: per-byte counts
+		c = ((c >> 8) + c) & sw::SIMD::UInt(0x00FF00FF); // sum adjacent bytes: per-16-bit counts
+		c = ((c >> 16) + c) & sw::SIMD::UInt(0x0000FFFF); // sum both halves: total for the 32-bit value
+		return c;
+	}
+
 	// Returns 1 << bits.
 	// If the resulting bit overflows a 32 bit integer, 0 is returned.
 	rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits)
@@ -812,6 +827,7 @@
 				case spv::CapabilityGroupNonUniform: capabilities.GroupNonUniform = true; break;
 				case spv::CapabilityMultiView: capabilities.MultiView = true; break;
 				case spv::CapabilityDeviceGroup: capabilities.DeviceGroup = true; break;
+				case spv::CapabilityGroupNonUniformBallot: capabilities.GroupNonUniformBallot = true; break;
 				default:
 					UNSUPPORTED("Unsupported capability %u", insn.word(1));
 				}
@@ -1053,6 +1069,14 @@
 			case spv::OpImageRead:
 			case spv::OpImageTexelPointer:
 			case spv::OpGroupNonUniformElect:
+			case spv::OpGroupNonUniformBroadcast:
+			case spv::OpGroupNonUniformBroadcastFirst:
+			case spv::OpGroupNonUniformBallot:
+			case spv::OpGroupNonUniformInverseBallot:
+			case spv::OpGroupNonUniformBallotBitExtract:
+			case spv::OpGroupNonUniformBallotBitCount:
+			case spv::OpGroupNonUniformBallotFindLSB:
+			case spv::OpGroupNonUniformBallotFindMSB:
 			case spv::OpCopyObject:
 			case spv::OpArrayLength:
 				// Instructions that yield an intermediate value or divergent pointer
@@ -2699,6 +2723,14 @@
 			return EmitMemoryBarrier(insn, state);
 
 		case spv::OpGroupNonUniformElect:
+		case spv::OpGroupNonUniformBroadcast:
+		case spv::OpGroupNonUniformBroadcastFirst:
+		case spv::OpGroupNonUniformBallot:
+		case spv::OpGroupNonUniformInverseBallot:
+		case spv::OpGroupNonUniformBallotBitExtract:
+		case spv::OpGroupNonUniformBallotBitCount:
+		case spv::OpGroupNonUniformBallotFindLSB:
+		case spv::OpGroupNonUniformBallotFindMSB:
 			return EmitGroupNonUniform(insn, state);
 
 		case spv::OpArrayLength:
@@ -3289,20 +3321,8 @@
 				break;
 			}
 			case spv::OpBitCount:
-			{
-				// TODO: Add an intrinsic to reactor. Even if there isn't a
-				// single vector instruction, there may be target-dependent
-				// ways to make this faster.
-				// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-				auto v = src.UInt(i);
-				SIMD::UInt c = v - ((v >> 1) & SIMD::UInt(0x55555555));
-				c = ((c >> 2) & SIMD::UInt(0x33333333)) + (c & SIMD::UInt(0x33333333));
-				c = ((c >> 4) + c) & SIMD::UInt(0x0F0F0F0F);
-				c = ((c >> 8) + c) & SIMD::UInt(0x00FF00FF);
-				c = ((c >> 16) + c) & SIMD::UInt(0x0000FFFF);
-				dst.move(i, c);
+				dst.move(i, CountBits(src.UInt(i)));
 				break;
-			}
 			case spv::OpSNegate:
 				dst.move(i, -src.Int(i));
 				break;
@@ -5906,9 +5926,10 @@
 		switch (executionScope)
 		{
 		case spv::ScopeWorkgroup:
-		case spv::ScopeSubgroup:
 			Yield(YieldResult::ControlBarrier);
 			break;
+		case spv::ScopeSubgroup:
+			break;
 		default:
 			// See Vulkan 1.1 spec, Appendix A, Validation Rules within a Module.
 			UNREACHABLE("Scope for execution must be limited to Workgroup or Subgroup");
@@ -5938,6 +5959,8 @@
 
 	SpirvShader::EmitResult SpirvShader::EmitGroupNonUniform(InsnIterator insn, EmitState *state) const
 	{
+		static_assert(SIMD::Width == 4, "EmitGroupNonUniform makes many assumptions that the SIMD vector width is 4");
+
 		auto &type = getType(Type::ID(insn.word(1)));
 		Object::ID resultId = insn.word(2);
 		auto scope = spv::Scope(GetConstScalarInt(insn.word(3)));
@@ -5959,6 +5982,127 @@
 			dst.move(0, elect);
 			break;
 		}
+
+		case spv::OpGroupNonUniformBroadcast:
+		{
+			auto valueId = Object::ID(insn.word(4));
+			auto id = SIMD::Int(GetConstScalarInt(insn.word(5)));
+			GenericValue value(this, state, valueId);
+			auto mask = CmpEQ(id, SIMD::Int(0, 1, 2, 3));
+			for (auto i = 0u; i < type.sizeInComponents; i++)
+			{
+				auto oneVal = SIMD::Int(value.Int(i) & mask);
+				auto replVal = SIMD::Int(oneVal.xxzz | oneVal.yyww);
+				dst.move(i, replVal.xxyy | replVal.zzww);
+			}
+			break;
+		}
+
+		case spv::OpGroupNonUniformBroadcastFirst:
+		{
+			auto valueId = Object::ID(insn.word(4));
+			GenericValue value(this, state, valueId);
+			// Result is true only in the active invocation with the lowest id
+			// in the group, otherwise result is false.
+			SIMD::Int active = state->activeLaneMask();
+			// TODO: Would be nice if we could write this as:
+			//   elect = active & ~(active.Oxyz | active.OOxy | active.OOOx)
+			auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+			auto elect = active & ~(v0111 & (active.xxyz | active.xxxy | active.xxxx));
+			for (auto i = 0u; i < type.sizeInComponents; i++)
+			{
+				auto oneVal = SIMD::Int(value.Int(i) & elect);
+				auto replVal = SIMD::Int(oneVal.xxzz | oneVal.yyww);
+				dst.move(i, replVal.xxyy | replVal.zzww);
+			}
+			break;
+		}
+
+		case spv::OpGroupNonUniformBallot:
+		{
+			ASSERT(type.sizeInComponents == 4);
+			GenericValue predicate(this, state, insn.word(4));
+			dst.move(0, SIMD::Int(SignMask(state->activeLaneMask() & predicate.Int(0))));
+			dst.move(1, SIMD::Int(0));
+			dst.move(2, SIMD::Int(0));
+			dst.move(3, SIMD::Int(0));
+			break;
+		}
+
+		case spv::OpGroupNonUniformInverseBallot:
+		{
+			auto valueId = Object::ID(insn.word(4));
+			ASSERT(type.sizeInComponents == 1);
+			ASSERT(getType(getObject(valueId).type).sizeInComponents == 4);
+			GenericValue value(this, state, valueId);
+			auto bit = (value.Int(0) >> SIMD::Int(0, 1, 2, 3)) & SIMD::Int(1);
+			dst.move(0, -bit);
+			break;
+		}
+
+		case spv::OpGroupNonUniformBallotBitExtract:
+		{
+			auto valueId = Object::ID(insn.word(4));
+			auto indexId = Object::ID(insn.word(5));
+			ASSERT(type.sizeInComponents == 1);
+			ASSERT(getType(getObject(valueId).type).sizeInComponents == 4);
+			ASSERT(getType(getObject(indexId).type).sizeInComponents == 1);
+			GenericValue value(this, state, valueId);
+			GenericValue index(this, state, indexId);
+			auto vecIdx = index.Int(0) / SIMD::Int(32);
+			auto bitIdx = index.Int(0) & SIMD::Int(31);
+			auto bits =	(value.Int(0) & CmpEQ(vecIdx, SIMD::Int(0))) |
+			            (value.Int(1) & CmpEQ(vecIdx, SIMD::Int(1))) |
+			            (value.Int(2) & CmpEQ(vecIdx, SIMD::Int(2))) |
+			            (value.Int(3) & CmpEQ(vecIdx, SIMD::Int(3)));
+			dst.move(0, -((bits >> bitIdx) & SIMD::Int(1)));
+			break;
+		}
+
+		case spv::OpGroupNonUniformBallotBitCount:
+		{
+			auto operation = spv::GroupOperation(insn.word(4));
+			auto valueId = Object::ID(insn.word(5));
+			ASSERT(type.sizeInComponents == 1);
+			ASSERT(getType(getObject(valueId).type).sizeInComponents == 4);
+			GenericValue value(this, state, valueId);
+			switch (operation)
+			{
+			case spv::GroupOperationReduce:
+				dst.move(0, CountBits(value.UInt(0) & SIMD::UInt(15)));
+				break;
+			case spv::GroupOperationInclusiveScan:
+				dst.move(0, CountBits(value.UInt(0) & SIMD::UInt(1, 3, 7, 15)));
+				break;
+			case spv::GroupOperationExclusiveScan:
+				dst.move(0, CountBits(value.UInt(0) & SIMD::UInt(0, 1, 3, 7)));
+				break;
+			default:
+				UNSUPPORTED("GroupOperation %d", int(operation));
+			}
+			break;
+		}
+
+		case spv::OpGroupNonUniformBallotFindLSB:
+		{
+			auto valueId = Object::ID(insn.word(4));
+			ASSERT(type.sizeInComponents == 1);
+			ASSERT(getType(getObject(valueId).type).sizeInComponents == 4);
+			GenericValue value(this, state, valueId);
+			dst.move(0, Cttz(value.UInt(0) & SIMD::UInt(15), true));
+			break;
+		}
+
+		case spv::OpGroupNonUniformBallotFindMSB:
+		{
+			auto valueId = Object::ID(insn.word(4));
+			ASSERT(type.sizeInComponents == 1);
+			ASSERT(getType(getObject(valueId).type).sizeInComponents == 4);
+			GenericValue value(this, state, valueId);
+			dst.move(0, SIMD::UInt(31) - Ctlz(value.UInt(0) & SIMD::UInt(15), false));
+			break;
+		}
+
 		default:
 			UNIMPLEMENTED("EmitGroupNonUniform op: %s", OpcodeName(type.opcode()).c_str());
 		}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 44e252e..c3071bc 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -737,6 +737,7 @@
 			bool GroupNonUniform : 1;
 			bool MultiView : 1;
 			bool DeviceGroup : 1;
+			bool GroupNonUniformBallot : 1;
 		};
 
 		Capabilities const &getUsedCapabilities() const
diff --git a/src/Pipeline/VertexProgram.cpp b/src/Pipeline/VertexProgram.cpp
index e240e7f..cef5e8b 100644
--- a/src/Pipeline/VertexProgram.cpp
+++ b/src/Pipeline/VertexProgram.cpp
@@ -60,6 +60,56 @@
 			routine.getVariable(it->second.Id)[it->second.FirstComponent] = As<SIMD::Float>(SIMD::Int(0, 1, 2, 3));
 		}
 
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupEqMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 2, 4, 8));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupGeMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(15, 14, 12, 8));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupGtMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(14, 12, 8, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupLeMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(1, 3, 7, 15));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
+		it = spirvShader->inputBuiltins.find(spv::BuiltInSubgroupLtMask);
+		if (it != spirvShader->inputBuiltins.end())
+		{
+			ASSERT(it->second.SizeInComponents == 4);
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(0, 1, 3, 7));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+			routine.getVariable(it->second.Id)[it->second.FirstComponent + 3] = As<SIMD::Float>(SIMD::Int(0, 0, 0, 0));
+		}
+
 		it = spirvShader->inputBuiltins.find(spv::BuiltInDeviceIndex);
 		if (it != spirvShader->inputBuiltins.end())
 		{
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index d721c94..f38ff36 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -313,7 +313,7 @@
 {
 	properties->subgroupSize = sw::SIMD::Width;
 	properties->supportedStages = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT;
-	properties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT;
+	properties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT;
 	properties->quadOperationsInAllStages = VK_FALSE;
 }