SpirvShader: Implement GroupNonUniformShuffle capability

Implements the following opcodes:
• OpGroupNonUniformShuffle
• OpGroupNonUniformShuffleXor

Bug: b/133510501
Tests: dEQP-VK.subgroups.*
Change-Id: I85dcfebe3d56d7c5a73b7fdbcb88885e7e228747
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35089
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 93344e8..594f8d7 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -845,6 +845,7 @@
 				case spv::CapabilityDeviceGroup: capabilities.DeviceGroup = true; break;
 				case spv::CapabilityGroupNonUniformVote: capabilities.GroupNonUniformVote = true; break;
 				case spv::CapabilityGroupNonUniformBallot: capabilities.GroupNonUniformBallot = true; break;
+				case spv::CapabilityGroupNonUniformShuffle: capabilities.GroupNonUniformShuffle = true; break;
 				default:
 					UNSUPPORTED("Unsupported capability %u", insn.word(1));
 				}
@@ -1097,6 +1098,8 @@
 			case spv::OpGroupNonUniformBallotBitCount:
 			case spv::OpGroupNonUniformBallotFindLSB:
 			case spv::OpGroupNonUniformBallotFindMSB:
+			case spv::OpGroupNonUniformShuffle:
+			case spv::OpGroupNonUniformShuffleXor:
 			case spv::OpCopyObject:
 			case spv::OpArrayLength:
 				// Instructions that yield an intermediate value or divergent pointer
@@ -2754,6 +2757,8 @@
 		case spv::OpGroupNonUniformBallotBitCount:
 		case spv::OpGroupNonUniformBallotFindLSB:
 		case spv::OpGroupNonUniformBallotFindMSB:
+		case spv::OpGroupNonUniformShuffle:
+		case spv::OpGroupNonUniformShuffleXor:
 			return EmitGroupNonUniform(insn, state);
 
 		case spv::OpArrayLength:
@@ -6156,6 +6161,38 @@
 			break;
 		}
 
+		case spv::OpGroupNonUniformShuffle: // each lane reads `value` from the lane selected by its `id` operand
+		{
+			GenericValue value(this, state, insn.word(4)); // operand at word 4: the value to shuffle
+			GenericValue id(this, state, insn.word(5)); // operand at word 5: per-lane source lane index
+			auto x = CmpEQ(SIMD::Int(0), id.Int(0)); // used below as a per-lane select mask: lanes whose id == 0
+			auto y = CmpEQ(SIMD::Int(1), id.Int(0)); // lanes whose id == 1
+			auto z = CmpEQ(SIMD::Int(2), id.Int(0)); // lanes whose id == 2
+			auto w = CmpEQ(SIMD::Int(3), id.Int(0)); // lanes whose id == 3; only four source lanes (0..3) are handled here
+			for (auto i = 0u; i < type.sizeInComponents; i++) // shuffle every component of the result type independently
+			{
+				SIMD::Int v = value.Int(i); // component i of the value, treated as raw integer bits
+				dst.move(i, (x & v.xxxx) | (y & v.yyyy) | (z & v.zzzz) | (w & v.wwww)); // OR of masked swizzles; presumably v.xxxx etc. broadcast one lane of v to all lanes, and a lane whose id is outside 0..3 gets 0 -- TODO confirm
+			}
+			break;
+		}
+
+		case spv::OpGroupNonUniformShuffleXor: // each lane reads `value` from the lane at (own lane index ^ mask)
+		{
+			GenericValue value(this, state, insn.word(4)); // operand at word 4: the value to shuffle
+			GenericValue mask(this, state, insn.word(5)); // operand at word 5: XOR mask applied to the lane index
+			auto x = CmpEQ(SIMD::Int(0), SIMD::Int(0, 1, 2, 3) ^ mask.Int(0)); // used below as a select mask: lanes whose (index ^ mask) == 0
+			auto y = CmpEQ(SIMD::Int(1), SIMD::Int(0, 1, 2, 3) ^ mask.Int(0)); // lanes whose (index ^ mask) == 1
+			auto z = CmpEQ(SIMD::Int(2), SIMD::Int(0, 1, 2, 3) ^ mask.Int(0)); // lanes whose (index ^ mask) == 2
+			auto w = CmpEQ(SIMD::Int(3), SIMD::Int(0, 1, 2, 3) ^ mask.Int(0)); // lanes whose (index ^ mask) == 3; NOTE(review): the XOR expression is identical in all four lines and could be computed once
+			for (auto i = 0u; i < type.sizeInComponents; i++) // shuffle every component of the result type independently
+			{
+				SIMD::Int v = value.Int(i); // component i of the value, treated as raw integer bits
+				dst.move(i, (x & v.xxxx) | (y & v.yyyy) | (z & v.zzzz) | (w & v.wwww)); // OR of masked swizzles; presumably v.xxxx etc. broadcast one lane of v to all lanes, and a lane whose (index ^ mask) is outside 0..3 gets 0 -- TODO confirm
+			}
+			break;
+		}
+
 		default:
 			UNIMPLEMENTED("EmitGroupNonUniform op: %s", OpcodeName(type.opcode()).c_str());
 		}
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index ea3f0ce..c668f8f 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -739,6 +739,7 @@
 			bool DeviceGroup : 1;
 			bool GroupNonUniformVote : 1;
 			bool GroupNonUniformBallot : 1;
+			bool GroupNonUniformShuffle : 1;
 		};
 
 		Capabilities const &getUsedCapabilities() const
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index 2a44d8e..ec022be 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -313,7 +313,11 @@
 {
 	properties->subgroupSize = sw::SIMD::Width;
 	properties->supportedStages = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT;
-	properties->supportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT;
+	properties->supportedOperations =
+		VK_SUBGROUP_FEATURE_BASIC_BIT |
+		VK_SUBGROUP_FEATURE_VOTE_BIT |
+		VK_SUBGROUP_FEATURE_BALLOT_BIT |
+		VK_SUBGROUP_FEATURE_SHUFFLE_BIT;
 	properties->quadOperationsInAllStages = VK_FALSE;
 }