SpirvShader: Implement OpControlBarrier.
Use the new Reactor coroutines to yield when hitting an
OpControlBarrier. Reaching a barrier pushes the subgroup's coroutine to
the back of the workgroup's invocation queue, which brings every
subgroup to the barrier before any of them continues past it.
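
In sketch form (the Coroutine struct below is a stand-in for
rr::Stream<SpirvShader::YieldResult>, and await() models its assumed
behaviour of returning true while the subgroup is suspended at a
barrier and false once it has finished), the per-workgroup scheduling
loop looks like:

    #include <queue>

    struct Coroutine
    {
        int barriersRemaining = 2;  // e.g. two OpControlBarriers in the shader
        // True if the subgroup yielded at a barrier, false when done.
        bool await() { return barriersRemaining-- > 0; }
    };

    void runWorkgroup(int subgroupsPerWorkgroup)
    {
        std::queue<Coroutine> coroutines;
        for (int i = 0; i < subgroupsPerWorkgroup; i++)
        {
            coroutines.push(Coroutine{});
        }
        // Round-robin: a subgroup that yields at a barrier goes to the
        // back of the queue, so every subgroup reaches the barrier
        // before any one of them resumes past it.
        while (!coroutines.empty())
        {
            Coroutine coroutine = std::move(coroutines.front());
            coroutines.pop();
            if (coroutine.await())
            {
                coroutines.push(std::move(coroutine));
            }
        }
    }
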
Tests: dEQP-VK.spirv_assembly.instruction.compute.workgroup_memory.*
Tests: dEQP-VK.subgroups.basic.compute.*
Tests: dEQP-VK.compute.basic.*
Bug: b/131672705
Bug: b/132232716
Change-Id: Id78be9ce9d9455cb2cb7254482568985845b8b6a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/30851
Presubmit-Ready: Ben Clayton <bclayton@google.com>
Tested-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
diff --git a/src/Pipeline/ComputeProgram.cpp b/src/Pipeline/ComputeProgram.cpp
index 361cbb1..081831c 100644
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -18,6 +18,8 @@
#include "Vulkan/VkDebug.hpp"
#include "Vulkan/VkPipelineLayout.hpp"
+#include <queue>
+
namespace
{
enum { X, Y, Z };
@@ -154,17 +156,18 @@
void ComputeProgram::emit()
{
+ Int workgroupX = Arg<1>();
+ Int workgroupY = Arg<2>();
+ Int workgroupZ = Arg<3>();
+ Pointer<Byte> workgroupMemory = Arg<4>();
+ Int firstSubgroup = Arg<5>();
+ Int subgroupCount = Arg<6>();
+
routine.descriptorSets = data + OFFSET(Data, descriptorSets);
routine.descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
routine.pushConstants = data + OFFSET(Data, pushConstants);
routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
- routine.workgroupMemory = *Pointer<Pointer<Byte>>(data + OFFSET(Data, workgroupMemory));
-
- Int workgroupX = Arg<1>();
- Int workgroupY = Arg<2>();
- Int workgroupZ = Arg<3>();
- Int firstSubgroup = Arg<4>();
- Int subgroupCount = Arg<5>();
+ routine.workgroupMemory = workgroupMemory;
Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
@@ -210,8 +213,8 @@
auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
// We're sharing a buffer here across all workgroups.
- // We can only do this because we know workgroups are executed
- // serially.
+ // We can only do this because we know a single workgroup is in flight
+ // at any time.
std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
Data data;
@@ -230,19 +233,51 @@
data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
data.pushConstants = pushConstants;
data.constants = &sw::constants;
- data.workgroupMemory = workgroupMemory.data();
- // TODO(bclayton): Split work across threads.
for (uint32_t groupZ = 0; groupZ < groupCountZ; groupZ++)
{
for (uint32_t groupY = 0; groupY < groupCountY; groupY++)
{
for (uint32_t groupX = 0; groupX < groupCountX; groupX++)
{
- (*this)(&data, groupX, groupY, groupZ, 0, subgroupsPerWorkgroup);
- }
- }
- }
+
+ // TODO(bclayton): Split work across threads.
+ using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
+ std::queue<Coroutine> coroutines;
+
+ if (shader->getModes().ContainsControlBarriers)
+ {
+ // Create one coroutine per subgroup so that each subgroup
+ // can yield, bringing all subgroups to the barrier
+ // together.
+ for (int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
+ {
+ auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
+ coroutines.push(std::move(coroutine));
+ }
+ }
+ else
+ {
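+ // No barriers to wait on, so run all of the workgroup's
+ // subgroups in a single coroutine.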
+ auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
+ coroutines.push(std::move(coroutine));
+ }
+
+ while (coroutines.size() > 0)
+ {
+ auto coroutine = std::move(coroutines.front());
+ coroutines.pop();
+
+ SpirvShader::YieldResult result;
+ if (coroutine->await(result))
+ {
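+ // The coroutine yielded at a barrier. Requeue it so the
+ // remaining subgroups reach the barrier before it resumes.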
+ // TODO: Consider the result (when the enum has more than one entry).
+ coroutines.push(std::move(coroutine));
+ }
+ }
+
+ } // groupX
+ } // groupY
+ } // groupZ
}
} // namespace sw
diff --git a/src/Pipeline/ComputeProgram.hpp b/src/Pipeline/ComputeProgram.hpp
index b2a3785..762a44b 100644
--- a/src/Pipeline/ComputeProgram.hpp
+++ b/src/Pipeline/ComputeProgram.hpp
@@ -37,11 +37,12 @@
struct Constants;
// ComputeProgram builds a SPIR-V compute shader.
- class ComputeProgram : public Coroutine<int(
+ class ComputeProgram : public Coroutine<SpirvShader::YieldResult(
void* data,
int32_t workgroupX,
int32_t workgroupY,
int32_t workgroupZ,
+ void* workgroupMemory,
int32_t firstSubgroup,
int32_t subgroupCount)>
{
@@ -80,7 +81,6 @@
uint32_t invocationsPerWorkgroup; // Total number of invocations per workgroup.
PushConstantStorage pushConstants;
const Constants *constants;
- uint8_t* workgroupMemory;
};
SpirvRoutine routine;
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 020309e..ae720a2 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -13,8 +13,9 @@
// limitations under the License.
#include "SpirvShader.hpp"
-
#include "SamplerCore.hpp"
+
+#include "Reactor/Coroutine.hpp"
#include "System/Math.hpp"
#include "Vulkan/VkBuffer.hpp"
#include "Vulkan/VkBufferView.hpp"
@@ -884,6 +885,10 @@
// Don't need to do anything during analysis pass
break;
+ case spv::OpControlBarrier:
+ modes.ContainsControlBarriers = true;
+ break;
+
case spv::OpExtension:
{
auto ext = reinterpret_cast<char const *>(insn.wordPointer(1));
@@ -2462,6 +2467,9 @@
case spv::OpCopyMemory:
return EmitCopyMemory(insn, state);
+ case spv::OpControlBarrier:
+ return EmitControlBarrier(insn, state);
+
case spv::OpMemoryBarrier:
return EmitMemoryBarrier(insn, state);
@@ -4889,6 +4897,11 @@
return ptr;
}
+ void SpirvShader::Yield(YieldResult res) const
+ {
+ rr::Yield(RValue<Int>(int(res)));
+ }
+
SpirvShader::EmitResult SpirvShader::EmitImageRead(InsnIterator insn, EmitState *state) const
{
auto &resultType = getType(Type::ID(insn.word(1)));
@@ -5468,6 +5481,29 @@
return EmitResult::Continue;
}
+ SpirvShader::EmitResult SpirvShader::EmitControlBarrier(InsnIterator insn, EmitState *state) const
+ {
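+ // OpControlBarrier operands: word(1) is the Execution scope,
+ // word(2) the Memory scope, and word(3) the Memory Semantics.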
+ auto executionScope = spv::Scope(GetConstScalarInt(insn.word(1)));
+ auto semantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(3)));
+ // TODO: We probably want to consider the memory scope here. For now,
+ // just always emit the full fence.
+ Fence(semantics);
+
+ switch (executionScope)
+ {
+ case spv::ScopeWorkgroup:
+ case spv::ScopeSubgroup:
+ Yield(YieldResult::ControlBarrier);
+ break;
+ default:
+ // See Vulkan 1.1 spec, Appendix A, Validation Rules within a Module.
+ UNREACHABLE("Scope for execution must be limited to Workgroup or Subgroup");
+ break;
+ }
+
+ return EmitResult::Continue;
+ }
+
SpirvShader::EmitResult SpirvShader::EmitMemoryBarrier(InsnIterator insn, EmitState *state) const
{
auto semantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(2)));
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index cb37cce..1eaa6f8 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -262,6 +262,11 @@
using ImageSampler = void(void* texture, void *sampler, void* uvsIn, void* texelOut, void* constants);
using GetImageSampler = ImageSampler*(const vk::ImageView *imageView, const vk::Sampler *sampler);
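+ // The value yielded by a shader coroutine to indicate why it suspended.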
+ enum class YieldResult
+ {
+ ControlBarrier,
+ };
+
/* Pseudo-iterator over SPIRV instructions, designed to support range-based-for. */
class InsnIterator
{
@@ -543,6 +548,7 @@
bool DepthLess : 1;
bool DepthUnchanged : 1;
bool ContainsKill : 1;
+ bool ContainsControlBarriers : 1;
bool NeedsCentroid : 1;
// Compute workgroup dimensions
@@ -934,6 +940,7 @@
EmitResult EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const;
EmitResult EmitSampledImageCombineOrSplit(InsnIterator insn, EmitState *state) const;
EmitResult EmitCopyMemory(InsnIterator insn, EmitState *state) const;
+ EmitResult EmitControlBarrier(InsnIterator insn, EmitState *state) const;
EmitResult EmitMemoryBarrier(InsnIterator insn, EmitState *state) const;
EmitResult EmitGroupNonUniform(InsnIterator insn, EmitState *state) const;
@@ -944,6 +951,9 @@
// Emits a rr::Fence for the given MemorySemanticsMask.
void Fence(spv::MemorySemanticsMask semantics) const;
+ // Helper for calling rr::Yield with res cast to an rr::Int.
+ void Yield(YieldResult res) const;
+
// OpcodeName() returns the name of the opcode op.
// If NDEBUG is defined, then OpcodeName() will only return the numerical code.
static std::string OpcodeName(spv::Op op);