Implement fp16 texture formats
TODO: figure out what is happening with linear filtering. Possibly pre-existing breakage -- if we enable filtering support for R32_SFLOAT, it fails in the same way.
Test: dEQP-VK.texture.*
Test: dEQP-VK.image.*
Test: dEQP-VK.pipeline.*
Change-Id: Ia461418d772eb5aceb101b84eaa239b0c0bce2c0
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/30288
Tested-by: Chris Forbes <chrisforbes@google.com>
Presubmit-Ready: Chris Forbes <chrisforbes@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index c29a185..e1c6485 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -195,11 +195,14 @@
if(componentCount < 4) c.w = Float4(1.0f);
break;
case VK_FORMAT_R32_SFLOAT:
+ case VK_FORMAT_R16_SFLOAT:
c.y = Float4(0.0f);
case VK_FORMAT_R32G32_SFLOAT:
+ case VK_FORMAT_R16G16_SFLOAT:
c.z = Float4(0.0f);
c.w = Float4(1.0f);
case VK_FORMAT_R32G32B32A32_SFLOAT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
break;
default:
ASSERT(false);
@@ -1830,42 +1833,95 @@
int f2 = state.textureType == TEXTURE_CUBE ? 2 : 0;
int f3 = state.textureType == TEXTURE_CUBE ? 3 : 0;
- // Read texels
- switch(textureComponentCount())
+ if (has16bitTextureComponents())
{
- case 4:
- c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
- c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
- c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
- c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
- transpose4x4(c.x, c.y, c.z, c.w);
- break;
- case 3:
- c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
- c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
- c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
- c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
- transpose4x3(c.x, c.y, c.z, c.w);
- break;
- case 2:
- // FIXME: Optimal shuffling?
- c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
- c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
- c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
- c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
- c.y = c.x;
- c.x = Float4(c.x.xz, c.z.xz);
- c.y = Float4(c.y.yw, c.z.yw);
- break;
- case 1:
- // FIXME: Optimal shuffling?
- c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
- c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
- c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
- c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
- break;
- default:
- ASSERT(false);
+ switch (textureComponentCount())
+ {
+ case 4:
+ {
+ UInt4 t0 = Int4(*Pointer<UShort4>(buffer[f0] + index[0] * 8));
+ UInt4 t1 = Int4(*Pointer<UShort4>(buffer[f1] + index[1] * 8));
+ UInt4 t2 = Int4(*Pointer<UShort4>(buffer[f2] + index[2] * 8));
+ UInt4 t3 = Int4(*Pointer<UShort4>(buffer[f3] + index[3] * 8));
+
+ c.x = As<Float4>(halfToFloatBits(t0));
+ c.y = As<Float4>(halfToFloatBits(t1));
+ c.z = As<Float4>(halfToFloatBits(t2));
+ c.w = As<Float4>(halfToFloatBits(t3));
+ transpose4x4(c.x, c.y, c.z, c.w);
+ break;
+ }
+ case 2:
+ {
+ UInt4 t0 = Int4(*Pointer<UShort4>(buffer[f0] + index[0] * 4));
+ UInt4 t1 = Int4(*Pointer<UShort4>(buffer[f1] + index[1] * 4));
+ UInt4 t2 = Int4(*Pointer<UShort4>(buffer[f2] + index[2] * 4));
+ UInt4 t3 = Int4(*Pointer<UShort4>(buffer[f3] + index[3] * 4));
+
+ // FIXME: shuffles
+ c.x = As<Float4>(halfToFloatBits(t0));
+ c.y = As<Float4>(halfToFloatBits(t1));
+ c.z = As<Float4>(halfToFloatBits(t2));
+ c.w = As<Float4>(halfToFloatBits(t3));
+ transpose4x4(c.x, c.y, c.z, c.w);
+ break;
+ }
+ case 1:
+ {
+ UInt4 t0 = Int4(*Pointer<UShort4>(buffer[f0] + index[0] * 2));
+ UInt4 t1 = Int4(*Pointer<UShort4>(buffer[f1] + index[1] * 2));
+ UInt4 t2 = Int4(*Pointer<UShort4>(buffer[f2] + index[2] * 2));
+ UInt4 t3 = Int4(*Pointer<UShort4>(buffer[f3] + index[3] * 2));
+
+ c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
+ c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
+ c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
+ c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
+ break;
+ }
+ default:
+ UNIMPLEMENTED("fp16 sampling %d components", textureComponentCount());
+ }
+ }
+ else
+ {
+ // Read texels
+ switch (textureComponentCount())
+ {
+ case 4:
+ c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+ c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+ c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+ c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+ transpose4x4(c.x, c.y, c.z, c.w);
+ break;
+ case 3:
+ c.x = *Pointer<Float4>(buffer[f0] + index[0] * 16, 16);
+ c.y = *Pointer<Float4>(buffer[f1] + index[1] * 16, 16);
+ c.z = *Pointer<Float4>(buffer[f2] + index[2] * 16, 16);
+ c.w = *Pointer<Float4>(buffer[f3] + index[3] * 16, 16);
+ transpose4x3(c.x, c.y, c.z, c.w);
+ break;
+ case 2:
+ // FIXME: Optimal shuffling?
+ c.x.xy = *Pointer<Float4>(buffer[f0] + index[0] * 8);
+ c.x.zw = *Pointer<Float4>(buffer[f1] + index[1] * 8 - 8);
+ c.z.xy = *Pointer<Float4>(buffer[f2] + index[2] * 8);
+ c.z.zw = *Pointer<Float4>(buffer[f3] + index[3] * 8 - 8);
+ c.y = c.x;
+ c.x = Float4(c.x.xz, c.z.xz);
+ c.y = Float4(c.y.yw, c.z.yw);
+ break;
+ case 1:
+ // FIXME: Optimal shuffling?
+ c.x.x = *Pointer<Float>(buffer[f0] + index[0] * 4);
+ c.x.y = *Pointer<Float>(buffer[f1] + index[1] * 4);
+ c.x.z = *Pointer<Float>(buffer[f2] + index[2] * 4);
+ c.x.w = *Pointer<Float>(buffer[f3] + index[3] * 4);
+ break;
+ default:
+ ASSERT(false);
+ }
}
if(state.compare != COMPARE_BYPASS)
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index e2ebf83..4f7886f 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -559,4 +559,17 @@
case 4: transpose4x4(row0, row1, row2, row3); break;
}
}
+
+ UInt4 halfToFloatBits(UInt4 halfBits)
+ {
+ static const uint32_t mask_nosign = 0x7FFF;
+ static const uint32_t magic = (254 - 15) << 23;
+ static const uint32_t was_infnan = 0x7BFF;
+ static const uint32_t exp_infnan = 255 << 23;
+
+ UInt4 expmant = halfBits & UInt4(mask_nosign);
+ return As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) |
+ ((halfBits ^ UInt4(expmant)) << 16) |
+ (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan));
+ }
}
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index 3f0ad27..e1da028 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -90,6 +90,8 @@
void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
+
+ UInt4 halfToFloatBits(UInt4 halfBits);
}
#endif // sw_ShaderCore_hpp
diff --git a/src/Pipeline/SpirvShader.cpp b/src/Pipeline/SpirvShader.cpp
index 0ed371a..11cf404 100644
--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -3818,8 +3818,8 @@
case GLSLstd450UnpackHalf2x16:
{
auto val = GenericValue(this, routine, insn.word(5));
- dst.move(0, HalfToFloatBits(val.UInt(0) & SIMD::UInt(0x0000FFFF)));
- dst.move(1, HalfToFloatBits((val.UInt(0) & SIMD::UInt(0xFFFF0000)) >> 16));
+ dst.move(0, halfToFloatBits(val.UInt(0) & SIMD::UInt(0x0000FFFF)));
+ dst.move(1, halfToFloatBits((val.UInt(0) & SIMD::UInt(0xFFFF0000)) >> 16));
break;
}
case GLSLstd450Fma:
@@ -4325,19 +4325,6 @@
return storeInUpperBits ? ((joined << 16) | justsign) : joined | (justsign >> 16);
}
- SIMD::UInt SpirvShader::HalfToFloatBits(SIMD::UInt halfBits) const
- {
- static const uint32_t mask_nosign = 0x7FFF;
- static const uint32_t magic = (254 - 15) << 23;
- static const uint32_t was_infnan = 0x7BFF;
- static const uint32_t exp_infnan = 255 << 23;
-
- SIMD::UInt expmant = halfBits & SIMD::UInt(mask_nosign);
- return As<SIMD::UInt>(As<SIMD::Float>(expmant << 13) * As<SIMD::Float>(SIMD::UInt(magic))) |
- ((halfBits ^ SIMD::UInt(expmant)) << 16) |
- (CmpNLE(As<SIMD::UInt>(expmant), SIMD::UInt(was_infnan)) & SIMD::UInt(exp_infnan));
- }
-
std::pair<SIMD::Float, SIMD::Int> SpirvShader::Frexp(RValue<SIMD::Float> val) const
{
// Assumes IEEE 754
@@ -4868,10 +4855,10 @@
dst.move(3, (packed[1] >> 16) & SIMD::Int(0xffff));
break;
case VK_FORMAT_R16G16B16A16_SFLOAT:
- dst.move(0, HalfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
- dst.move(1, HalfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
- dst.move(2, HalfToFloatBits(As<SIMD::UInt>(packed[1]) & SIMD::UInt(0x0000FFFF)));
- dst.move(3, HalfToFloatBits((As<SIMD::UInt>(packed[1]) & SIMD::UInt(0xFFFF0000)) >> 16));
+ dst.move(0, halfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
+ dst.move(1, halfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
+ dst.move(2, halfToFloatBits(As<SIMD::UInt>(packed[1]) & SIMD::UInt(0x0000FFFF)));
+ dst.move(3, halfToFloatBits((As<SIMD::UInt>(packed[1]) & SIMD::UInt(0xFFFF0000)) >> 16));
break;
case VK_FORMAT_R8G8B8A8_SNORM:
dst.move(0, Min(Max(SIMD::Float(((packed[0]<<24) & SIMD::Int(0xFF000000))) * SIMD::Float(1.0f / float(0x7f000000)), SIMD::Float(-1.0f)), SIMD::Float(1.0f)));
@@ -4956,7 +4943,7 @@
dst.move(3, SIMD::Int(1));
break;
case VK_FORMAT_R16_SFLOAT:
- dst.move(0, HalfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
+ dst.move(0, halfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
dst.move(1, SIMD::Float(0));
dst.move(2, SIMD::Float(0));
dst.move(3, SIMD::Float(1));
@@ -4974,8 +4961,8 @@
dst.move(3, SIMD::Int(1));
break;
case VK_FORMAT_R16G16_SFLOAT:
- dst.move(0, HalfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
- dst.move(1, HalfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
+ dst.move(0, halfToFloatBits(As<SIMD::UInt>(packed[0]) & SIMD::UInt(0x0000FFFF)));
+ dst.move(1, halfToFloatBits((As<SIMD::UInt>(packed[0]) & SIMD::UInt(0xFFFF0000)) >> 16));
dst.move(2, SIMD::Float(0));
dst.move(3, SIMD::Float(1));
break;
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 29372f3..99b70e6 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -918,7 +918,6 @@
SIMD::Float Dot(unsigned numComponents, GenericValue const & x, GenericValue const & y) const;
SIMD::UInt FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const;
- SIMD::UInt HalfToFloatBits(SIMD::UInt halfBits) const;
// Splits x into a floating-point significand in the range [0.5, 1.0)
// and an integral exponent of two, such that:
diff --git a/src/Vulkan/VkFormat.cpp b/src/Vulkan/VkFormat.cpp
index 325b61a..80624a6 100644
--- a/src/Vulkan/VkFormat.cpp
+++ b/src/Vulkan/VkFormat.cpp
@@ -1727,10 +1727,13 @@
case VK_FORMAT_R16G16B16A16_UNORM:
case VK_FORMAT_R16_SINT:
case VK_FORMAT_R16_UINT:
+ case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R16G16_SINT:
case VK_FORMAT_R16G16_UINT:
+ case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SINT:
case VK_FORMAT_R16G16B16A16_UINT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
return false;
default:
@@ -1773,10 +1776,13 @@
case VK_FORMAT_R32G32B32A32_UINT:
case VK_FORMAT_R16_SINT:
case VK_FORMAT_R16_UINT:
+ case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R16G16_SINT:
case VK_FORMAT_R16G16_UINT:
+ case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SINT:
case VK_FORMAT_R16G16B16A16_UINT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
return false;
default:
@@ -1820,10 +1826,13 @@
case VK_FORMAT_R16G16B16A16_UNORM:
case VK_FORMAT_R16_SINT:
case VK_FORMAT_R16_UINT:
+ case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R16G16_SINT:
case VK_FORMAT_R16G16_UINT:
+ case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SINT:
case VK_FORMAT_R16G16B16A16_UINT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
return true;
default:
UNIMPLEMENTED("Format: %d", int(format));
@@ -1854,10 +1863,13 @@
case VK_FORMAT_R16G16B16A16_UNORM:
case VK_FORMAT_R16_SINT:
case VK_FORMAT_R16_UINT:
+ case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R16G16_SINT:
case VK_FORMAT_R16G16_UINT:
+ case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SINT:
case VK_FORMAT_R16G16B16A16_UINT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
case VK_FORMAT_R32_SFLOAT:
case VK_FORMAT_R32G32_SFLOAT:
case VK_FORMAT_R32G32B32A32_SFLOAT:
@@ -1912,10 +1924,13 @@
case VK_FORMAT_R16G16B16A16_UNORM:
case VK_FORMAT_R16_SINT:
case VK_FORMAT_R16_UINT:
+ case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R16G16_SINT:
case VK_FORMAT_R16G16_UINT:
+ case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SINT:
case VK_FORMAT_R16G16B16A16_UINT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
return false;
default:
UNIMPLEMENTED("Format: %d", int(format));
@@ -1934,6 +1949,7 @@
case VK_FORMAT_R8_UINT:
case VK_FORMAT_R16_SINT:
case VK_FORMAT_R16_UINT:
+ case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R32_SINT:
case VK_FORMAT_R32_UINT:
case VK_FORMAT_R32_SFLOAT:
@@ -1945,6 +1961,7 @@
case VK_FORMAT_R16G16_SINT:
case VK_FORMAT_R16G16_UINT:
case VK_FORMAT_R16G16_UNORM:
+ case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R32G32_SINT:
case VK_FORMAT_R32G32_UINT:
case VK_FORMAT_R32G32_SFLOAT:
@@ -1959,6 +1976,7 @@
case VK_FORMAT_R16G16B16A16_UNORM:
case VK_FORMAT_R16G16B16A16_SINT:
case VK_FORMAT_R16G16B16A16_UINT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
case VK_FORMAT_R32G32B32A32_SINT:
case VK_FORMAT_R32G32B32A32_UINT:
case VK_FORMAT_R32G32B32A32_SFLOAT: