R11G11B10F support
- Moved the R11G11B10F conversion code from the Blitter to
ShaderCore
- Used the conversion code in PixelRoutine where appropriate
to make R11G11B10F work as a renderable format
- Added the VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT and
VK_FORMAT_FEATURE_BLIT_DST_BIT to the
VK_FORMAT_B10G11R11_UFLOAT_PACK32 format
This allows ANGLE to expose the GL_EXT_color_buffer_float extension
(and GL_EXT_color_buffer_half_float, which depends on
GL_EXT_color_buffer_float in ANGLE, due to a driver issue).
Bug: b/146223877
Tests: dEQP-VK.*b10g11r11*
Change-Id: I04ad29f7b7d497705186ae290a05868abfc13c42
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/39550
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
diff --git a/src/Device/Blitter.cpp b/src/Device/Blitter.cpp
index 0b4bdab..3f42166 100644
--- a/src/Device/Blitter.cpp
+++ b/src/Device/Blitter.cpp
@@ -398,19 +398,7 @@
c.x = Float(*Pointer<Half>(element));
break;
case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
- // 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
- // Since the Half float format also has a 5 bit exponent, we can convert these formats to half by
- // copy/pasting the bits so the the exponent bits and top mantissa bits are aligned to the half format.
- // In this case, we have:
- // B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R
- // 1st Short: |xxxxxxxxxx---------------------|
- // 2nd Short: |xxxx---------------------xxxxxx|
- // 3rd Short: |--------------------xxxxxxxxxxxx|
- // These memory reads overlap, but each of them contains an entire channel, so we can read this without
- // any int -> short conversion.
- c.x = Float(As<Half>((*Pointer<UShort>(element + 0) & UShort(0x07FF)) << UShort(4)));
- c.y = Float(As<Half>((*Pointer<UShort>(element + 1) & UShort(0x3FF8)) << UShort(1)));
- c.z = Float(As<Half>((*Pointer<UShort>(element + 2) & UShort(0xFFC0)) >> UShort(1)));
+ c = r11g11b10Unpack(*Pointer<UInt>(element));
break;
case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
// This type contains a common 5 bit exponent (E) and a 9 bit the mantissa for R, G and B.
@@ -621,16 +609,7 @@
break;
case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
{
- // 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
- // Since the 16-bit half-precision float format also has a 5 bit exponent, we can extract these minifloats from them.
-
- // FIXME(b/138944025): Handle negative values, Inf, and NaN.
- // FIXME(b/138944025): Perform rounding before truncating the mantissa.
- UInt r = (UInt(As<UShort>(Half(c.x))) & 0x00007FF0) >> 4;
- UInt g = (UInt(As<UShort>(Half(c.y))) & 0x00007FF0) << 7;
- UInt b = (UInt(As<UShort>(Half(c.z))) & 0x00007FE0) << 17;
-
- UInt rgb = r | g | b;
+ UInt rgb = r11g11b10Pack(c);
UInt old = *Pointer<UInt>(element);
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 59f9cbb..81c137e 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -244,6 +244,7 @@
case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SFLOAT:
+ case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
case VK_FORMAT_R32_SFLOAT:
case VK_FORMAT_R32G32_SFLOAT:
case VK_FORMAT_R32G32B32A32_SFLOAT:
@@ -329,6 +330,7 @@
case VK_FORMAT_R16_SFLOAT:
case VK_FORMAT_R16G16_SFLOAT:
case VK_FORMAT_R16G16B16A16_SFLOAT:
+ case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
case VK_FORMAT_R16_SINT:
case VK_FORMAT_R16G16_SINT:
case VK_FORMAT_R16G16B16A16_SINT:
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 13a9e0c..db57ae4 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -924,13 +924,13 @@
{
Short4 c01;
Short4 c23;
- Pointer<Byte> buffer;
+ Pointer<Byte> buffer = cBuffer;
Pointer<Byte> buffer2;
switch(state.targetFormat[index])
{
case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
- buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
@@ -948,7 +948,7 @@
pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
break;
case VK_FORMAT_R5G6B5_UNORM_PACK16:
- buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
@@ -967,7 +967,7 @@
break;
case VK_FORMAT_B8G8R8A8_UNORM:
case VK_FORMAT_B8G8R8A8_SRGB:
- buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
c01 = *Pointer<Short4>(buffer);
buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
c23 = *Pointer<Short4>(buffer);
@@ -987,7 +987,7 @@
break;
case VK_FORMAT_R8G8B8A8_UNORM:
case VK_FORMAT_R8G8B8A8_SRGB:
- buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
c01 = *Pointer<Short4>(buffer);
buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
c23 = *Pointer<Short4>(buffer);
@@ -1006,7 +1006,7 @@
pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
break;
case VK_FORMAT_R8_UNORM:
- buffer = cBuffer + 1 * x;
+ buffer += 1 * x;
pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
@@ -1016,7 +1016,7 @@
pixel.w = Short4(0xFFFFu);
break;
case VK_FORMAT_R8G8_UNORM:
- buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
@@ -1026,7 +1026,6 @@
pixel.w = Short4(0xFFFFu);
break;
case VK_FORMAT_R16G16B16A16_UNORM:
- buffer = cBuffer;
pixel.x = *Pointer<Short4>(buffer + 8 * x);
pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
@@ -1035,7 +1034,6 @@
transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
break;
case VK_FORMAT_R16G16_UNORM:
- buffer = cBuffer;
pixel.x = *Pointer<Short4>(buffer + 4 * x);
buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
pixel.y = *Pointer<Short4>(buffer + 4 * x);
@@ -1050,7 +1048,6 @@
break;
case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
{
- buffer = cBuffer;
Int4 v = Int4(0);
v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
@@ -1372,11 +1369,13 @@
xMask &= sMask;
}
+ Pointer<Byte> buffer = cBuffer;
+
switch(state.targetFormat[index])
{
case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
{
- Pointer<Byte> buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
Int value = *Pointer<Int>(buffer);
Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask5551Q[bgraWriteMask & 0xF][0]));
@@ -1403,7 +1402,7 @@
break;
case VK_FORMAT_R5G6B5_UNORM_PACK16:
{
- Pointer<Byte> buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
Int value = *Pointer<Int>(buffer);
Int channelMask = *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
@@ -1431,7 +1430,7 @@
case VK_FORMAT_B8G8R8A8_UNORM:
case VK_FORMAT_B8G8R8A8_SRGB:
{
- Pointer<Byte> buffer = cBuffer + x * 4;
+ buffer += x * 4;
Short4 value = *Pointer<Short4>(buffer);
Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
@@ -1458,7 +1457,7 @@
case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
{
- Pointer<Byte> buffer = cBuffer + x * 4;
+ buffer += x * 4;
Short4 value = *Pointer<Short4>(buffer);
Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
@@ -1483,7 +1482,7 @@
case VK_FORMAT_R8G8_UNORM:
if((rgbaWriteMask & 0x00000003) != 0x0)
{
- Pointer<Byte> buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
Int2 value;
value = Insert(value, *Pointer<Int>(buffer), 0);
Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
@@ -1508,7 +1507,7 @@
case VK_FORMAT_R8_UNORM:
if(rgbaWriteMask & 0x00000001)
{
- Pointer<Byte> buffer = cBuffer + 1 * x;
+ buffer += 1 * x;
Short4 value;
value = Insert(value, *Pointer<Short>(buffer), 0);
Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
@@ -1524,7 +1523,7 @@
break;
case VK_FORMAT_R16G16_UNORM:
{
- Pointer<Byte> buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
Short4 value = *Pointer<Short4>(buffer);
@@ -1561,7 +1560,7 @@
break;
case VK_FORMAT_R16G16B16A16_UNORM:
{
- Pointer<Byte> buffer = cBuffer + 8 * x;
+ buffer += 8 * x;
{
Short4 value = *Pointer<Short4>(buffer);
@@ -1636,9 +1635,8 @@
break;
case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
{
- Pointer<Byte> buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
- buffer = cBuffer + 4 * x;
Int2 value = *Pointer<Int2>(buffer, 16);
Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
if (rgbaWriteMask != 0xF)
@@ -1806,7 +1804,7 @@
return;
}
- Pointer<Byte> buffer;
+ Pointer<Byte> buffer = cBuffer;
// pixel holds four texel color values.
// Note: Despite the type being Vector4f, the colors may be stored as
@@ -1834,7 +1832,6 @@
case VK_FORMAT_R32_SINT:
case VK_FORMAT_R32_UINT:
case VK_FORMAT_R32_SFLOAT:
- buffer = cBuffer;
// FIXME: movlps
pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
@@ -1847,7 +1844,6 @@
case VK_FORMAT_R32G32_SINT:
case VK_FORMAT_R32G32_UINT:
case VK_FORMAT_R32G32_SFLOAT:
- buffer = cBuffer;
pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
@@ -1860,7 +1856,6 @@
case VK_FORMAT_R32G32B32A32_SFLOAT:
case VK_FORMAT_R32G32B32A32_SINT:
case VK_FORMAT_R32G32B32A32_UINT:
- buffer = cBuffer;
pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
@@ -1869,7 +1864,6 @@
transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
break;
case VK_FORMAT_R16_SFLOAT:
- buffer = cBuffer;
pixel.x.x = Float(*Pointer<Half>(buffer + 2 * x + 0));
pixel.x.y = Float(*Pointer<Half>(buffer + 2 * x + 2));
buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
@@ -1878,7 +1872,6 @@
pixel.y = pixel.z = pixel.w = one;
break;
case VK_FORMAT_R16G16_SFLOAT:
- buffer = cBuffer;
pixel.x.x = Float(*Pointer<Half>(buffer + 4 * x + 0));
pixel.y.x = Float(*Pointer<Half>(buffer + 4 * x + 2));
pixel.x.y = Float(*Pointer<Half>(buffer + 4 * x + 4));
@@ -1891,7 +1884,6 @@
pixel.z = pixel.w = one;
break;
case VK_FORMAT_R16G16B16A16_SFLOAT:
- buffer = cBuffer;
pixel.x.x = Float(*Pointer<Half>(buffer + 8 * x + 0x0));
pixel.y.x = Float(*Pointer<Half>(buffer + 8 * x + 0x2));
pixel.z.x = Float(*Pointer<Half>(buffer + 8 * x + 0x4));
@@ -1910,6 +1902,14 @@
pixel.z.w = Float(*Pointer<Half>(buffer + 8 * x + 0xc));
pixel.w.w = Float(*Pointer<Half>(buffer + 8 * x + 0xe));
break;
+ case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+ pixel.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 0));
+ pixel.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 4));
+ buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
+ pixel.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 0));
+ pixel.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4 * x + 4));
+ transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
+ break;
default:
UNIMPLEMENTED("VkFormat: %d", int(state.targetFormat[index]));
}
@@ -2039,6 +2039,7 @@
oC.y = oC.z;
break;
case VK_FORMAT_R16G16B16A16_SFLOAT:
+ case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
case VK_FORMAT_R32G32B32A32_SFLOAT:
case VK_FORMAT_R32G32B32A32_SINT:
case VK_FORMAT_R32G32B32A32_UINT:
@@ -2074,7 +2075,7 @@
auto targetFormat = state.targetFormat[index];
- Pointer<Byte> buffer;
+ Pointer<Byte> buffer = cBuffer;
Float4 value;
switch(targetFormat)
@@ -2084,7 +2085,7 @@
case VK_FORMAT_R32_UINT:
if(rgbaWriteMask & 0x00000001)
{
- buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
// FIXME: movlps
value.x = *Pointer<Float>(buffer + 0);
@@ -2114,7 +2115,7 @@
case VK_FORMAT_R16_SFLOAT:
if(rgbaWriteMask & 0x00000001)
{
- buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
@@ -2141,7 +2142,7 @@
case VK_FORMAT_R16_UINT:
if(rgbaWriteMask & 0x00000001)
{
- buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
UShort4 xyzw;
xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
@@ -2189,7 +2190,7 @@
case VK_FORMAT_R8_UINT:
if(rgbaWriteMask & 0x00000001)
{
- buffer = cBuffer + x;
+ buffer += x;
UInt xyzw, packedCol;
@@ -2219,7 +2220,7 @@
case VK_FORMAT_R32G32_SFLOAT:
case VK_FORMAT_R32G32_SINT:
case VK_FORMAT_R32G32_UINT:
- buffer = cBuffer + 8 * x;
+ buffer += 8 * x;
value = *Pointer<Float4>(buffer);
@@ -2258,7 +2259,7 @@
case VK_FORMAT_R16G16_SFLOAT:
if((rgbaWriteMask & 0x00000003) != 0x0)
{
- buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
UInt2 rgbaMask;
UInt2 packedCol;
@@ -2292,7 +2293,7 @@
case VK_FORMAT_R16G16_UINT:
if((rgbaWriteMask & 0x00000003) != 0x0)
{
- buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
UInt2 rgbaMask;
UShort4 packedCol = UShort4(As<Int4>(oC.x));
@@ -2322,7 +2323,7 @@
case VK_FORMAT_R8G8_UINT:
if((rgbaWriteMask & 0x00000003) != 0x0)
{
- buffer = cBuffer + 2 * x;
+ buffer += 2 * x;
Int2 xyzw, packedCol;
@@ -2357,7 +2358,7 @@
case VK_FORMAT_R32G32B32A32_SFLOAT:
case VK_FORMAT_R32G32B32A32_SINT:
case VK_FORMAT_R32G32B32A32_UINT:
- buffer = cBuffer + 16 * x;
+ buffer += 16 * x;
{
value = *Pointer<Float4>(buffer, 16);
@@ -2432,7 +2433,7 @@
case VK_FORMAT_R16G16B16A16_SFLOAT:
if((rgbaWriteMask & 0x0000000F) != 0x0)
{
- buffer = cBuffer + 8 * x;
+ buffer += 8 * x;
UInt4 rgbaMask;
UInt4 value = *Pointer<UInt4>(buffer);
@@ -2465,11 +2466,31 @@
*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
}
break;
+ case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+ if((rgbaWriteMask & 0x7) != 0x0)
+ {
+ buffer += 4 * x;
+
+ unsigned int mask = ((rgbaWriteMask & 0x1) ? 0x000007FF : 0) |
+ ((rgbaWriteMask & 0x2) ? 0x003FF800 : 0) |
+ ((rgbaWriteMask & 0x4) ? 0xFFC00000 : 0);
+ UInt2 mergedMask(mask, mask);
+
+ UInt2 value;
+ value = Insert(value, r11g11b10Pack(oC.x), 0);
+ value = Insert(value, r11g11b10Pack(oC.y), 1);
+ *Pointer<UInt2>(buffer) = (value & mergedMask) | ((*Pointer<UInt2>(buffer)) & ~mergedMask);
+ buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
+ value = Insert(value, r11g11b10Pack(oC.z), 0);
+ value = Insert(value, r11g11b10Pack(oC.w), 1);
+ *Pointer<UInt2>(buffer) = (value & mergedMask) | ((*Pointer<UInt2>(buffer)) & ~mergedMask);
+ }
+ break;
case VK_FORMAT_R16G16B16A16_SINT:
case VK_FORMAT_R16G16B16A16_UINT:
if((rgbaWriteMask & 0x0000000F) != 0x0)
{
- buffer = cBuffer + 8 * x;
+ buffer += 8 * x;
UInt4 rgbaMask;
UShort8 value = *Pointer<UShort8>(buffer);
@@ -2503,7 +2524,7 @@
{
UInt2 value, packedCol, mergedMask;
- buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;
@@ -2551,7 +2572,7 @@
((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
((As<Int4>(oC.x) & Int4(0x3ff)));
- buffer = cBuffer + 4 * x;
+ buffer += 4 * x;
value = *Pointer<Int2>(buffer, 16);
mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
if (rgbaWriteMask != 0xF)
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 0f23096..10502fd 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -546,27 +546,72 @@
}
}
-UInt4 halfToFloatBits(UInt4 halfBits)
+SIMD::UInt halfToFloatBits(SIMD::UInt halfBits)
{
- auto magic = UInt4(126 << 23);
+ auto magic = SIMD::UInt(126 << 23);
- auto sign16 = halfBits & UInt4(0x8000);
- auto man16 = halfBits & UInt4(0x3FF);
- auto exp16 = halfBits & UInt4(0x7C00);
+ auto sign16 = halfBits & SIMD::UInt(0x8000);
+ auto man16 = halfBits & SIMD::UInt(0x03FF);
+ auto exp16 = halfBits & SIMD::UInt(0x7C00);
- auto isDnormOrZero = CmpEQ(exp16, UInt4(0));
- auto isInfOrNaN = CmpEQ(exp16, UInt4(0x7C00));
+ auto isDnormOrZero = CmpEQ(exp16, SIMD::UInt(0));
+ auto isInfOrNaN = CmpEQ(exp16, SIMD::UInt(0x7C00));
auto sign32 = sign16 << 16;
auto man32 = man16 << 13;
- auto exp32 = (exp16 + UInt4(0x1C000)) << 13;
- auto norm32 = (man32 | exp32) | (isInfOrNaN & UInt4(0x7F800000));
+ auto exp32 = (exp16 + SIMD::UInt(0x1C000)) << 13;
+ auto norm32 = (man32 | exp32) | (isInfOrNaN & SIMD::UInt(0x7F800000));
- auto denorm32 = As<UInt4>(As<Float4>(magic + man16) - As<Float4>(magic));
+ auto denorm32 = As<SIMD::UInt>(As<SIMD::Float>(magic + man16) - As<SIMD::Float>(magic));
return sign32 | (norm32 & ~isDnormOrZero) | (denorm32 & isDnormOrZero);
}
+SIMD::UInt floatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits)
+{
+ static const uint32_t mask_sign = 0x80000000u;
+ static const uint32_t mask_round = ~0xfffu;
+ static const uint32_t c_f32infty = 255 << 23;
+ static const uint32_t c_magic = 15 << 23;
+ static const uint32_t c_nanbit = 0x200;
+ static const uint32_t c_infty_as_fp16 = 0x7c00;
+ static const uint32_t c_clamp = (31 << 23) - 0x1000;
+
+ SIMD::UInt justsign = SIMD::UInt(mask_sign) & floatBits;
+ SIMD::UInt absf = floatBits ^ justsign;
+ SIMD::UInt b_isnormal = CmpNLE(SIMD::UInt(c_f32infty), absf);
+
+ // Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
+ // instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
+ SIMD::UInt joined = ((((As<SIMD::UInt>(Min(As<SIMD::Float>(absf & SIMD::UInt(mask_round)) * As<SIMD::Float>(SIMD::UInt(c_magic)),
+ As<SIMD::Float>(SIMD::UInt(c_clamp))))) - SIMD::UInt(mask_round)) >> 13) & b_isnormal) |
+ ((b_isnormal ^ SIMD::UInt(0xFFFFFFFF)) &
+ ((CmpNLE(absf, SIMD::UInt(c_f32infty)) & SIMD::UInt(c_nanbit)) | SIMD::UInt(c_infty_as_fp16)));
+
+ return storeInUpperBits ? ((joined << 16) | justsign) : joined | (justsign >> 16);
+}
+
+sw::SIMD::Float r11g11b10Unpack(UInt r11g11b10bits)
+{
+ // 10 (or 11) bit float formats are unsigned formats with a 5 bit exponent and a 5 (or 6) bit mantissa.
+ // Since the Half float format also has a 5 bit exponent, we can convert these formats to half by
+ // copy/pasting the bits so the exponent bits and top mantissa bits are aligned to the half format.
+ // In this case, we have:
+ // MSB | B B B B B B B B B B G G G G G G G G G G G R R R R R R R R R R R | LSB
+ SIMD::UInt halfBits;
+ halfBits = Insert(halfBits, (r11g11b10bits & UInt(0x000007FFu)) << 4, 0);
+ halfBits = Insert(halfBits, (r11g11b10bits & UInt(0x003FF800u)) >> 7, 1);
+ halfBits = Insert(halfBits, (r11g11b10bits & UInt(0xFFC00000u)) >> 17, 2);
+ halfBits = Insert(halfBits, UInt(0x00003C00u), 3);
+ return As<sw::SIMD::Float>(halfToFloatBits(halfBits));
+}
+
+UInt r11g11b10Pack(sw::SIMD::Float &value)
+{
+ SIMD::UInt halfBits = floatToHalfBits(As<SIMD::UInt>(value), true) &
+ SIMD::UInt(0x7FF00000, 0x7FF00000, 0x7FE00000, 0);
+ return (UInt(halfBits.x) >> 20) | (UInt(halfBits.y) >> 9) | (UInt(halfBits.z) << 1);
+}
rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints)
{
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index 52522a5..fb056ff 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -190,7 +190,10 @@
void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
-UInt4 halfToFloatBits(UInt4 halfBits);
+sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
+sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
+sw::SIMD::Float r11g11b10Unpack(UInt r11g11b10bits);
+UInt r11g11b10Pack(sw::SIMD::Float &value);
rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);
diff --git a/src/Pipeline/SpirvShader.hpp b/src/Pipeline/SpirvShader.hpp
index 1876dec..154ffd7 100644
--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -1087,8 +1087,6 @@
// Helper as we often need to take dot products as part of doing other things.
SIMD::Float Dot(unsigned numComponents, GenericValue const & x, GenericValue const & y) const;
- SIMD::UInt FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const;
-
// Splits x into a floating-point significand in the range [0.5, 1.0)
// and an integral exponent of two, such that:
// x = significand * 2^exponent
diff --git a/src/Pipeline/SpirvShaderArithmetic.cpp b/src/Pipeline/SpirvShaderArithmetic.cpp
index 947a109..5d9b980 100644
--- a/src/Pipeline/SpirvShaderArithmetic.cpp
+++ b/src/Pipeline/SpirvShaderArithmetic.cpp
@@ -544,30 +544,6 @@
return d;
}
-SIMD::UInt SpirvShader::FloatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits) const
-{
- static const uint32_t mask_sign = 0x80000000u;
- static const uint32_t mask_round = ~0xfffu;
- static const uint32_t c_f32infty = 255 << 23;
- static const uint32_t c_magic = 15 << 23;
- static const uint32_t c_nanbit = 0x200;
- static const uint32_t c_infty_as_fp16 = 0x7c00;
- static const uint32_t c_clamp = (31 << 23) - 0x1000;
-
- SIMD::UInt justsign = SIMD::UInt(mask_sign) & floatBits;
- SIMD::UInt absf = floatBits ^ justsign;
- SIMD::UInt b_isnormal = CmpNLE(SIMD::UInt(c_f32infty), absf);
-
- // Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
- // instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
- SIMD::UInt joined = ((((As<SIMD::UInt>(Min(As<SIMD::Float>(absf & SIMD::UInt(mask_round)) * As<SIMD::Float>(SIMD::UInt(c_magic)),
- As<SIMD::Float>(SIMD::UInt(c_clamp))))) - SIMD::UInt(mask_round)) >> 13) & b_isnormal) |
- ((b_isnormal ^ SIMD::UInt(0xFFFFFFFF)) & ((CmpNLE(absf, SIMD::UInt(c_f32infty)) & SIMD::UInt(c_nanbit)) |
- SIMD::UInt(c_infty_as_fp16)));
-
- return storeInUpperBits ? ((joined << 16) | justsign) : joined | (justsign >> 16);
-}
-
std::pair<SIMD::Float, SIMD::Int> SpirvShader::Frexp(RValue<SIMD::Float> val) const
{
// Assumes IEEE 754
diff --git a/src/Pipeline/SpirvShaderGLSLstd450.cpp b/src/Pipeline/SpirvShaderGLSLstd450.cpp
index f9485ca..858765f 100644
--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -431,7 +431,7 @@
case GLSLstd450PackHalf2x16:
{
auto val = GenericValue(this, state, insn.word(5));
- dst.move(0, FloatToHalfBits(val.UInt(0), false) | FloatToHalfBits(val.UInt(1), true));
+ dst.move(0, floatToHalfBits(val.UInt(0), false) | floatToHalfBits(val.UInt(1), true));
break;
}
case GLSLstd450UnpackSnorm4x8:
diff --git a/src/Pipeline/SpirvShaderImage.cpp b/src/Pipeline/SpirvShaderImage.cpp
index c64b317..f2828f8 100644
--- a/src/Pipeline/SpirvShaderImage.cpp
+++ b/src/Pipeline/SpirvShaderImage.cpp
@@ -900,8 +900,8 @@
break;
case spv::ImageFormatRgba16f:
texelSize = 8;
- packed[0] = FloatToHalfBits(texel.UInt(0), false) | FloatToHalfBits(texel.UInt(1), true);
- packed[1] = FloatToHalfBits(texel.UInt(2), false) | FloatToHalfBits(texel.UInt(3), true);
+ packed[0] = floatToHalfBits(texel.UInt(0), false) | floatToHalfBits(texel.UInt(1), true);
+ packed[1] = floatToHalfBits(texel.UInt(2), false) | floatToHalfBits(texel.UInt(3), true);
numPackedElements = 2;
break;
case spv::ImageFormatRgba16i:
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index b60e56e..61d3bf7 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -630,6 +630,7 @@
case VK_FORMAT_R32G32B32A32_UINT:
case VK_FORMAT_R32G32B32A32_SINT:
case VK_FORMAT_R32G32B32A32_SFLOAT:
+ case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
pFormatProperties->optimalTilingFeatures |=
VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
VK_FORMAT_FEATURE_BLIT_DST_BIT;