Handle VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16 sampling. As https://swiftshader-review.googlesource.com/c/SwiftShader/+/63748 was incomplete and introduced some failures as seen in https://swiftshader-review.googlesource.com/c/SwiftShader/+/63628. Updates `SamplerCore::sampleTexel()` to handle both 8-bit and 10-bit formats by reading the 8-bit or 10-bit buffer values without 8.8 packing (e.g. packing the 8-bit raw `u` value from the buffer into both the high 8-bits and low 8-bits of `U`) and by using more generic range expansion (as opposed to the hardcoded constants for 8-bit). Bug: b/219756793 Test: dEQP-VK.ycbcr.* Change-Id: I6e1681cf2f40dbbc389652136711bc2f20b413a1 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/64268 Presubmit-Ready: Jason Macnak <natsu@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com> Kokoro-Result: kokoro <noreply+kokoro@google.com> Tested-by: Jason Macnak <natsu@google.com> Commit-Queue: Jason Macnak <natsu@google.com>
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp index 4268810..f126b4b 100644 --- a/src/Pipeline/SamplerCore.cpp +++ b/src/Pipeline/SamplerCore.cpp
@@ -1727,12 +1727,14 @@ if(isYcbcrFormat()) { + // Generates 15-bit output. + // Pointers to the planes of YCbCr images are stored in consecutive mipmap levels. Pointer<Byte> bufferY = buffer; // *Pointer<Pointer<Byte>>(mipmap + 0 * sizeof(Mipmap) + OFFSET(Mipmap, buffer)); Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmap + 1 * sizeof(Mipmap) + OFFSET(Mipmap, buffer)); // U/V for 2-plane interleaved formats. Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmap + 2 * sizeof(Mipmap) + OFFSET(Mipmap, buffer)); - // Luminance + // Luminance (either 8-bit or 10-bit in bottom bits). UShort4 Y; { switch(state.textureFormat) @@ -1740,24 +1742,20 @@ case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: { - Int c0 = Int(bufferY[index[0]]); - Int c1 = Int(bufferY[index[1]]); - Int c2 = Int(bufferY[index[2]]); - Int c3 = Int(bufferY[index[3]]); - c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24); - Y = As<UShort4>(Unpack(As<Byte4>(c0))); + Y = Insert(Y, UShort(bufferY[index[0]]), 0); + Y = Insert(Y, UShort(bufferY[index[1]]), 1); + Y = Insert(Y, UShort(bufferY[index[2]]), 2); + Y = Insert(Y, UShort(bufferY[index[3]]), 3); } break; case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16: { - Y = Insert(Y, Pointer<UShort>(bufferY)[index[0]], 0); // TODO: Insert(UShort4, UShort) + Y = Insert(Y, Pointer<UShort>(bufferY)[index[0]], 0); Y = Insert(Y, Pointer<UShort>(bufferY)[index[1]], 1); Y = Insert(Y, Pointer<UShort>(bufferY)[index[2]], 2); Y = Insert(Y, Pointer<UShort>(bufferY)[index[3]], 3); // Top 10 bits of each 16 bits: Y = (Y & UShort4(0xFFC0u)) >> 6; - // Scale from 10 bits to 16 bits: - Y = Y << 6; } break; default: @@ -1766,7 +1764,7 @@ } } - // Chroma + // Chroma (either 8-bit or 10-bit in bottom bits). 
UShort4 Cb, Cr; { computeIndices(index, uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap + sizeof(Mipmap)); @@ -1776,30 +1774,27 @@ { case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: { - Int c0 = Int(bufferU[index[0]]); - Int c1 = Int(bufferU[index[1]]); - Int c2 = Int(bufferU[index[2]]); - Int c3 = Int(bufferU[index[3]]); - c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24); - U = As<UShort4>(Unpack(As<Byte4>(c0))); + U = Insert(U, UShort(bufferU[index[0]]), 0); + U = Insert(U, UShort(bufferU[index[1]]), 1); + U = Insert(U, UShort(bufferU[index[2]]), 2); + U = Insert(U, UShort(bufferU[index[3]]), 3); - c0 = Int(bufferV[index[0]]); - c1 = Int(bufferV[index[1]]); - c2 = Int(bufferV[index[2]]); - c3 = Int(bufferV[index[3]]); - c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24); - V = As<UShort4>(Unpack(As<Byte4>(c0))); + V = Insert(V, UShort(bufferV[index[0]]), 0); + V = Insert(V, UShort(bufferV[index[1]]), 1); + V = Insert(V, UShort(bufferV[index[2]]), 2); + V = Insert(V, UShort(bufferV[index[3]]), 3); } break; case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: { - Short4 UV; - UV = Insert(UV, Pointer<Short>(bufferU)[index[0]], 0); // TODO: Insert(UShort4, UShort) - UV = Insert(UV, Pointer<Short>(bufferU)[index[1]], 1); - UV = Insert(UV, Pointer<Short>(bufferU)[index[2]], 2); - UV = Insert(UV, Pointer<Short>(bufferU)[index[3]], 3); - U = (UV & Short4(0x00FFu)) | (UV << 8); - V = (UV & Short4(0xFF00u)) | As<Short4>(As<UShort4>(UV) >> 8); + UShort4 UV; + UV = Insert(UV, Pointer<UShort>(bufferU)[index[0]], 0); + UV = Insert(UV, Pointer<UShort>(bufferU)[index[1]], 1); + UV = Insert(UV, Pointer<UShort>(bufferU)[index[2]], 2); + UV = Insert(UV, Pointer<UShort>(bufferU)[index[3]], 3); + + U = (UV & UShort4(0x00FFu)); + V = (UV & UShort4(0xFF00u)) >> 8; } break; case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16: @@ -1809,13 +1804,10 @@ UV = Insert(UV, Pointer<UInt>(bufferU)[index[1]], 1); UV = Insert(UV, Pointer<UInt>(bufferU)[index[2]], 2); UV = Insert(UV, 
Pointer<UInt>(bufferU)[index[3]], 3); - // Top 10 bits of first 16 bits: - U = UShort4((UV & UInt4(0x0000FFC0u)) >> 6); // TODO: UnpackLower(UInt4) - // Top 10 bits of second 16 bits: - V = UShort4((UV & UInt4(0xFFC00000u)) >> 22); // TODO: UnpackUpper(UInt4) - // Scale from 10 bits to 16 bits: - U = U << 6; - V = V << 6; + // Top 10 bits of first 16-bits: + U = UShort4((UV & UInt4(0x0000FFC0u)) >> 6); + // Top 10 bits of second 16-bits: + V = UShort4((UV & UInt4(0xFFC00000u)) >> 22); } break; default: @@ -1835,31 +1827,65 @@ } } + uint8_t lumaBits = 8; + uint8_t chromaBits = 8; + switch(state.textureFormat) + { + case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM: + case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM: + lumaBits = 8; + chromaBits = 8; + break; + case VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16: + lumaBits = 10; + chromaBits = 10; + break; + default: + UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat); + break; + } + if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) { - // YCbCr formats are treated as signed 15-bit. - c.x = Cr >> 1; - c.y = Y >> 1; - c.z = Cb >> 1; + // Scale to the output 15-bit. + c.x = Cr << (15 - chromaBits); + c.y = Y << (15 - lumaBits); + c.z = Cb << (15 - chromaBits); } else { - // Scaling and bias for studio-swing range: Y = [16 .. 235], U/V = [16 .. 240] - // Scale down by 0x0101 to normalize the 8.8 samples, and up by 0x7FFF for signed 15-bit output. - float yOffset = static_cast<float>(state.studioSwing ? 16 * 0x0101 : 0); - float uvOffset = static_cast<float>(128 * 0x0101); - float yFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 219 * 0x0101 : 255 * 0x0101); - float uvFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 
224 * 0x0101 : 255 * 0x0101); + const float twoPowLumaBits = static_cast<float>(0x1u << lumaBits); + const float twoPowLumaBitsMinus8 = static_cast<float>(0x1u << (lumaBits - 8)); + const float twoPowChromaBits = static_cast<float>(0x1u << chromaBits); + const float twoPowChromaBitsMinus1 = static_cast<float>(0x1u << (chromaBits - 1)); + const float twoPowChromaBitsMinus8 = static_cast<float>(0x1u << (chromaBits - 8)); - Float4 y = (Float4(Y) - Float4(yOffset)) * Float4(yFactor); - Float4 u = (Float4(Cb) - Float4(uvOffset)) * Float4(uvFactor); - Float4 v = (Float4(Cr) - Float4(uvOffset)) * Float4(uvFactor); + Float4 y = Float4(Y); + Float4 u = Float4(Cb); + Float4 v = Float4(Cr); + + if(state.studioSwing) + { + // See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_NARROW + y = ((y / Float4(twoPowLumaBitsMinus8)) - Float4(16.0f)) / Float4(219.0f); + u = ((u / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f); + v = ((v / Float4(twoPowChromaBitsMinus8)) - Float4(128.0f)) / Float4(224.0f); + } + else + { + // See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#QUANTIZATION_FULL + y = y / Float4(twoPowLumaBits - 1.0f); + u = (u - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f); + v = (v - Float4(twoPowChromaBitsMinus1)) / Float4(twoPowChromaBits - 1.0f); + } + + // Now, `y` is in [0, 1] and `u` and `v` are in [-0.5, 0.5]. 
if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY) { - c.x = Short4(v); - c.y = Short4(y); - c.z = Short4(u); + c.x = Short4(v * static_cast<float>(0x7FFF)); + c.y = Short4(y * static_cast<float>(0x7FFF)); + c.z = Short4(u * static_cast<float>(0x7FFF)); } else { @@ -1900,9 +1926,9 @@ Float4 g = y + Float4(Gb) * u + Float4(Gr) * v; Float4 b = y + Float4(Bb) * u; - c.x = Short4(r); - c.y = Short4(g); - c.z = Short4(b); + c.x = Short4(r * static_cast<float>(0x7FFF)); + c.y = Short4(g * static_cast<float>(0x7FFF)); + c.z = Short4(b * static_cast<float>(0x7FFF)); } } }
diff --git a/src/Reactor/Reactor.cpp b/src/Reactor/Reactor.cpp index 41268a9..3d6459f 100644 --- a/src/Reactor/Reactor.cpp +++ b/src/Reactor/Reactor.cpp
@@ -1061,6 +1061,13 @@ storeValue(integer); } +UShort::UShort(RValue<Byte> cast) +{ + Value *integer = Nucleus::createZExt(cast.value(), UShort::type()); + + storeValue(integer); +} + UShort::UShort(unsigned short x) { storeValue(Nucleus::createConstantShort(x)); @@ -1331,6 +1338,11 @@ return store(rhs.load()); } +RValue<Byte4> Insert(RValue<Byte4> val, RValue<Byte> element, int i) +{ + return RValue<Byte4>(Nucleus::createInsertElement(val.value(), element.value(), i)); +} + Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7) { int64_t constantVector[8] = { x0, x1, x2, x3, x4, x5, x6, x7 };
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp index 453684e..aeaad54 100644 --- a/src/Reactor/Reactor.hpp +++ b/src/Reactor/Reactor.hpp
@@ -561,6 +561,7 @@ explicit UShort(RValue<UInt> cast); explicit UShort(RValue<Int> cast); + explicit UShort(RValue<Byte> cast); UShort() = default; UShort(unsigned short x); @@ -632,6 +633,8 @@ static Type *type(); }; +RValue<Byte4> Insert(RValue<Byte4> val, RValue<Byte> element, int i); + // RValue<Byte4> operator+(RValue<Byte4> lhs, RValue<Byte4> rhs); // RValue<Byte4> operator-(RValue<Byte4> lhs, RValue<Byte4> rhs); // RValue<Byte4> operator*(RValue<Byte4> lhs, RValue<Byte4> rhs);