Refactor vector packing.
x86 vector packing instructions always treat the input as having signed
integer components, but can perform signed or unsigned saturation on
the output. In Reactor the Pack() intrinsic has overloads which
differentiate between them based on the signedness of the input, but
this is confusing.
Also simplify emulation of saturating add/subtract.
Bug b/37496082
Change-Id: I0625fff429ffb40f42baf9600c7760d9858b5d89
Reviewed-on: https://swiftshader-review.googlesource.com/12548
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Main/FrameBuffer.cpp b/src/Main/FrameBuffer.cpp
index e95f766..d3e6383 100644
--- a/src/Main/FrameBuffer.cpp
+++ b/src/Main/FrameBuffer.cpp
@@ -253,10 +253,10 @@
case FORMAT_A16B16G16R16:
For(, x < width - 1, x += 2)
{
- UShort4 c0 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 0), 0xC6)) >> 8;
- UShort4 c1 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 8), 0xC6)) >> 8;
+ Short4 c0 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 0), 0xC6)) >> 8;
+ Short4 c1 = As<UShort4>(Swizzle(*Pointer<Short4>(s + 8), 0xC6)) >> 8;
- *Pointer<Int2>(d) = As<Int2>(Pack(c0, c1));
+ *Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
s += 2 * sBytes;
d += 2 * dBytes;
@@ -300,9 +300,9 @@
break;
case FORMAT_A16B16G16R16:
{
- UShort4 c = As<UShort4>(Swizzle(*Pointer<Short4>(s), 0xC6)) >> 8;
+ Short4 c = As<UShort4>(Swizzle(*Pointer<Short4>(s), 0xC6)) >> 8;
- *Pointer<Int>(d) = Int(As<Int2>(Pack(c, c)));
+ *Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
}
break;
case FORMAT_R5G6B5:
@@ -361,10 +361,10 @@
case FORMAT_A16B16G16R16:
For(, x < width - 1, x += 2)
{
- UShort4 c0 = *Pointer<UShort4>(s + 0) >> 8;
- UShort4 c1 = *Pointer<UShort4>(s + 8) >> 8;
+ Short4 c0 = *Pointer<UShort4>(s + 0) >> 8;
+ Short4 c1 = *Pointer<UShort4>(s + 8) >> 8;
- *Pointer<Int2>(d) = As<Int2>(Pack(c0, c1));
+ *Pointer<Int2>(d) = As<Int2>(PackUnsigned(c0, c1));
s += 2 * sBytes;
d += 2 * dBytes;
@@ -408,9 +408,9 @@
break;
case FORMAT_A16B16G16R16:
{
- UShort4 c = *Pointer<UShort4>(s) >> 8;
+ Short4 c = *Pointer<UShort4>(s) >> 8;
- *Pointer<Int>(d) = Int(As<Int2>(Pack(c, c)));
+ *Pointer<Int>(d) = Int(As<Int2>(PackUnsigned(c, c)));
}
break;
case FORMAT_R5G6B5:
@@ -503,8 +503,8 @@
break;
case FORMAT_A16B16G16R16:
{
- UShort4 cc = *Pointer<UShort4>(s) >> 8;
- Int c = Int(As<Int2>(Pack(cc, cc)));
+ Short4 cc = *Pointer<UShort4>(s) >> 8;
+ Int c = Int(As<Int2>(PackUnsigned(cc, cc)));
*Pointer<Short>(d) = Short((c & 0x00F80000) >> 19 |
(c & 0x0000FC00) >> 5 |
@@ -615,7 +615,7 @@
{
case FORMAT_X8R8G8B8:
case FORMAT_A8R8G8B8:
- *Pointer<Byte4>(d) = Byte4(Pack(As<UShort4>(c1), As<UShort4>(c1)));
+ *Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
break;
case FORMAT_X8B8G8R8:
case FORMAT_A8B8G8R8:
@@ -624,12 +624,12 @@
{
c1 = Swizzle(c1, 0xC6);
- *Pointer<Byte4>(d) = Byte4(Pack(As<UShort4>(c1), As<UShort4>(c1)));
+ *Pointer<Byte4>(d) = Byte4(PackUnsigned(c1, c1));
}
break;
case FORMAT_R8G8B8:
{
- Int c = Int(As<Int2>(Pack(As<UShort4>(c1), As<UShort4>(c1))));
+ Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
*Pointer<Byte>(d + 0) = Byte(c >> 0);
*Pointer<Byte>(d + 1) = Byte(c >> 8);
@@ -638,7 +638,7 @@
break;
case FORMAT_R5G6B5:
{
- Int c = Int(As<Int2>(Pack(As<UShort4>(c1), As<UShort4>(c1))));
+ Int c = Int(As<Int2>(PackUnsigned(c1, c1)));
*Pointer<Short>(d) = Short((c & 0x00F80000) >> 8 |
(c & 0x0000FC00) >> 5 |
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 8abb17a..59e7e09 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2791,7 +2791,7 @@
RValue<Short4> RoundShort4(RValue<Float4> cast)
{
RValue<Int4> int4 = RoundInt(cast);
- return As<Short4>(Pack(int4, int4));
+ return As<Short4>(PackSigned(int4, int4));
}
RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
@@ -2824,13 +2824,20 @@
return x86::pmaddwd(x, y);
}
- RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
+ RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
{
auto result = x86::packsswb(x, y);
return As<SByte8>(Swizzle(As<Int4>(result), 0x88));
}
+ RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+ {
+ auto result = x86::packuswb(x, y);
+
+ return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
+ }
+
RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
{
int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11}; // Real type is v8i16
@@ -2899,7 +2906,7 @@
if(CPUID::supportsSSE4_1())
{
Int4 int4(Min(cast, Float4(0xFFFF))); // packusdw takes care of 0x0000 saturation
- *this = As<Short4>(Pack(As<UInt4>(int4), As<UInt4>(int4)));
+ *this = As<Short4>(PackUnsigned(int4, int4));
}
else
{
@@ -3093,13 +3100,6 @@
return x86::pavgw(x, y);
}
- RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
- {
- auto result = x86::packuswb(x, y);
-
- return As<Byte8>(Swizzle(As<Int4>(result), 0x88));
- }
-
Type *UShort4::getType()
{
return T(Type_v4i16);
@@ -4846,11 +4846,16 @@
return x86::cvtps2dq(cast);
}
- RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
+ RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
{
return x86::packssdw(x, y);
}
+ RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+ {
+ return x86::packusdw(x, y);
+ }
+
RValue<Int> Extract(RValue<Int4> x, int i)
{
return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
@@ -5180,11 +5185,6 @@
}
}
- RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
- {
- return x86::packusdw(As<Int4>(x), As<Int4>(y));
- }
-
Type *UInt4::getType()
{
return T(llvm::VectorType::get(T(UInt::getType()), 4));
@@ -6205,7 +6205,7 @@
return As<SByte8>(V(::builder->CreateCall2(packsswb, x.value, y.value)));
}
- RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
+ RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
{
llvm::Function *packuswb = llvm::Intrinsic::getDeclaration(::module, llvm::Intrinsic::x86_sse2_packuswb_128);
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 3f9fb3d..bd2ce7f 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -797,7 +797,8 @@
RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y);
RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y);
RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y);
- RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y);
+ RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y);
+ RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y);
RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y);
RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y);
RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select);
@@ -866,7 +867,6 @@
RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y);
RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y);
RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y);
- RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y);
class Short8 : public LValue<Short8>
{
@@ -1831,7 +1831,8 @@
RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
RValue<Int4> RoundInt(RValue<Float4> cast);
- RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y);
+ RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
+ RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
RValue<Int> Extract(RValue<Int4> val, int i);
RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
RValue<Int> SignMask(RValue<Int4> x);
@@ -1911,7 +1912,6 @@
RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y);
RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y);
// RValue<UInt4> RoundInt(RValue<Float4> cast);
- RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y);
class Float : public LValue<Float>
{
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index a770981..d67b182 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -2716,7 +2716,7 @@
return RValue<Byte8>(Nucleus::createInsertElement(val.value, element.value, i));
}
- RValue<Byte> Saturate(RValue<UShort> x)
+ RValue<Byte> SaturateUnsigned(RValue<Short> x)
{
- return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), Int(x)));
+ return Byte(IfThenElse(Int(x) > 0xFF, Int(0xFF), IfThenElse(Int(x) < 0, Int(0), Int(x)))); // Input is signed: clamp to [0x00, 0xFF] so negative values saturate to 0 instead of wrapping
}
@@ -2726,14 +2726,14 @@
if(emulateIntrinsics)
{
Byte8 result;
- result = Insert(result, Saturate(UShort(Int(Extract(x, 0))) + UShort(Int(Extract(y, 0)))), 0);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 1))) + UShort(Int(Extract(y, 1)))), 1);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 2))) + UShort(Int(Extract(y, 2)))), 2);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 3))) + UShort(Int(Extract(y, 3)))), 3);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 4))) + UShort(Int(Extract(y, 4)))), 4);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 5))) + UShort(Int(Extract(y, 5)))), 5);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 6))) + UShort(Int(Extract(y, 6)))), 6);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 7))) + UShort(Int(Extract(y, 7)))), 7);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
return result;
}
@@ -2756,14 +2756,14 @@
if(emulateIntrinsics)
{
Byte8 result;
- result = Insert(result, Saturate(UShort(Int(Extract(x, 0))) - UShort(Int(Extract(y, 0)))), 0);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 1))) - UShort(Int(Extract(y, 1)))), 1);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 2))) - UShort(Int(Extract(y, 2)))), 2);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 3))) - UShort(Int(Extract(y, 3)))), 3);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 4))) - UShort(Int(Extract(y, 4)))), 4);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 5))) - UShort(Int(Extract(y, 5)))), 5);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 6))) - UShort(Int(Extract(y, 6)))), 6);
- result = Insert(result, Saturate(UShort(Int(Extract(x, 7))) - UShort(Int(Extract(y, 7)))), 7);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+ result = Insert(result, SaturateUnsigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
return result;
}
@@ -3043,7 +3043,7 @@
return RValue<SByte8>(Nucleus::createNot(val.value));
}
- RValue<SByte> Saturate(RValue<Short> x)
+ RValue<SByte> SaturateSigned(RValue<Short> x)
{
return SByte(IfThenElse(Int(x) > 0x7F, Int(0x7F), IfThenElse(Int(x) < -0x80, Int(0x80), Int(x))));
}
@@ -3053,14 +3053,14 @@
if(emulateIntrinsics)
{
SByte8 result;
- result = Insert(result, Saturate(Short(Int(Extract(x, 0))) + Short(Int(Extract(y, 0)))), 0);
- result = Insert(result, Saturate(Short(Int(Extract(x, 1))) + Short(Int(Extract(y, 1)))), 1);
- result = Insert(result, Saturate(Short(Int(Extract(x, 2))) + Short(Int(Extract(y, 2)))), 2);
- result = Insert(result, Saturate(Short(Int(Extract(x, 3))) + Short(Int(Extract(y, 3)))), 3);
- result = Insert(result, Saturate(Short(Int(Extract(x, 4))) + Short(Int(Extract(y, 4)))), 4);
- result = Insert(result, Saturate(Short(Int(Extract(x, 5))) + Short(Int(Extract(y, 5)))), 5);
- result = Insert(result, Saturate(Short(Int(Extract(x, 6))) + Short(Int(Extract(y, 6)))), 6);
- result = Insert(result, Saturate(Short(Int(Extract(x, 7))) + Short(Int(Extract(y, 7)))), 7);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) + Int(Extract(y, 0)))), 0);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) + Int(Extract(y, 1)))), 1);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) + Int(Extract(y, 2)))), 2);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) + Int(Extract(y, 3)))), 3);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) + Int(Extract(y, 4)))), 4);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) + Int(Extract(y, 5)))), 5);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) + Int(Extract(y, 6)))), 6);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) + Int(Extract(y, 7)))), 7);
return result;
}
@@ -3083,14 +3083,14 @@
if(emulateIntrinsics)
{
SByte8 result;
- result = Insert(result, Saturate(Short(Int(Extract(x, 0))) - Short(Int(Extract(y, 0)))), 0);
- result = Insert(result, Saturate(Short(Int(Extract(x, 1))) - Short(Int(Extract(y, 1)))), 1);
- result = Insert(result, Saturate(Short(Int(Extract(x, 2))) - Short(Int(Extract(y, 2)))), 2);
- result = Insert(result, Saturate(Short(Int(Extract(x, 3))) - Short(Int(Extract(y, 3)))), 3);
- result = Insert(result, Saturate(Short(Int(Extract(x, 4))) - Short(Int(Extract(y, 4)))), 4);
- result = Insert(result, Saturate(Short(Int(Extract(x, 5))) - Short(Int(Extract(y, 5)))), 5);
- result = Insert(result, Saturate(Short(Int(Extract(x, 6))) - Short(Int(Extract(y, 6)))), 6);
- result = Insert(result, Saturate(Short(Int(Extract(x, 7))) - Short(Int(Extract(y, 7)))), 7);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 0)) - Int(Extract(y, 0)))), 0);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 1)) - Int(Extract(y, 1)))), 1);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 2)) - Int(Extract(y, 2)))), 2);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 3)) - Int(Extract(y, 3)))), 3);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 4)) - Int(Extract(y, 4)))), 4);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 5)) - Int(Extract(y, 5)))), 5);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 6)) - Int(Extract(y, 6)))), 6);
+ result = Insert(result, SaturateSigned(Short(Int(Extract(x, 7)) - Int(Extract(y, 7)))), 7);
return result;
}
@@ -3491,7 +3491,7 @@
RValue<Short4> RoundShort4(RValue<Float4> cast)
{
RValue<Int4> int4 = RoundInt(cast);
- return As<Short4>(Pack(int4, int4));
+ return As<Short4>(PackSigned(int4, int4));
}
RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
@@ -3520,7 +3520,7 @@
return RValue<Short4>(V(result));
}
- RValue<Short> Saturate(RValue<Int> x)
+ RValue<Short> SaturateSigned(RValue<Int> x)
{
return Short(IfThenElse(x > 0x7FFF, Int(0x7FFF), IfThenElse(x < -0x8000, Int(0x8000), x)));
}
@@ -3530,10 +3530,10 @@
if(emulateIntrinsics)
{
Short4 result;
- result = Insert(result, Saturate(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
- result = Insert(result, Saturate(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
- result = Insert(result, Saturate(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
- result = Insert(result, Saturate(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 0)) + Int(Extract(y, 0))), 0);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 1)) + Int(Extract(y, 1))), 1);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 2)) + Int(Extract(y, 2))), 2);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 3)) + Int(Extract(y, 3))), 3);
return result;
}
@@ -3556,10 +3556,10 @@
if(emulateIntrinsics)
{
Short4 result;
- result = Insert(result, Saturate(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
- result = Insert(result, Saturate(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
- result = Insert(result, Saturate(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
- result = Insert(result, Saturate(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 0)) - Int(Extract(y, 0))), 0);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 1)) - Int(Extract(y, 1))), 1);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 2)) - Int(Extract(y, 2))), 2);
+ result = Insert(result, SaturateSigned(Int(Extract(x, 3)) - Int(Extract(y, 3))), 3);
return result;
}
@@ -3627,19 +3627,19 @@
}
}
- RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
+ RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
{
if(emulateIntrinsics)
{
SByte8 result;
- result = Insert(result, Saturate(Extract(x, 0)), 0);
- result = Insert(result, Saturate(Extract(x, 1)), 1);
- result = Insert(result, Saturate(Extract(x, 2)), 2);
- result = Insert(result, Saturate(Extract(x, 3)), 3);
- result = Insert(result, Saturate(Extract(y, 0)), 4);
- result = Insert(result, Saturate(Extract(y, 1)), 5);
- result = Insert(result, Saturate(Extract(y, 2)), 6);
- result = Insert(result, Saturate(Extract(y, 3)), 7);
+ result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+ result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+ result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+ result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+ result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+ result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+ result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+ result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
return result;
}
@@ -3657,6 +3657,36 @@
}
}
+ RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
+ {
+ if(emulateIntrinsics)
+ {
+ Byte8 result;
+ result = Insert(result, SaturateUnsigned(Extract(x, 0)), 0);
+ result = Insert(result, SaturateUnsigned(Extract(x, 1)), 1);
+ result = Insert(result, SaturateUnsigned(Extract(x, 2)), 2);
+ result = Insert(result, SaturateUnsigned(Extract(x, 3)), 3);
+ result = Insert(result, SaturateUnsigned(Extract(y, 0)), 4);
+ result = Insert(result, SaturateUnsigned(Extract(y, 1)), 5);
+ result = Insert(result, SaturateUnsigned(Extract(y, 2)), 6);
+ result = Insert(result, SaturateUnsigned(Extract(y, 3)), 7);
+
+ return result;
+ }
+ else
+ {
+ Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
+ const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+ auto target = ::context->getConstantUndef(Ice::IceType_i32);
+ auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+ pack->addArg(x.value);
+ pack->addArg(y.value);
+ ::basicBlock->appendInst(pack);
+
+ return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x88));
+ }
+ }
+
RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
{
int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11}; // Real type is v8i16
@@ -3725,7 +3755,7 @@
if(CPUID::SSE4_1)
{
Int4 int4(Min(cast, Float4(0xFFFF))); // packusdw takes care of 0x0000 saturation
- *this = As<Short4>(Pack(As<UInt4>(int4), As<UInt4>(int4)));
+ *this = As<UShort4>(PackUnsigned(int4, int4));
}
else
{
@@ -4035,36 +4065,6 @@
assert(false && "UNIMPLEMENTED"); return RValue<UShort4>(V(nullptr));
}
- RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
- {
- if(emulateIntrinsics)
- {
- Byte8 result;
- result = Insert(result, Saturate(Extract(x, 0)), 0);
- result = Insert(result, Saturate(Extract(x, 1)), 1);
- result = Insert(result, Saturate(Extract(x, 2)), 2);
- result = Insert(result, Saturate(Extract(x, 3)), 3);
- result = Insert(result, Saturate(Extract(y, 0)), 4);
- result = Insert(result, Saturate(Extract(y, 1)), 5);
- result = Insert(result, Saturate(Extract(y, 2)), 6);
- result = Insert(result, Saturate(Extract(y, 3)), 7);
-
- return result;
- }
- else
- {
- Ice::Variable *result = ::function->makeVariable(Ice::IceType_v16i8);
- const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
- auto target = ::context->getConstantUndef(Ice::IceType_i32);
- auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
- pack->addArg(x.value);
- pack->addArg(y.value);
- ::basicBlock->appendInst(pack);
-
- return As<Byte8>(Swizzle(As<Int4>(V(result)), 0x88));
- }
- }
-
Type *UShort4::getType()
{
return T(Type_v4i16);
@@ -5931,19 +5931,19 @@
}
}
- RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
+ RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
{
if(emulateIntrinsics)
{
Short8 result;
- result = Insert(result, Saturate(Extract(x, 0)), 0);
- result = Insert(result, Saturate(Extract(x, 1)), 1);
- result = Insert(result, Saturate(Extract(x, 2)), 2);
- result = Insert(result, Saturate(Extract(x, 3)), 3);
- result = Insert(result, Saturate(Extract(y, 0)), 4);
- result = Insert(result, Saturate(Extract(y, 1)), 5);
- result = Insert(result, Saturate(Extract(y, 2)), 6);
- result = Insert(result, Saturate(Extract(y, 3)), 7);
+ result = Insert(result, SaturateSigned(Extract(x, 0)), 0);
+ result = Insert(result, SaturateSigned(Extract(x, 1)), 1);
+ result = Insert(result, SaturateSigned(Extract(x, 2)), 2);
+ result = Insert(result, SaturateSigned(Extract(x, 3)), 3);
+ result = Insert(result, SaturateSigned(Extract(y, 0)), 4);
+ result = Insert(result, SaturateSigned(Extract(y, 1)), 5);
+ result = Insert(result, SaturateSigned(Extract(y, 2)), 6);
+ result = Insert(result, SaturateSigned(Extract(y, 3)), 7);
return result;
}
@@ -5961,6 +5961,32 @@
}
}
+ RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
+ {
+ if(CPUID::SSE4_1) // Direct unsigned-saturating pack intrinsic
+ {
+ Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
+ const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
+ auto target = ::context->getConstantUndef(Ice::IceType_i32);
+ auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
+ pack->addArg(x.value);
+ pack->addArg(y.value);
+ ::basicBlock->appendInst(pack);
+
+ return RValue<UShort8>(V(result));
+ }
+ else // Emulate unsigned saturation using the signed pack
+ {
+ RValue<Int4> sx = As<Int4>(x);
+ RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000); // Zero out negative lanes, then bias by -0x8000
+
+ RValue<Int4> sy = As<Int4>(y);
+ RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000); // Zero out negative lanes, then bias by -0x8000
+
+ return As<UShort8>(PackSigned(bx, by) + Short8(0x8000u)); // Must be the signed pack (calling PackUnsigned here would recurse forever); add the bias back afterwards
+ }
+ }
+
RValue<Int> Extract(RValue<Int4> x, int i)
{
return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
@@ -6329,32 +6355,6 @@
return RValue<UInt4>(V(result));
}
- RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
- {
- if(CPUID::SSE4_1)
- {
- Ice::Variable *result = ::function->makeVariable(Ice::IceType_v8i16);
- const Ice::Intrinsics::IntrinsicInfo intrinsic = {Ice::Intrinsics::VectorPackUnsigned, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
- auto target = ::context->getConstantUndef(Ice::IceType_i32);
- auto pack = Ice::InstIntrinsicCall::create(::function, 2, result, target, intrinsic);
- pack->addArg(x.value);
- pack->addArg(y.value);
- ::basicBlock->appendInst(pack);
-
- return RValue<UShort8>(V(result));
- }
- else
- {
- RValue<Int4> sx = As<Int4>(x);
- RValue<Int4> bx = (sx & ~(sx >> 31)) - Int4(0x8000);
-
- RValue<Int4> sy = As<Int4>(y);
- RValue<Int4> by = (sy & ~(sy >> 31)) - Int4(0x8000);
-
- return As<UShort8>(Pack(bx, by) + Short8(0x8000u));
- }
- }
-
Type *UInt4::getType()
{
return T(Ice::IceType_v4i32);
diff --git a/src/Reactor/x86.hpp b/src/Reactor/x86.hpp
index 5e759b3..1b8786e 100644
--- a/src/Reactor/x86.hpp
+++ b/src/Reactor/x86.hpp
@@ -66,7 +66,7 @@
RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y);
RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y);
RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y);
- RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y);
+ RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y);
RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y);
diff --git a/src/Renderer/Blitter.cpp b/src/Renderer/Blitter.cpp
index 0c4a160..b2486fb 100644
--- a/src/Renderer/Blitter.cpp
+++ b/src/Renderer/Blitter.cpp
@@ -445,8 +445,8 @@
case FORMAT_A8R8G8B8:
if(writeRGBA)
{
- UShort4 c0 = As<UShort4>(RoundShort4(c.zyxw));
- *Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+ Short4 c0 = RoundShort4(c.zyxw);
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
}
else
{
@@ -460,8 +460,8 @@
case FORMAT_SRGB8_A8:
if(writeRGBA)
{
- UShort4 c0 = As<UShort4>(RoundShort4(c));
- *Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+ Short4 c0 = RoundShort4(c);
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
}
else
{
@@ -474,8 +474,8 @@
case FORMAT_X8R8G8B8:
if(writeRGBA)
{
- UShort4 c0 = As<UShort4>(RoundShort4(c.zyxw)) | UShort4(0x0000, 0x0000, 0x0000, 0xFFFFu);
- *Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+ Short4 c0 = RoundShort4(c.zyxw) | Short4(0x0000, 0x0000, 0x0000, 0x00FF); // 0x00FF, not 0xFFFF: PackUnsigned treats lanes as signed, so 0xFFFF (-1) would saturate to 0x00
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
}
else
{
@@ -489,8 +489,8 @@
case FORMAT_SRGB8_X8:
if(writeRGBA)
{
- UShort4 c0 = As<UShort4>(RoundShort4(c)) | UShort4(0x0000, 0x0000, 0x0000, 0xFFFFu);
- *Pointer<Byte4>(element) = Byte4(Pack(c0, c0));
+ Short4 c0 = RoundShort4(c) | Short4(0x0000, 0x0000, 0x0000, 0x00FF); // 0x00FF, not 0xFFFF: PackUnsigned treats lanes as signed, so 0xFFFF (-1) would saturate to 0x00
+ *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
}
else
{
diff --git a/src/Renderer/QuadRasterizer.cpp b/src/Renderer/QuadRasterizer.cpp
index 4721591..2855e16 100644
--- a/src/Renderer/QuadRasterizer.cpp
+++ b/src/Renderer/QuadRasterizer.cpp
@@ -284,7 +284,7 @@
for(unsigned int q = 0; q < state.multiSample; q++)
{
Short4 mask = CmpGT(xxxx, xLeft[q]) & CmpGT(xRight[q], xxxx);
- cMask[q] = SignMask(Pack(mask, mask)) & 0x0000000F;
+ cMask[q] = SignMask(PackSigned(mask, mask)) & 0x0000000F;
}
quad(cBuffer, zBuffer, sBuffer, cMask, x, y);
diff --git a/src/Shader/PixelPipeline.cpp b/src/Shader/PixelPipeline.cpp
index 66d6a09..4ed1830 100644
--- a/src/Shader/PixelPipeline.cpp
+++ b/src/Shader/PixelPipeline.cpp
@@ -1665,7 +1665,7 @@
void PixelPipeline::TEXKILL(Int cMask[4], Vector4s &src)
{
Short4 test = src.x | src.y | src.z;
- Int kill = SignMask(Pack(test, test)) ^ 0x0000000F;
+ Int kill = SignMask(PackSigned(test, test)) ^ 0x0000000F;
for(unsigned int q = 0; q < state.multiSample; q++)
{
diff --git a/src/Shader/PixelRoutine.cpp b/src/Shader/PixelRoutine.cpp
index 48a86a1..90e2073 100644
--- a/src/Shader/PixelRoutine.cpp
+++ b/src/Shader/PixelRoutine.cpp
@@ -549,29 +549,29 @@
break;
case ALPHA_EQUAL:
cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
- aMask = SignMask(Pack(cmp, Short4(0x0000)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
break;
case ALPHA_NOTEQUAL: // a != b ~ !(a == b)
cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
- aMask = SignMask(Pack(cmp, Short4(0x0000)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
break;
case ALPHA_LESS: // a < b ~ b > a
cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
- aMask = SignMask(Pack(cmp, Short4(0x0000)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
break;
case ALPHA_GREATEREQUAL: // a >= b ~ (a > b) || (a == b) ~ !(b > a) // TODO: Approximate
equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
cmp |= equal;
- aMask = SignMask(Pack(cmp, Short4(0x0000)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
break;
case ALPHA_LESSEQUAL: // a <= b ~ !(a > b)
cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu); // FIXME
- aMask = SignMask(Pack(cmp, Short4(0x0000)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
break;
case ALPHA_GREATER: // a > b
cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
- aMask = SignMask(Pack(cmp, Short4(0x0000)));
+ aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
break;
default:
ASSERT(false);
@@ -1452,8 +1452,8 @@
current.y = As<Short4>(As<UShort4>(current.y) >> 8);
current.z = As<Short4>(As<UShort4>(current.z) >> 8);
- current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
- current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+ current.z = As<Short4>(PackUnsigned(current.z, current.x));
+ current.y = As<Short4>(PackUnsigned(current.y, current.y));
current.x = current.z;
current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1469,8 +1469,8 @@
current.z = As<Short4>(As<UShort4>(current.z) >> 8);
current.w = As<Short4>(As<UShort4>(current.w) >> 8);
- current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
- current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
+ current.z = As<Short4>(PackUnsigned(current.z, current.x));
+ current.y = As<Short4>(PackUnsigned(current.y, current.w));
current.x = current.z;
current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1490,8 +1490,8 @@
current.y = As<Short4>(As<UShort4>(current.y) >> 8);
current.z = As<Short4>(As<UShort4>(current.z) >> 8);
- current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
- current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+ current.z = As<Short4>(PackUnsigned(current.x, current.z));
+ current.y = As<Short4>(PackUnsigned(current.y, current.y));
current.x = current.z;
current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1507,8 +1507,8 @@
current.z = As<Short4>(As<UShort4>(current.z) >> 8);
current.w = As<Short4>(As<UShort4>(current.w) >> 8);
- current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
- current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
+ current.z = As<Short4>(PackUnsigned(current.x, current.z));
+ current.y = As<Short4>(PackUnsigned(current.y, current.w));
current.x = current.z;
current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
@@ -1521,17 +1521,17 @@
case FORMAT_G8R8:
current.x = As<Short4>(As<UShort4>(current.x) >> 8);
current.y = As<Short4>(As<UShort4>(current.y) >> 8);
- current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
- current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
+ current.x = As<Short4>(PackUnsigned(current.x, current.x));
+ current.y = As<Short4>(PackUnsigned(current.y, current.y));
current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
break;
case FORMAT_R8:
current.x = As<Short4>(As<UShort4>(current.x) >> 8);
- current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
+ current.x = As<Short4>(PackUnsigned(current.x, current.x));
break;
case FORMAT_A8:
current.w = As<Short4>(As<UShort4>(current.w) >> 8);
- current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
+ current.w = As<Short4>(PackUnsigned(current.w, current.w));
break;
case FORMAT_G16R16:
current.z = current.x;
@@ -2367,11 +2367,11 @@
Short4 tmpCol = Short4(As<Int4>(oC.x));
if(state.targetFormat[index] == FORMAT_R8I)
{
- tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
+ tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
}
else
{
- tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
+ tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
}
packedCol = Extract(As<Int2>(tmpCol), 0);
@@ -2466,11 +2466,11 @@
if(state.targetFormat[index] == FORMAT_G8R8I)
{
- packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+ packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
}
else
{
- packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
+ packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
}
UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
@@ -2604,11 +2604,11 @@
if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
{
- packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
+ packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
}
else
{
- packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
+ packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
}
value = *Pointer<UInt2>(buffer, 16);
mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
@@ -2622,11 +2622,11 @@
if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
{
- packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
+ packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
}
else
{
- packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
+ packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
}
value = *Pointer<UInt2>(buffer, 16);
mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
diff --git a/src/Shader/SamplerCore.cpp b/src/Shader/SamplerCore.cpp
index 62f76fa..290f402 100644
--- a/src/Shader/SamplerCore.cpp
+++ b/src/Shader/SamplerCore.cpp
@@ -570,7 +570,7 @@
void SamplerCore::border(Short4 &mask, Float4 &coordinates)
{
Int4 border = As<Int4>(CmpLT(Abs(coordinates - Float4(0.5f)), Float4(0.5f)));
- mask = As<Short4>(Int2(As<Int4>(Pack(border, border))));
+ mask = As<Short4>(Int2(As<Int4>(PackSigned(border, border))));
}
void SamplerCore::border(Int4 &mask, Float4 &coordinates)
@@ -2271,7 +2271,7 @@
// Clamp
convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
- convert = As<Int4>(Pack(convert, convert));
+ convert = As<Int4>(PackSigned(convert, convert));
return As<Short4>(Int2(convert)) + Short4(0x8000u);
}