Subzero MulHigh implementation for Int4/UInt4
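Also add multiplication and arithmetic right shift operators for the Long type to both the LLVM and Subzero backends, extend the MulHigh unit test to cover Int4/UInt4, and remove the UInt4(unsigned int, unsigned int, unsigned int, unsigned int) constructor declaration from Reactor.hpp.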
Bug: b/126873455
Change-Id: I9952c2b9a3feca6a7741cd02e2295340935e4447
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/25988
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 6d6e9a4..27f5661 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -4560,6 +4560,16 @@
return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
}
+ RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
+ {
+ return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
+ }
+
+ RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
+ {
+ return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
+ }
+
RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
{
return lhs = lhs + rhs;
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 055b3a8..1bf1f7c 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -1099,14 +1099,14 @@
RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs);
RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs);
-// RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
+ RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
// RValue<Long> operator/(RValue<Long> lhs, RValue<Long> rhs);
// RValue<Long> operator%(RValue<Long> lhs, RValue<Long> rhs);
// RValue<Long> operator&(RValue<Long> lhs, RValue<Long> rhs);
// RValue<Long> operator|(RValue<Long> lhs, RValue<Long> rhs);
// RValue<Long> operator^(RValue<Long> lhs, RValue<Long> rhs);
// RValue<Long> operator<<(RValue<Long> lhs, RValue<Long> rhs);
-// RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
+ RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
RValue<Long> operator+=(Long &lhs, RValue<Long> rhs);
RValue<Long> operator-=(Long &lhs, RValue<Long> rhs);
// RValue<Long> operator*=(Long &lhs, RValue<Long> rhs);
@@ -1872,7 +1872,6 @@
UInt4(int x, int yzw);
UInt4(int x, int y, int zw);
UInt4(int x, int y, int z, int w);
- UInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
UInt4(RValue<UInt4> rhs);
UInt4(const UInt4 &rhs);
UInt4(const Reference<UInt4> &rhs);
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index 3d35802..c0abca1 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -925,14 +925,29 @@
{
Pointer<Byte> out = function.Arg<0>();
- *Pointer<Short4>(out + 8 * 0) =
- MulHigh(Short4(0x1aa, 0x2dd, 0x3ee, 0xF422),
- Short4(0x1bb, 0x2cc, 0x3ff, 0xF411));
- *Pointer<UShort4>(out + 8 * 1) =
- MulHigh(UShort4(0x1aa, 0x2dd, 0x3ee, 0xF422),
- UShort4(0x1bb, 0x2cc, 0x3ff, 0xF411));
+ *Pointer<Short4>(out + 16 * 0) =
+ MulHigh(Short4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+ Short4(0x01BB, 0x02CC, 0x03FF, 0xF411));
+ *Pointer<UShort4>(out + 16 * 1) =
+ MulHigh(UShort4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+ UShort4(0x01BB, 0x02CC, 0x03FF, 0xF411));
- // (U)Short8 variants are mentioned but unimplemented
+ *Pointer<Int4>(out + 16 * 2) =
+ MulHigh(Int4(0x000001AA, 0x000002DD, 0xC8000000, 0xF8000000),
+ Int4(0x000001BB, 0x84000000, 0x000003EE, 0xD7000000));
+ *Pointer<UInt4>(out + 16 * 3) =
+ MulHigh(UInt4(0x000001AAu, 0x000002DDu, 0xC8000000u, 0xD8000000u),
+ UInt4(0x000001BBu, 0x84000000u, 0x000003EEu, 0xD7000000u));
+
+ *Pointer<Int4>(out + 16 * 4) =
+ MulHigh(Int4(0x7FFFFFFF, 0x7FFFFFFF, 0x80008000, 0xFFFFFFFF),
+ Int4(0x7FFFFFFF, 0x80000000, 0x80008000, 0xFFFFFFFF));
+ *Pointer<UInt4>(out + 16 * 5) =
+ MulHigh(UInt4(0x7FFFFFFFu, 0x7FFFFFFFu, 0x80008000u, 0xFFFFFFFFu),
+ UInt4(0x7FFFFFFFu, 0x80000000u, 0x80008000u, 0xFFFFFFFFu));
+
+ // (U)Short8 variants currently unimplemented.
+
Return(0);
}
@@ -940,7 +955,7 @@
if(routine)
{
- unsigned int out[2][2];
+ unsigned int out[6][4];
memset(&out, 0, sizeof(out));
@@ -948,10 +963,30 @@
callable(&out);
EXPECT_EQ(out[0][0], 0x00080002u);
- EXPECT_EQ(out[0][1], 0x008D000fu);
+ EXPECT_EQ(out[0][1], 0x008D000Fu);
EXPECT_EQ(out[1][0], 0x00080002u);
- EXPECT_EQ(out[1][1], 0xe8C0000Fu);
+ EXPECT_EQ(out[1][1], 0xE8C0000Fu);
+
+ EXPECT_EQ(out[2][0], 0x00000000u);
+ EXPECT_EQ(out[2][1], 0xFFFFFE9Cu);
+ EXPECT_EQ(out[2][2], 0xFFFFFF23u);
+ EXPECT_EQ(out[2][3], 0x01480000u);
+
+ EXPECT_EQ(out[3][0], 0x00000000u);
+ EXPECT_EQ(out[3][1], 0x00000179u);
+ EXPECT_EQ(out[3][2], 0x00000311u);
+ EXPECT_EQ(out[3][3], 0xB5680000u);
+
+ EXPECT_EQ(out[4][0], 0x3FFFFFFFu);
+ EXPECT_EQ(out[4][1], 0xC0000000u);
+ EXPECT_EQ(out[4][2], 0x3FFF8000u);
+ EXPECT_EQ(out[4][3], 0x00000000u);
+
+ EXPECT_EQ(out[5][0], 0x3FFFFFFFu);
+ EXPECT_EQ(out[5][1], 0x3FFFFFFFu);
+ EXPECT_EQ(out[5][2], 0x40008000u);
+ EXPECT_EQ(out[5][3], 0xFFFFFFFEu);
}
}
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index b4f1971..ee4b036 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -4087,6 +4087,52 @@
}
}
+ RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
+ {
+ // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+
+ // Scalarized implementation.
+ Int4 result;
+ result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
+ result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
+ result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
+ result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
+
+ return result;
+ }
+
+ RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
+ {
+ // TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+
+ if(false) // Partial product based implementation.
+ {
+ auto xh = x >> 16;
+ auto yh = y >> 16;
+ auto xl = x & UInt4(0x0000FFFF);
+ auto yl = y & UInt4(0x0000FFFF);
+ auto xlyh = xl * yh;
+ auto xhyl = xh * yl;
+ auto xlyhh = xlyh >> 16;
+ auto xhylh = xhyl >> 16;
+ auto xlyhl = xlyh & UInt4(0x0000FFFF);
+ auto xhyll = xhyl & UInt4(0x0000FFFF);
+ auto xlylh = (xl * yl) >> 16;
+ auto oflow = (xlyhl + xhyll + xlylh) >> 16;
+
+ return (xh * yh) + (xlyhh + xhylh) + oflow;
+ }
+
+ // Scalarized implementation.
+ Int4 result;
+ result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
+ result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
+ result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
+ result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
+
+ return As<UInt4>(result);
+ }
+
RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
{
assert(false && "UNIMPLEMENTED"); return RValue<UShort4>(V(nullptr));
@@ -4777,6 +4823,16 @@
return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
}
+ RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
+ {
+ return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
+ }
+
+ RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
+ {
+ return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
+ }
+
RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
{
return lhs = lhs + rhs;