Subzero MulHigh implementation for Int4/UInt4

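Also add multiplication (operator*) and arithmetic right shift
(operator>>) for the Long type in both the LLVM and Subzero backends;
the scalarized Subzero MulHigh implementation is built on them.

Extend the MulHigh unit test to cover Int4/UInt4, and remove the
UInt4(unsigned int, unsigned int, unsigned int, unsigned int)
constructor declaration from Reactor.hpp.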

Bug b/126873455

Change-Id: I9952c2b9a3feca6a7741cd02e2295340935e4447
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/25988
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index 6d6e9a4..27f5661 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -4560,6 +4560,16 @@
 		return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
+	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
+	{
+		return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
+	}
+
+	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
+	{
+		return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
+	}
+
 	RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
 	{
 		return lhs = lhs + rhs;
diff --git a/src/Reactor/Reactor.hpp b/src/Reactor/Reactor.hpp
index 055b3a8..1bf1f7c 100644
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -1099,14 +1099,14 @@
 
 	RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs);
 	RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs);
-//	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
+	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator/(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator%(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator&(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator|(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator^(RValue<Long> lhs, RValue<Long> rhs);
 //	RValue<Long> operator<<(RValue<Long> lhs, RValue<Long> rhs);
-//	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
+	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs);
 	RValue<Long> operator+=(Long &lhs, RValue<Long> rhs);
 	RValue<Long> operator-=(Long &lhs, RValue<Long> rhs);
 //	RValue<Long> operator*=(Long &lhs, RValue<Long> rhs);
@@ -1872,7 +1872,6 @@
 		UInt4(int x, int yzw);
 		UInt4(int x, int y, int zw);
 		UInt4(int x, int y, int z, int w);
-		UInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
 		UInt4(RValue<UInt4> rhs);
 		UInt4(const UInt4 &rhs);
 		UInt4(const Reference<UInt4> &rhs);
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index 3d35802..c0abca1 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -925,14 +925,29 @@
 		{
 			Pointer<Byte> out = function.Arg<0>();
 
-			*Pointer<Short4>(out + 8 * 0) =
-				MulHigh(Short4(0x1aa, 0x2dd, 0x3ee, 0xF422),
-					Short4(0x1bb, 0x2cc, 0x3ff, 0xF411));
-			*Pointer<UShort4>(out + 8 * 1) =
-				MulHigh(UShort4(0x1aa, 0x2dd, 0x3ee, 0xF422),
-					UShort4(0x1bb, 0x2cc, 0x3ff, 0xF411));
+			*Pointer<Short4>(out + 16 * 0) =
+				MulHigh(Short4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+				        Short4(0x01BB, 0x02CC, 0x03FF, 0xF411));
+			*Pointer<UShort4>(out + 16 * 1) =
+				MulHigh(UShort4(0x01AA, 0x02DD, 0x03EE, 0xF422),
+				        UShort4(0x01BB, 0x02CC, 0x03FF, 0xF411));
 
-			// (U)Short8 variants are mentioned but unimplemented
+			*Pointer<Int4>(out + 16 * 2) =
+				MulHigh(Int4(0x000001AA, 0x000002DD, 0xC8000000, 0xF8000000),
+				        Int4(0x000001BB, 0x84000000, 0x000003EE, 0xD7000000));
+			*Pointer<UInt4>(out + 16 * 3) =
+				MulHigh(UInt4(0x000001AAu, 0x000002DDu, 0xC8000000u, 0xD8000000u),
+				        UInt4(0x000001BBu, 0x84000000u, 0x000003EEu, 0xD7000000u));
+
+			*Pointer<Int4>(out + 16 * 4) =
+				MulHigh(Int4(0x7FFFFFFF, 0x7FFFFFFF, 0x80008000, 0xFFFFFFFF),
+				        Int4(0x7FFFFFFF, 0x80000000, 0x80008000, 0xFFFFFFFF));
+			*Pointer<UInt4>(out + 16 * 5) =
+				MulHigh(UInt4(0x7FFFFFFFu, 0x7FFFFFFFu, 0x80008000u, 0xFFFFFFFFu),
+				        UInt4(0x7FFFFFFFu, 0x80000000u, 0x80008000u, 0xFFFFFFFFu));
+
+			// (U)Short8 variants currently unimplemented.
+
 			Return(0);
 		}
 
@@ -940,7 +955,7 @@
 
 		if(routine)
 		{
-			unsigned int out[2][2];
+			unsigned int out[6][4];
 
 			memset(&out, 0, sizeof(out));
 
@@ -948,10 +963,30 @@
 			callable(&out);
 
 			EXPECT_EQ(out[0][0], 0x00080002u);
-			EXPECT_EQ(out[0][1], 0x008D000fu);
+			EXPECT_EQ(out[0][1], 0x008D000Fu);
 
 			EXPECT_EQ(out[1][0], 0x00080002u);
-			EXPECT_EQ(out[1][1], 0xe8C0000Fu);
+			EXPECT_EQ(out[1][1], 0xE8C0000Fu);
+
+			EXPECT_EQ(out[2][0], 0x00000000u);
+			EXPECT_EQ(out[2][1], 0xFFFFFE9Cu);
+			EXPECT_EQ(out[2][2], 0xFFFFFF23u);
+			EXPECT_EQ(out[2][3], 0x01480000u);
+
+			EXPECT_EQ(out[3][0], 0x00000000u);
+			EXPECT_EQ(out[3][1], 0x00000179u);
+			EXPECT_EQ(out[3][2], 0x00000311u);
+			EXPECT_EQ(out[3][3], 0xB5680000u);
+
+			EXPECT_EQ(out[4][0], 0x3FFFFFFFu);
+			EXPECT_EQ(out[4][1], 0xC0000000u);
+			EXPECT_EQ(out[4][2], 0x3FFF8000u);
+			EXPECT_EQ(out[4][3], 0x00000000u);
+
+			EXPECT_EQ(out[5][0], 0x3FFFFFFFu);
+			EXPECT_EQ(out[5][1], 0x3FFFFFFFu);
+			EXPECT_EQ(out[5][2], 0x40008000u);
+			EXPECT_EQ(out[5][3], 0xFFFFFFFEu);
 		}
 	}
 
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index b4f1971..ee4b036 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -4087,6 +4087,52 @@
 		}
 	}
 
+	RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
+	{
+		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuldq (SSE4.1), or pmuludq with a sign correction.
+
+		// Scalarized implementation.
+		Int4 result;
+		result = Insert(result, Int((Long(Extract(x, 0)) * Long(Extract(y, 0))) >> Long(Int(32))), 0);
+		result = Insert(result, Int((Long(Extract(x, 1)) * Long(Extract(y, 1))) >> Long(Int(32))), 1);
+		result = Insert(result, Int((Long(Extract(x, 2)) * Long(Extract(y, 2))) >> Long(Int(32))), 2);
+		result = Insert(result, Int((Long(Extract(x, 3)) * Long(Extract(y, 3))) >> Long(Int(32))), 3);
+
+		return result;
+	}
+
+	RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
+	{
+		// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
+
+		if(false)  // Partial product based implementation.
+		{
+			auto xh = x >> 16;
+			auto yh = y >> 16;
+			auto xl = x & UInt4(0x0000FFFF);
+			auto yl = y & UInt4(0x0000FFFF);
+			auto xlyh = xl * yh;
+			auto xhyl = xh * yl;
+			auto xlyhh = xlyh >> 16;
+			auto xhylh = xhyl >> 16;
+			auto xlyhl = xlyh & UInt4(0x0000FFFF);
+			auto xhyll = xhyl & UInt4(0x0000FFFF);
+			auto xlylh = (xl * yl) >> 16;
+			auto oflow = (xlyhl + xhyll + xlylh) >> 16;
+
+			return (xh * yh) + (xlyhh + xhylh) + oflow;
+		}
+
+		// Scalarized implementation.
+		Int4 result;
+		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 0))) * Long(UInt(Extract(As<Int4>(y), 0)))) >> Long(Int(32))), 0);
+		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 1))) * Long(UInt(Extract(As<Int4>(y), 1)))) >> Long(Int(32))), 1);
+		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 2))) * Long(UInt(Extract(As<Int4>(y), 2)))) >> Long(Int(32))), 2);
+		result = Insert(result, Int((Long(UInt(Extract(As<Int4>(x), 3))) * Long(UInt(Extract(As<Int4>(y), 3)))) >> Long(Int(32))), 3);
+
+		return As<UInt4>(result);
+	}
+
 	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
 	{
 		assert(false && "UNIMPLEMENTED"); return RValue<UShort4>(V(nullptr));
@@ -4777,6 +4823,16 @@
 		return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
 	}
 
+	RValue<Long> operator*(RValue<Long> lhs, RValue<Long> rhs)
+	{
+		return RValue<Long>(Nucleus::createMul(lhs.value, rhs.value));
+	}
+
+	RValue<Long> operator>>(RValue<Long> lhs, RValue<Long> rhs)
+	{
+		return RValue<Long>(Nucleus::createAShr(lhs.value, rhs.value));
+	}
+
 	RValue<Long> operator+=(Long &lhs, RValue<Long> rhs)
 	{
 		return lhs = lhs + rhs;