Lower the rest of the vector arithmetic operations.
The instructions emitted by the lowering operations require memory
operands to be aligned to 16 bytes. Since there is no support for
aligning memory operands in Subzero, do the arithmetic in registers for
now.
Add vector arithmetic to the arith crosstest. Pass the -mstackrealign
parameter to the crosstest clang so that llc code called back from
Subzero code (helper calls) doesn't assume that the stack is aligned at
the entry to the call.
BUG=none
R=jvoung@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/397833002
diff --git a/crosstest/crosstest.py b/crosstest/crosstest.py
index be6c54c..c8e9442 100755
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -130,6 +130,7 @@
objs.append(bitcode)
linker = 'clang' if os.path.splitext(args.driver)[1] == '.c' else 'clang++'
- shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32', args.driver] +
- objs +
+ # TODO: Remove -mstackrealign after Subzero supports stack alignment.
+ shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32',
+ '-mstackrealign', args.driver] + objs +
['-lm', '-lpthread', '-o', os.path.join(args.dir, args.output)])
diff --git a/crosstest/test_arith.cpp b/crosstest/test_arith.cpp
index 18b4b57..ed6c9d3 100644
--- a/crosstest/test_arith.cpp
+++ b/crosstest/test_arith.cpp
@@ -10,7 +10,10 @@
uint8_t test##inst(uint8_t a, uint8_t b) { return a op b; } \
uint16_t test##inst(uint16_t a, uint16_t b) { return a op b; } \
uint32_t test##inst(uint32_t a, uint32_t b) { return a op b; } \
- uint64_t test##inst(uint64_t a, uint64_t b) { return a op b; }
+ uint64_t test##inst(uint64_t a, uint64_t b) { return a op b; } \
+ v4ui32 test##inst(v4ui32 a, v4ui32 b) { return a op b; } \
+ v8ui16 test##inst(v8ui16 a, v8ui16 b) { return a op b; } \
+ v16ui8 test##inst(v16ui8 a, v16ui8 b) { return a op b; }
UINTOP_TABLE
#undef X
@@ -19,12 +22,16 @@
int8_t test##inst(int8_t a, int8_t b) { return a op b; } \
int16_t test##inst(int16_t a, int16_t b) { return a op b; } \
int32_t test##inst(int32_t a, int32_t b) { return a op b; } \
- int64_t test##inst(int64_t a, int64_t b) { return a op b; }
+ int64_t test##inst(int64_t a, int64_t b) { return a op b; } \
+ v4si32 test##inst(v4si32 a, v4si32 b) { return a op b; } \
+ v8si16 test##inst(v8si16 a, v8si16 b) { return a op b; } \
+ v16si8 test##inst(v16si8 a, v16si8 b) { return a op b; }
SINTOP_TABLE
#undef X
#define X(inst, op, func) \
float test##inst(float a, float b) { return func(a op b); } \
- double test##inst(double a, double b) { return func(a op b); }
+ double test##inst(double a, double b) { return func(a op b); } \
+ v4f32 test##inst(v4f32 a, v4f32 b) { return func(a op b); }
FPOP_TABLE
#undef X
diff --git a/crosstest/test_arith.def b/crosstest/test_arith.def
index 4cf4596..019eb77 100644
--- a/crosstest/test_arith.def
+++ b/crosstest/test_arith.def
@@ -42,4 +42,27 @@
// instruction and "(a + b)" for the Fadd instruction. The two
// versions of myFrem() are defined in a separate bitcode file.
+// Brace initializer holding the integer test inputs: 0 and 1, plus the
+// values immediately around the 2^7, 2^8, 2^15, 2^16 and 2^31 boundaries
+// and the two largest 32-bit values.
+#define INT_VALUE_ARRAY \
+{ 0x0, 0x1, 0x7ffffffe, 0x7fffffff, \
+ 0x80000000, 0x80000001, 0xfffffffe, 0xffffffff, \
+ 0x7e, 0x7f, 0x80, 0x81, \
+ 0xfe, 0xff, 0x100, 0x101, \
+ 0x7ffe, 0x7fff, 0x8000, 0x8001, \
+ 0xfffe, 0xffff, 0x10000, 0x10001 }
+
+// Brace initializer holding the floating-point test inputs: small and large
+// integer-valued constants followed by the special values.
+//
+// The four arguments are expressions for negative infinity, positive
+// infinity, a negative NaN and a NaN, in that order; they are spliced into
+// the list as-is. FLT_MIN, FLT_MAX, DBL_MIN and DBL_MAX must be visible
+// (<cfloat>) where the macro is expanded.
+//
+// The fourth parameter is named Nan to match its use in the body. When it
+// was spelled NaN the argument was dropped and the expansion picked up
+// whatever identifier `Nan` happened to be in scope at the call site.
+#define FP_VALUE_ARRAY(NegInf, PosInf, NegNan, Nan) \
+{ 0, 1, 0x7e, \
+  0x7f, 0x80, 0x81, \
+  0xfe, 0xff, 0x7ffe, \
+  0x7fff, 0x8000, 0x8001, \
+  0xfffe, 0xffff, 0x7ffffffe, \
+  0x7fffffff, 0x80000000, 0x80000001, \
+  0xfffffffe, 0xffffffff, 0x100000000ll, \
+  0x100000001ll, 0x7ffffffffffffffell, 0x7fffffffffffffffll, \
+  0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell, \
+  0xffffffffffffffffll, NegInf, PosInf, \
+  Nan, NegNan, -0.0, \
+  FLT_MIN, FLT_MAX, DBL_MIN, \
+  DBL_MAX }
+
#endif // TEST_ARITH_DEF
diff --git a/crosstest/test_arith.h b/crosstest/test_arith.h
index b4c2c37..a12ea24 100644
--- a/crosstest/test_arith.h
+++ b/crosstest/test_arith.h
@@ -1,12 +1,24 @@
#include <stdint.h>
#include "test_arith.def"
+// Vector types. Each is a 16-byte vector declared with the vector_size
+// attribute. Names read v<lane count><si|ui|f><lane bits>: si = signed
+// integer lanes, ui = unsigned integer lanes, f = float lanes.
+typedef int32_t v4si32 __attribute__((vector_size(16)));
+typedef uint32_t v4ui32 __attribute__((vector_size(16)));
+typedef int16_t v8si16 __attribute__((vector_size(16)));
+typedef uint16_t v8ui16 __attribute__((vector_size(16)));
+typedef int8_t v16si8 __attribute__((vector_size(16)));
+typedef uint8_t v16ui8 __attribute__((vector_size(16)));
+typedef float v4f32 __attribute__((vector_size(16)));
+
#define X(inst, op, isdiv) \
bool test##inst(bool a, bool b); \
uint8_t test##inst(uint8_t a, uint8_t b); \
uint16_t test##inst(uint16_t a, uint16_t b); \
uint32_t test##inst(uint32_t a, uint32_t b); \
- uint64_t test##inst(uint64_t a, uint64_t b);
+ uint64_t test##inst(uint64_t a, uint64_t b); \
+ v4ui32 test##inst(v4ui32 a, v4ui32 b); \
+ v8ui16 test##inst(v8ui16 a, v8ui16 b); \
+ v16ui8 test##inst(v16ui8 a, v16ui8 b);
UINTOP_TABLE
#undef X
@@ -15,18 +27,24 @@
int8_t test##inst(int8_t a, int8_t b); \
int16_t test##inst(int16_t a, int16_t b); \
int32_t test##inst(int32_t a, int32_t b); \
- int64_t test##inst(int64_t a, int64_t b);
+ int64_t test##inst(int64_t a, int64_t b); \
+ v4si32 test##inst(v4si32 a, v4si32 b); \
+ v8si16 test##inst(v8si16 a, v8si16 b); \
+ v16si8 test##inst(v16si8 a, v16si8 b);
SINTOP_TABLE
#undef X
float myFrem(float a, float b);
double myFrem(double a, double b);
+v4f32 myFrem(v4f32 a, v4f32 b);
#define X(inst, op, func) \
float test##inst(float a, float b); \
- double test##inst(double a, double b);
+ double test##inst(double a, double b); \
+ v4f32 test##inst(v4f32 a, v4f32 b);
FPOP_TABLE
#undef X
float mySqrt(float a);
double mySqrt(double a);
+// mySqrt for v4f32 is currently unsupported.
diff --git a/crosstest/test_arith_frem.ll b/crosstest/test_arith_frem.ll
index 34b7156..bb0590d 100644
--- a/crosstest/test_arith_frem.ll
+++ b/crosstest/test_arith_frem.ll
@@ -9,3 +9,8 @@
%rem = frem double %a, %b
ret double %rem
}
+
+; Vector overload of myFrem: returns the frem of %a and %b, both
+; <4 x float>. The symbol is the C++-mangled name of the
+; myFrem(v4f32, v4f32) overload declared in test_arith.h.
+define <4 x float> @_Z6myFremDv4_fS_(<4 x float> %a, <4 x float> %b) {
+ %rem = frem <4 x float> %a, %b
+ ret <4 x float> %rem
+}
diff --git a/crosstest/test_arith_main.cpp b/crosstest/test_arith_main.cpp
index 8c53ad5..d9bbbe9 100644
--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
@@ -4,7 +4,10 @@
#include <stdint.h>
+#include <climits> // CHAR_BIT
+#include <limits>
#include <cfloat>
+#include <cmath> // fmodf
#include <cstring> // memcmp
#include <iostream>
@@ -16,14 +19,20 @@
#include "test_arith.h"
}
-volatile unsigned Values[] = { 0x0, 0x1, 0x7ffffffe, 0x7fffffff,
- 0x80000000, 0x80000001, 0xfffffffe, 0xffffffff,
- 0x7e, 0x7f, 0x80, 0x81,
- 0xfe, 0xff, 0x100, 0x101,
- 0x7ffe, 0x7fff, 0x8000, 0x8001,
- 0xfffe, 0xffff, 0x10000, 0x10001, };
+volatile unsigned Values[] = INT_VALUE_ARRAY;
const static size_t NumValues = sizeof(Values) / sizeof(*Values);
+// Reports whether dividing Value1 by Value2 could raise a hardware
+// exception: either the divisor is zero, or the dividend is the smallest
+// value of T and the divisor is -1 (overflow exception on x86-32).
+// TODO: adjust for other architectures.
+template <class T> bool inputsMayTriggerException(T Value1, T Value2) {
+  const bool DivisorIsZero = (Value2 == 0);
+  const bool QuotientOverflows =
+      (Value1 == std::numeric_limits<T>::min()) && (Value2 == -1);
+  return DivisorIsZero || QuotientOverflows;
+}
+
template <typename TypeUnsigned, typename TypeSigned>
void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
typedef TypeUnsigned (*FuncTypeUnsigned)(TypeUnsigned, TypeUnsigned);
@@ -48,9 +57,9 @@
(FuncTypeUnsigned)(FuncTypeSigned)Subzero_::test##inst, isdiv \
} \
,
- SINTOP_TABLE
+ SINTOP_TABLE
#undef X
- };
+ };
const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
if (sizeof(TypeUnsigned) <= sizeof(uint32_t)) {
@@ -62,12 +71,8 @@
TypeUnsigned Value1 = Values[i];
TypeUnsigned Value2 = Values[j];
// Avoid HW divide-by-zero exception.
- if (Funcs[f].ExcludeDivExceptions && Value2 == 0)
- continue;
- // Avoid HW overflow exception (on x86-32). TODO: adjust
- // for other architectures.
- if (Funcs[f].ExcludeDivExceptions && Value1 == 0x80000000 &&
- Value2 == 0xffffffff)
+ if (Funcs[f].ExcludeDivExceptions &&
+ inputsMayTriggerException<TypeSigned>(Value1, Value2))
continue;
++TotalTests;
TypeUnsigned ResultSz = Funcs[f].FuncSz(Value1, Value2);
@@ -76,9 +81,9 @@
++Passes;
} else {
++Failures;
- std::cout << "test" << Funcs[f].Name << (8 * sizeof(TypeUnsigned))
- << "(" << Value1 << ", " << Value2
- << "): sz=" << (unsigned)ResultSz
+ std::cout << "test" << Funcs[f].Name
+ << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
+ << ", " << Value2 << "): sz=" << (unsigned)ResultSz
<< " llc=" << (unsigned)ResultLlc << std::endl;
}
}
@@ -96,8 +101,8 @@
(((TypeUnsigned)Values[iHi]) << 32) + Values[iLo];
TypeUnsigned Value2 =
(((TypeUnsigned)Values[jHi]) << 32) + Values[jLo];
- // Avoid HW divide-by-zero exception.
- if (Funcs[f].ExcludeDivExceptions && Value2 == 0)
+ if (Funcs[f].ExcludeDivExceptions &&
+ inputsMayTriggerException<TypeSigned>(Value1, Value2))
continue;
++TotalTests;
TypeUnsigned ResultSz = Funcs[f].FuncSz(Value1, Value2);
@@ -107,8 +112,8 @@
} else {
++Failures;
std::cout << "test" << Funcs[f].Name
- << (8 * sizeof(TypeUnsigned)) << "(" << Value1 << ", "
- << Value2 << "): sz=" << (unsigned)ResultSz
+ << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
+ << ", " << Value2 << "): sz=" << (unsigned)ResultSz
<< " llc=" << (unsigned)ResultLlc << std::endl;
}
}
@@ -119,27 +124,112 @@
}
}
+// Vectors are deterministically constructed by selecting elements from
+// a pool of scalar values based on a pseudorandom sequence. Testing
+// all possible combinations of scalar values from the value table is
+// not tractable.
+// TODO: Replace with a portable PRNG from C++11.
+//
+// Multiplicative congruential generator:
+//   State' = (16807 * State) mod (2^31 - 1)
+// A fixed seed yields a fixed sequence, which keeps test runs reproducible.
+class PRNG {
+public:
+  // Seeds the generator. The state must stay in [1, 2^31 - 2]; a seed that
+  // is a multiple of the modulus would pin the sequence at 0 forever, so
+  // such a seed is replaced by 1.
+  PRNG(uint32_t Seed = 1) : State(Seed % Modulus ? Seed % Modulus : 1) {}
+
+  // Advances the generator by one step and returns the new state.
+  uint32_t operator()() {
+    // Lewis, Goodman, and Miller (1969)
+    // The product needs up to 46 bits, so multiply in 64 bits. A 32-bit
+    // multiply wraps before the reduction and no longer implements the
+    // generator above.
+    State = static_cast<uint32_t>(
+        (static_cast<uint64_t>(Multiplier) * State) % Modulus);
+    return State;
+  }
+
+private:
+  static const uint32_t Multiplier = 16807;
+  static const uint32_t Modulus = 2147483647; // 2^31 - 1
+  uint32_t State;
+};
+
+// Number of pseudorandom vector pairs tried for each tested function.
+const static size_t MaxTestsPerFunc = 100000;
+
+// Prints the lanes of Vect to std::cout, separated by ", ". The lane count
+// is sizeof(Type) / sizeof(ElementType), and each lane is converted to
+// CastType before it is printed.
+template <typename Type, typename ElementType, typename CastType>
+void outputVector(const Type Vect) {
+  const size_t LaneCount = sizeof(Type) / sizeof(ElementType);
+  size_t Lane = 0;
+  while (Lane < LaneCount) {
+    if (Lane != 0)
+      std::cout << ", ";
+    std::cout << static_cast<CastType>(Vect[Lane]);
+    ++Lane;
+  }
+}
+
+// Cross-tests the integer vector arithmetic functions.
+//
+// For every entry of UINTOP_TABLE and SINTOP_TABLE this runs MaxTestsPerFunc
+// iterations. Each iteration fills two vectors lane by lane with
+// pseudorandomly selected entries of the global Values[] table, calls both
+// test##inst and Subzero_::test##inst on them, and compares the two results
+// bytewise. A mismatch is counted in Failures and printed to std::cout.
+//
+// Template parameters: the unsigned and signed vector types, then the
+// matching unsigned and signed element types.
+// TotalTests, Passes and Failures are running counters updated in place.
+template <typename TypeUnsigned, typename TypeSigned,
+          typename ElementTypeUnsigned, typename ElementTypeSigned>
+void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef TypeUnsigned (*FuncTypeUnsigned)(TypeUnsigned, TypeUnsigned);
+  typedef TypeSigned (*FuncTypeSigned)(TypeSigned, TypeSigned);
+  // Signed functions are stored through the unsigned function-pointer type
+  // so that one table and one call site serve both signednesses.
+  static struct {
+    const char *Name;
+    FuncTypeUnsigned FuncLlc;
+    FuncTypeUnsigned FuncSz;
+    bool ExcludeDivExceptions; // for divide related tests
+  } Funcs[] = {
+#define X(inst, op, isdiv) \
+  { \
+    STR(inst), (FuncTypeUnsigned)test##inst, \
+        (FuncTypeUnsigned)Subzero_::test##inst, isdiv \
+  } \
+  ,
+      UINTOP_TABLE
+#undef X
+#define X(inst, op, isdiv) \
+  { \
+    STR(inst), (FuncTypeUnsigned)(FuncTypeSigned)test##inst, \
+        (FuncTypeUnsigned)(FuncTypeSigned)Subzero_::test##inst, isdiv \
+  } \
+  ,
+          SINTOP_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  const static size_t NumElementsInType =
+      sizeof(TypeUnsigned) / sizeof(ElementTypeUnsigned);
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    // Restart the pseudorandom sequence for each function.
+    PRNG Index;
+    for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+      // Initialize the test vectors.
+      TypeUnsigned Value1, Value2;
+      for (size_t j = 0; j < NumElementsInType;) {
+        // Select from the whole value table. Reducing the index modulo the
+        // lane count would only ever reach the first few table entries.
+        ElementTypeUnsigned Element1 = Values[Index() % NumValues];
+        ElementTypeUnsigned Element2 = Values[Index() % NumValues];
+        // A lane pair that could trap is redrawn; j does not advance.
+        if (Funcs[f].ExcludeDivExceptions &&
+            inputsMayTriggerException<ElementTypeSigned>(Element1, Element2))
+          continue;
+        // NOTE(review): shift counts are not reduced to the lane width, so
+        // the shift functions also see over-wide counts - confirm that both
+        // implementations are expected to agree on those.
+        Value1[j] = Element1;
+        Value2[j] = Element2;
+        ++j;
+      }
+      // Perform the test.
+      TypeUnsigned ResultSz = Funcs[f].FuncSz(Value1, Value2);
+      TypeUnsigned ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
+      ++TotalTests;
+      if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+        ++Passes;
+      } else {
+        // Count the mismatch so that it reaches the summary and the exit
+        // status, as the other test drivers do.
+        ++Failures;
+        std::cout << "test" << Funcs[f].Name << "v" << NumElementsInType << "i"
+                  << (CHAR_BIT * sizeof(ElementTypeUnsigned)) << "(";
+        outputVector<TypeUnsigned, ElementTypeUnsigned, unsigned>(Value1);
+        std::cout << ", ";
+        outputVector<TypeUnsigned, ElementTypeUnsigned, unsigned>(Value2);
+        std::cout << "): sz=";
+        outputVector<TypeUnsigned, ElementTypeUnsigned, unsigned>(ResultSz);
+        std::cout << " llc=";
+        outputVector<TypeUnsigned, ElementTypeUnsigned, unsigned>(ResultLlc);
+        std::cout << std::endl;
+      }
+    }
+  }
+}
+
template <typename Type>
void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
static const Type NegInf = -1.0 / 0.0;
static const Type PosInf = 1.0 / 0.0;
static const Type Nan = 0.0 / 0.0;
static const Type NegNan = -0.0 / 0.0;
- volatile Type Values[] = {
- 0, 1, 0x7e,
- 0x7f, 0x80, 0x81,
- 0xfe, 0xff, 0x7ffe,
- 0x7fff, 0x8000, 0x8001,
- 0xfffe, 0xffff, 0x7ffffffe,
- 0x7fffffff, 0x80000000, 0x80000001,
- 0xfffffffe, 0xffffffff, 0x100000000ll,
- 0x100000001ll, 0x7ffffffffffffffell, 0x7fffffffffffffffll,
- 0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell,
- 0xffffffffffffffffll, NegInf, PosInf,
- Nan, NegNan, -0.0,
- FLT_MIN, FLT_MAX,
- DBL_MIN, DBL_MAX
- };
+ volatile Type Values[] = FP_VALUE_ARRAY(NegInf, PosInf, NegNan, Nan);
const static size_t NumValues = sizeof(Values) / sizeof(*Values);
typedef Type (*FuncType)(Type, Type);
static struct {
@@ -152,7 +242,7 @@
,
FPOP_TABLE
#undef X
- };
+ };
const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
for (size_t f = 0; f < NumFuncs; ++f) {
@@ -169,8 +259,8 @@
} else {
++Failures;
std::cout << std::fixed << "test" << Funcs[f].Name
- << (8 * sizeof(Type)) << "(" << Value1 << ", " << Value2
- << "): sz=" << ResultSz << " llc=" << ResultLlc
+ << (CHAR_BIT * sizeof(Type)) << "(" << Value1 << ", "
+ << Value2 << "): sz=" << ResultSz << " llc=" << ResultLlc
<< std::endl;
}
}
@@ -186,14 +276,66 @@
++Passes;
} else {
++Failures;
- std::cout << std::fixed << "test_sqrt"
- << (8 * sizeof(Type)) << "(" << Value
- << "): sz=" << ResultSz << " llc=" << ResultLlc
+ std::cout << std::fixed << "test_sqrt" << (CHAR_BIT * sizeof(Type)) << "("
+ << Value << "): sz=" << ResultSz << " llc=" << ResultLlc
<< std::endl;
}
}
}
+// Cross-tests the v4f32 arithmetic functions.
+//
+// For every entry of FPOP_TABLE this runs MaxTestsPerFunc iterations. Each
+// iteration fills two v4f32 vectors with pseudorandomly selected entries of
+// the local Values[] table, calls both test##inst and Subzero_::test##inst
+// on them, and compares the two results bytewise. A mismatch is counted in
+// Failures and printed to std::cout.
+//
+// TotalTests, Passes and Failures are running counters updated in place.
+void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  static const float NegInf = -1.0 / 0.0;
+  static const float PosInf = 1.0 / 0.0;
+  static const float Nan = 0.0 / 0.0;
+  static const float NegNan = -0.0 / 0.0;
+  volatile float Values[] = FP_VALUE_ARRAY(NegInf, PosInf, NegNan, Nan);
+  const static size_t NumValues = sizeof(Values) / sizeof(*Values);
+  typedef v4f32 (*FuncType)(v4f32, v4f32);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {
+#define X(inst, op, func) \
+  { STR(inst), (FuncType)test##inst, (FuncType)Subzero_::test##inst } \
+  ,
+      FPOP_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+  const static size_t NumElementsInType = 4;
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    // Restart the pseudorandom sequence for each function.
+    PRNG Index;
+    for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
+      // Initialize the test vectors.
+      v4f32 Value1, Value2;
+      for (size_t j = 0; j < NumElementsInType; ++j) {
+        // Select from the whole value table. Reducing the index modulo the
+        // lane count would only ever reach the first four entries and never
+        // the infinities, NaNs, -0.0 or the FLT_/DBL_ limits.
+        Value1[j] = Values[Index() % NumValues];
+        Value2[j] = Values[Index() % NumValues];
+      }
+      // Perform the test.
+      v4f32 ResultSz = Funcs[f].FuncSz(Value1, Value2);
+      v4f32 ResultLlc = Funcs[f].FuncLlc(Value1, Value2);
+      ++TotalTests;
+      if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << std::fixed << "test" << Funcs[f].Name << "v4f32"
+                  << "(";
+        outputVector<v4f32, float, float>(Value1);
+        std::cout << ", ";
+        outputVector<v4f32, float, float>(Value2);
+        std::cout << "): sz=";
+        outputVector<v4f32, float, float>(ResultSz);
+        std::cout << " llc=";
+        outputVector<v4f32, float, float>(ResultLlc);
+        std::cout << std::endl;
+      }
+    }
+  }
+}
+
int main(int argc, char **argv) {
size_t TotalTests = 0;
size_t Passes = 0;
@@ -203,10 +345,49 @@
testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
testsInt<uint64_t, int64_t>(TotalTests, Passes, Failures);
+ testsVecInt<v4ui32, v4si32, uint32_t, int32_t>(TotalTests, Passes, Failures);
+ testsVecInt<v8ui16, v8si16, uint16_t, int16_t>(TotalTests, Passes, Failures);
+ testsVecInt<v16ui8, v16si8, uint8_t, int8_t>(TotalTests, Passes, Failures);
testsFp<float>(TotalTests, Passes, Failures);
testsFp<double>(TotalTests, Passes, Failures);
+ testsVecFp(TotalTests, Passes, Failures);
std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
<< " Failures=" << Failures << "\n";
return Failures;
}
+
+extern "C" {
+// Subzero helpers
+// Vector operations defined with C linkage, so the symbol names below are
+// not mangled. Each integer helper applies one operator to two vectors.
+// NOTE(review): presumably these are the out-of-line routines that
+// Subzero-generated code calls for operations it does not lower inline -
+// confirm the names against the lowering code.
+
+ // Four 32-bit lanes: shifts, division and remainder.
+ v4si32 Sz_shl_v4i32(v4si32 a, v4si32 b) { return a << b; }
+ v4si32 Sz_ashr_v4i32(v4si32 a, v4si32 b) { return a >> b; }
+ v4ui32 Sz_lshr_v4i32(v4ui32 a, v4ui32 b) { return a >> b; }
+ v4si32 Sz_sdiv_v4i32(v4si32 a, v4si32 b) { return a / b; }
+ v4ui32 Sz_udiv_v4i32(v4ui32 a, v4ui32 b) { return a / b; }
+ v4si32 Sz_srem_v4i32(v4si32 a, v4si32 b) { return a % b; }
+ v4ui32 Sz_urem_v4i32(v4ui32 a, v4ui32 b) { return a % b; }
+
+ // Eight 16-bit lanes: shifts, division and remainder.
+ v8si16 Sz_shl_v8i16(v8si16 a, v8si16 b) { return a << b; }
+ v8si16 Sz_ashr_v8i16(v8si16 a, v8si16 b) { return a >> b; }
+ v8ui16 Sz_lshr_v8i16(v8ui16 a, v8ui16 b) { return a >> b; }
+ v8si16 Sz_sdiv_v8i16(v8si16 a, v8si16 b) { return a / b; }
+ v8ui16 Sz_udiv_v8i16(v8ui16 a, v8ui16 b) { return a / b; }
+ v8si16 Sz_srem_v8i16(v8si16 a, v8si16 b) { return a % b; }
+ v8ui16 Sz_urem_v8i16(v8ui16 a, v8ui16 b) { return a % b; }
+
+ // Sixteen 8-bit lanes: the same set plus multiplication.
+ v16ui8 Sz_mul_v16i8(v16ui8 a, v16ui8 b) { return a * b; }
+ v16si8 Sz_shl_v16i8(v16si8 a, v16si8 b) { return a << b; }
+ v16si8 Sz_ashr_v16i8(v16si8 a, v16si8 b) { return a >> b; }
+ v16ui8 Sz_lshr_v16i8(v16ui8 a, v16ui8 b) { return a >> b; }
+ v16si8 Sz_sdiv_v16i8(v16si8 a, v16si8 b) { return a / b; }
+ v16ui8 Sz_udiv_v16i8(v16ui8 a, v16ui8 b) { return a / b; }
+ v16si8 Sz_srem_v16i8(v16si8 a, v16si8 b) { return a % b; }
+ v16ui8 Sz_urem_v16i8(v16ui8 a, v16ui8 b) { return a % b; }
+
+ // Floating-point remainder of four float lanes, one fmodf call per lane.
+ v4f32 Sz_frem_v4f32(v4f32 a, v4f32 b) {
+ v4f32 Result;
+ for (int i = 0; i < 4; ++i)
+ Result[i] = fmodf(a[i], b[i]);
+ return Result;
+ }
+}