Eliminate EmulatedIntrinsics
Reactor's LLVM backend only used two 'emulated' intrinsics: Sinh() and
Cosh(), which can both just call the C/C++ math functions.
All of the Subzero transcendental intrinsics now also directly call the
math.h functions, and the gather/scatter and atomics implementations have
been moved into SubzeroReactor.cpp, eliminating EmulatedIntrinsics entirely.
Bug: b/169755552
Fixes: b/149110874
Change-Id: I9fdbf32c8a08200addc5763fd13d3da4a13696c8
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/62291
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/CMakeSettings.json b/CMakeSettings.json
index c3d2919..23c622f 100644
--- a/CMakeSettings.json
+++ b/CMakeSettings.json
@@ -10,7 +10,13 @@
"cmakeCommandArgs": "",
"buildCommandArgs": "-v",
"ctestCommandArgs": "",
- "variables": []
+ "variables": [
+ {
+ "name": "REACTOR_BACKEND",
+ "value": "Subzero",
+ "type": "STRING"
+ }
+ ]
},
{
"name": "x86-Debug",
@@ -21,8 +27,7 @@
"cmakeCommandArgs": "",
"buildCommandArgs": "-v",
"ctestCommandArgs": "",
- "inheritEnvironments": [ "msvc_x86" ],
- "variables": []
+ "inheritEnvironments": [ "msvc_x86" ]
},
{
"name": "x86-Release",
@@ -33,8 +38,7 @@
"cmakeCommandArgs": "",
"buildCommandArgs": "-v",
"ctestCommandArgs": "",
- "inheritEnvironments": [ "msvc_x86" ],
- "variables": []
+ "inheritEnvironments": [ "msvc_x86" ]
},
{
"name": "x64-Release",
@@ -45,8 +49,7 @@
"cmakeCommandArgs": "",
"buildCommandArgs": "-v",
"ctestCommandArgs": "",
- "inheritEnvironments": [ "msvc_x64_x64" ],
- "variables": []
+ "inheritEnvironments": [ "msvc_x64_x64" ]
}
]
}
\ No newline at end of file
diff --git a/src/Android.bp b/src/Android.bp
index b87eaed..a2f8d8a 100644
--- a/src/Android.bp
+++ b/src/Android.bp
@@ -34,7 +34,6 @@
"Reactor/Assert.cpp",
"Reactor/CPUID.cpp",
"Reactor/Debug.cpp",
- "Reactor/EmulatedIntrinsics.cpp",
"Reactor/ExecutableMemory.cpp",
"Reactor/LLVMJIT.cpp",
"Reactor/LLVMReactor.cpp",
diff --git a/src/Reactor/BUILD.gn b/src/Reactor/BUILD.gn
index e89fe65..676f99f 100644
--- a/src/Reactor/BUILD.gn
+++ b/src/Reactor/BUILD.gn
@@ -40,7 +40,6 @@
sources = [
"Assert.cpp",
"Debug.cpp",
- "EmulatedIntrinsics.cpp",
"ExecutableMemory.cpp",
"Pragma.cpp",
"Reactor.cpp",
diff --git a/src/Reactor/CMakeLists.txt b/src/Reactor/CMakeLists.txt
index a69a5af..8eb2034 100644
--- a/src/Reactor/CMakeLists.txt
+++ b/src/Reactor/CMakeLists.txt
@@ -22,8 +22,6 @@
Assert.hpp
Debug.cpp
Debug.hpp
- EmulatedIntrinsics.cpp
- EmulatedIntrinsics.hpp
ExecutableMemory.cpp
ExecutableMemory.hpp
Nucleus.hpp
diff --git a/src/Reactor/EmulatedIntrinsics.cpp b/src/Reactor/EmulatedIntrinsics.cpp
deleted file mode 100644
index d259cea..0000000
--- a/src/Reactor/EmulatedIntrinsics.cpp
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "EmulatedIntrinsics.hpp"
-
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <mutex>
-#include <utility>
-
-namespace rr {
-namespace {
-
-template<typename T>
-struct UnderlyingType
-{
- using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
-};
-
-template<typename T>
-using UnderlyingTypeT = typename UnderlyingType<T>::Type;
-
-// Call single arg function on a vector type
-template<typename Func, typename T>
-RValue<T> call4(Func func, const RValue<T> &x)
-{
- T result;
- result = Insert(result, Call(func, Extract(x, 0)), 0);
- result = Insert(result, Call(func, Extract(x, 1)), 1);
- result = Insert(result, Call(func, Extract(x, 2)), 2);
- result = Insert(result, Call(func, Extract(x, 3)), 3);
- return result;
-}
-
-// Call two arg function on a vector type
-template<typename Func, typename T>
-RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y)
-{
- T result;
- result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
- result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
- result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
- result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
- return result;
-}
-
-// Call three arg function on a vector type
-template<typename Func, typename T>
-RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y, const RValue<T> &z)
-{
- T result;
- result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0), Extract(z, 0)), 0);
- result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1), Extract(z, 1)), 1);
- result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2), Extract(z, 2)), 2);
- result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3), Extract(z, 3)), 3);
- return result;
-}
-
-template<typename T, typename EL = UnderlyingTypeT<T>>
-void gather(T &out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
-{
- constexpr bool atomic = false;
- constexpr std::memory_order order = std::memory_order_relaxed;
-
- Pointer<Byte> baseBytePtr = base;
-
- out = T(0);
- for(int i = 0; i < 4; i++)
- {
- If(Extract(mask, i) != 0)
- {
- auto offset = Extract(offsets, i);
- auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
- out = Insert(out, el, i);
- }
- Else If(zeroMaskedLanes)
- {
- out = Insert(out, EL(0), i);
- }
- }
-}
-
-template<typename T, typename EL = UnderlyingTypeT<T>>
-void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-{
- constexpr bool atomic = false;
- constexpr std::memory_order order = std::memory_order_relaxed;
-
- Pointer<Byte> baseBytePtr = base;
-
- for(int i = 0; i < 4; i++)
- {
- If(Extract(mask, i) != 0)
- {
- auto offset = Extract(offsets, i);
- Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
- }
- }
-}
-
-// TODO(b/148276653): Both atomicMin and atomicMax use a static (global) mutex that makes all min
-// operations for a given T mutually exclusive, rather than only the ones on the value pointed to
-// by ptr. Use a CAS loop, as is done for LLVMReactor's min/max atomic for Android.
-// TODO(b/148207274): Or, move this down into Subzero as a CAS-based operation.
-template<typename T>
-static T atomicMin(T *ptr, T value)
-{
- static std::mutex m;
-
- std::lock_guard<std::mutex> lock(m);
- T origValue = *ptr;
- *ptr = std::min(origValue, value);
- return origValue;
-}
-template<typename T>
-static T atomicMax(T *ptr, T value)
-{
- static std::mutex m;
-
- std::lock_guard<std::mutex> lock(m);
- T origValue = *ptr;
- *ptr = std::max(origValue, value);
- return origValue;
-}
-
-} // anonymous namespace
-
-namespace emulated {
-
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-{
- Float4 result{};
- gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
- return result;
-}
-
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
-{
- Int4 result{};
- gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
- return result;
-}
-
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-{
- scatter(base, val, offsets, mask, alignment);
-}
-
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
-{
- scatter<Int4>(base, val, offsets, mask, alignment);
-}
-
-RValue<Float> Exp2(RValue<Float> x)
-{
- return Call(exp2f, x);
-}
-
-RValue<Float> Log2(RValue<Float> x)
-{
- return Call(log2f, x);
-}
-
-RValue<Float4> Sin(RValue<Float4> x)
-{
- return call4(sinf, x);
-}
-
-RValue<Float4> Cos(RValue<Float4> x)
-{
- return call4(cosf, x);
-}
-
-RValue<Float4> Tan(RValue<Float4> x)
-{
- return call4(tanf, x);
-}
-
-RValue<Float4> Asin(RValue<Float4> x)
-{
- return call4(asinf, x);
-}
-
-RValue<Float4> Acos(RValue<Float4> x)
-{
- return call4(acosf, x);
-}
-
-RValue<Float4> Atan(RValue<Float4> x)
-{
- return call4(atanf, x);
-}
-
-RValue<Float4> Sinh(RValue<Float4> x)
-{
- // TODO(b/149110874) Use coshf/sinhf when we've implemented SpirV versions at the SpirV level
- return Float4(0.5f) * (emulated::Exp(x) - emulated::Exp(-x));
-}
-
-RValue<Float4> Cosh(RValue<Float4> x)
-{
- // TODO(b/149110874) Use coshf/sinhf when we've implemented SpirV versions at the SpirV level
- return Float4(0.5f) * (emulated::Exp(x) + emulated::Exp(-x));
-}
-
-RValue<Float4> Tanh(RValue<Float4> x)
-{
- return call4(tanhf, x);
-}
-
-RValue<Float4> Asinh(RValue<Float4> x)
-{
- return call4(asinhf, x);
-}
-
-RValue<Float4> Acosh(RValue<Float4> x)
-{
- return call4(acoshf, x);
-}
-
-RValue<Float4> Atanh(RValue<Float4> x)
-{
- return call4(atanhf, x);
-}
-
-RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
-{
- return call4(atan2f, x, y);
-}
-
-RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
-{
- return call4(powf, x, y);
-}
-
-RValue<Float4> Exp(RValue<Float4> x)
-{
- return call4(expf, x);
-}
-
-RValue<Float4> Log(RValue<Float4> x)
-{
- return call4(logf, x);
-}
-
-RValue<Float4> Exp2(RValue<Float4> x)
-{
- return call4(exp2f, x);
-}
-
-RValue<Float4> Log2(RValue<Float4> x)
-{
- return call4(log2f, x);
-}
-
-RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
-{
- return Call(atomicMin<int32_t>, x, y);
-}
-
-RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
-{
- return Call(atomicMin<uint32_t>, x, y);
-}
-
-RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
-{
- return Call(atomicMax<int32_t>, x, y);
-}
-
-RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
-{
- return Call(atomicMax<uint32_t>, x, y);
-}
-
-RValue<Float4> FRem(RValue<Float4> lhs, RValue<Float4> rhs)
-{
- return call4(fmodf, lhs, rhs);
-}
-
-RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
-{
- return call4(fmaf, x, y, z);
-}
-
-} // namespace emulated
-} // namespace rr
diff --git a/src/Reactor/EmulatedIntrinsics.hpp b/src/Reactor/EmulatedIntrinsics.hpp
deleted file mode 100644
index 4ca6224..0000000
--- a/src/Reactor/EmulatedIntrinsics.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "Reactor.hpp"
-
-// Implementation of intrinsics that are "emulated" - that is,
-// implemented either in terms of Reactor code, or make use of
-// rr::Call to C functions. These are typically slower than implementing
-// in terms of direct calls to the JIT backend; however, provide a good
-// starting point for implementing a new backend, or for when adding
-// functionality to an existing backend is non-trivial.
-
-namespace rr {
-namespace emulated {
-
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-RValue<Float> Exp2(RValue<Float> x);
-RValue<Float> Log2(RValue<Float> x);
-RValue<Float4> Sin(RValue<Float4> x);
-RValue<Float4> Cos(RValue<Float4> x);
-RValue<Float4> Tan(RValue<Float4> x);
-RValue<Float4> Asin(RValue<Float4> x);
-RValue<Float4> Acos(RValue<Float4> x);
-RValue<Float4> Atan(RValue<Float4> x);
-RValue<Float4> Sinh(RValue<Float4> x);
-RValue<Float4> Cosh(RValue<Float4> x);
-RValue<Float4> Tanh(RValue<Float4> x);
-RValue<Float4> Asinh(RValue<Float4> x);
-RValue<Float4> Acosh(RValue<Float4> x);
-RValue<Float4> Atanh(RValue<Float4> x);
-RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y);
-RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-RValue<Float4> Exp(RValue<Float4> x);
-RValue<Float4> Log(RValue<Float4> x);
-RValue<Float4> Exp2(RValue<Float4> x);
-RValue<Float4> Log2(RValue<Float4> x);
-RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder);
-RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder);
-RValue<Float4> FRem(RValue<Float4> lhs, RValue<Float4> rhs);
-RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z);
-
-} // namespace emulated
-} // namespace rr
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index a2781b7..14db765 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -16,7 +16,6 @@
#include "CPUID.hpp"
#include "Debug.hpp"
-#include "EmulatedIntrinsics.hpp"
#include "LLVMReactorDebugInfo.hpp"
#include "Print.hpp"
#include "Reactor.hpp"
@@ -3476,13 +3475,13 @@
RValue<Float4> Sinh(RValue<Float4> v)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Sinh(v);
+ return TransformFloat4PerElement(v, "sinhf");
}
RValue<Float4> Cosh(RValue<Float4> v)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Cosh(v);
+ return TransformFloat4PerElement(v, "coshf");
}
RValue<Float4> Tanh(RValue<Float4> v)
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 09d3ff3..dc38d8e 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -13,7 +13,6 @@
// limitations under the License.
#include "Debug.hpp"
-#include "EmulatedIntrinsics.hpp"
#include "Print.hpp"
#include "Reactor.hpp"
#include "ReactorDebugInfo.hpp"
@@ -51,6 +50,7 @@
#endif
#include <array>
+#include <cmath>
#include <iostream>
#include <limits>
#include <mutex>
@@ -1307,11 +1307,6 @@
return nullptr;
}
-RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
-{
- return emulated::FRem(lhs, rhs);
-}
-
Value *Nucleus::createShl(Value *lhs, Value *rhs)
{
RR_DEBUG_INFO_UPDATE_LOC();
@@ -3940,6 +3935,47 @@
storeValue(replicate);
}
+// Call single arg function on a vector type
+template<typename Func, typename T>
+static RValue<T> call4(Func func, const RValue<T> &x)
+{
+ T result;
+ result = Insert(result, Call(func, Extract(x, 0)), 0);
+ result = Insert(result, Call(func, Extract(x, 1)), 1);
+ result = Insert(result, Call(func, Extract(x, 2)), 2);
+ result = Insert(result, Call(func, Extract(x, 3)), 3);
+ return result;
+}
+
+// Call two arg function on a vector type
+template<typename Func, typename T>
+static RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y)
+{
+ T result;
+ result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0)), 0);
+ result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1)), 1);
+ result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2)), 2);
+ result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3)), 3);
+ return result;
+}
+
+// Call three arg function on a vector type
+template<typename Func, typename T>
+static RValue<T> call4(Func func, const RValue<T> &x, const RValue<T> &y, const RValue<T> &z)
+{
+ T result;
+ result = Insert(result, Call(func, Extract(x, 0), Extract(y, 0), Extract(z, 0)), 0);
+ result = Insert(result, Call(func, Extract(x, 1), Extract(y, 1), Extract(z, 1)), 1);
+ result = Insert(result, Call(func, Extract(x, 2), Extract(y, 2), Extract(z, 2)), 2);
+ result = Insert(result, Call(func, Extract(x, 3), Extract(y, 3), Extract(z, 3)), 3);
+ return result;
+}
+
+RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
+{
+ return call4(fmodf, lhs, rhs);
+}
+
RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
{
// TODO(b/214591655): Use FMA when available.
@@ -3949,7 +3985,7 @@
RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
{
// TODO(b/214591655): Use FMA instructions when available.
- return emulated::FMA(x, y, z);
+ return call4(fmaf, x, y, z);
}
RValue<Float4> Abs(RValue<Float4> x)
@@ -4325,148 +4361,203 @@
UNIMPLEMENTED("b/155867273 Subzero createMaskedStore()");
}
+template<typename T>
+struct UnderlyingType
+{
+ using Type = typename decltype(rr::Extract(std::declval<RValue<T>>(), 0))::rvalue_underlying_type;
+};
+
+template<typename T>
+using UnderlyingTypeT = typename UnderlyingType<T>::Type;
+
+template<typename T, typename EL = UnderlyingTypeT<T>>
+static void gather(T &out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
+{
+ constexpr bool atomic = false;
+ constexpr std::memory_order order = std::memory_order_relaxed;
+
+ Pointer<Byte> baseBytePtr = base;
+
+ out = T(0);
+ for(int i = 0; i < 4; i++)
+ {
+ If(Extract(mask, i) != 0)
+ {
+ auto offset = Extract(offsets, i);
+ auto el = Load(Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+ out = Insert(out, el, i);
+ }
+ Else If(zeroMaskedLanes)
+ {
+ out = Insert(out, EL(0), i);
+ }
+ }
+}
+
+template<typename T, typename EL = UnderlyingTypeT<T>>
+static void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+{
+ constexpr bool atomic = false;
+ constexpr std::memory_order order = std::memory_order_relaxed;
+
+ Pointer<Byte> baseBytePtr = base;
+
+ for(int i = 0; i < 4; i++)
+ {
+ If(Extract(mask, i) != 0)
+ {
+ auto offset = Extract(offsets, i);
+ Store(Extract(val, i), Pointer<EL>(&baseBytePtr[offset]), alignment, atomic, order);
+ }
+ }
+}
+
RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+ Float4 result{};
+ gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+ return result;
}
RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Gather(base, offsets, mask, alignment, zeroMaskedLanes);
+ Int4 result{};
+ gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
+ return result;
}
void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Scatter(base, val, offsets, mask, alignment);
+ scatter(base, val, offsets, mask, alignment);
}
void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Scatter(base, val, offsets, mask, alignment);
+ scatter<Int4>(base, val, offsets, mask, alignment);
}
RValue<Float> Exp2(RValue<Float> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Exp2(x);
+ return Call(exp2f, x);
}
RValue<Float> Log2(RValue<Float> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Log2(x);
+ return Call(log2f, x);
}
RValue<Float4> Sin(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Sin(x);
+ return call4(sinf, x);
}
RValue<Float4> Cos(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Cos(x);
+ return call4(cosf, x);
}
RValue<Float4> Tan(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Tan(x);
+ return call4(tanf, x);
}
RValue<Float4> Asin(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Asin(x);
+ return call4(asinf, x);
}
RValue<Float4> Acos(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Acos(x);
+ return call4(acosf, x);
}
RValue<Float4> Atan(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Atan(x);
+ return call4(atanf, x);
}
RValue<Float4> Sinh(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Sinh(x);
+ return call4(sinhf, x);
}
RValue<Float4> Cosh(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Cosh(x);
+ return call4(coshf, x);
}
RValue<Float4> Tanh(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Tanh(x);
+ return call4(tanhf, x);
}
RValue<Float4> Asinh(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Asinh(x);
+ return call4(asinhf, x);
}
RValue<Float4> Acosh(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Acosh(x);
+ return call4(acoshf, x);
}
RValue<Float4> Atanh(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Atanh(x);
+ return call4(atanhf, x);
}
RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Atan2(x, y);
+ return call4(atan2f, x, y);
}
RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Pow(x, y);
+ return call4(powf, x, y);
}
RValue<Float4> Exp(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Exp(x);
+ return call4(expf, x);
}
RValue<Float4> Log(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Log(x);
+ return call4(logf, x);
}
RValue<Float4> Exp2(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Exp2(x);
+ return call4(exp2f, x);
}
RValue<Float4> Log2(RValue<Float4> x)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::Log2(x);
+ return call4(log2f, x);
}
RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
@@ -4549,28 +4640,54 @@
}
}
+// TODO(b/148276653): Both atomicMin and atomicMax use a static (global) mutex that makes all min
+// operations for a given T mutually exclusive, rather than only the ones on the value pointed to
+// by ptr. Use a CAS loop, as is done for LLVMReactor's min/max atomic for Android.
+// TODO(b/148207274): Or, move this down into Subzero as a CAS-based operation.
+template<typename T>
+static T atomicMin(T *ptr, T value)
+{
+ static std::mutex m;
+
+ std::lock_guard<std::mutex> lock(m);
+ T origValue = *ptr;
+ *ptr = std::min(origValue, value);
+ return origValue;
+}
+
+template<typename T>
+static T atomicMax(T *ptr, T value)
+{
+ static std::mutex m;
+
+ std::lock_guard<std::mutex> lock(m);
+ T origValue = *ptr;
+ *ptr = std::max(origValue, value);
+ return origValue;
+}
+
RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::MinAtomic(x, y, memoryOrder);
+ return Call(atomicMin<int32_t>, x, y);
}
RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::MinAtomic(x, y, memoryOrder);
+ return Call(atomicMin<uint32_t>, x, y);
}
RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::MaxAtomic(x, y, memoryOrder);
+ return Call(atomicMax<int32_t>, x, y);
}
RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
{
RR_DEBUG_INFO_UPDATE_LOC();
- return emulated::MaxAtomic(x, y, memoryOrder);
+ return Call(atomicMax<uint32_t>, x, y);
}
void EmitDebugLocation()