Subzero: lower the rest of the atomic operations.
64-bit ops are expanded via a cmpxchg8b loop.
64/32-bit and/or/xor are also expanded into a cmpxchg /
cmpxchg8b loop.
Add a cross test for atomic RMW operations and
compare and swap.
Misc: Test that atomic.is.lock.free can be optimized out if the result is ignored.
TODO:
* optimize compare and swap with a compare+branch further down
the instruction stream.
* optimize atomic RMW when the return value is ignored
(adds a locked field to binary ops though).
* We may want to do some actual target-dependent basic
block splitting + expansion (the instructions inserted by
the expansion must reference the pre-colored registers,
etc.). Otherwise, we are currently getting by with modeling
the extended liveness of the variables used in the loops
using fake uses.
BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=jfb@chromium.org, stichnot@chromium.org
Review URL: https://codereview.chromium.org/362463002
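The 64-bit RMW lowering described above is the standard compare-and-swap retry
loop. As a rough reference, here is a minimal C++ sketch of the equivalent
logic, written with the same __sync builtins the new crosstest exercises (the
helper name atomic_add_u64 is invented for illustration; this is not Subzero's
internal lowering API):

  #include <stdint.h>

  // A 64-bit atomic add expanded as a compare-and-swap retry loop. On x86-32
  // the 64-bit CAS below compiles to a lock cmpxchg8b, so this mirrors the
  // loop Subzero emits: read the old value, compute old + val, and retry the
  // CAS until no other thread has modified the location in between.
  static uint64_t atomic_add_u64(volatile uint64_t *ptr, uint64_t val) {
    uint64_t old = *ptr;
    while (true) {
      uint64_t prev = __sync_val_compare_and_swap(ptr, old, old + val);
      if (prev == old)
        return old; // Success: return the pre-add value (fetch-and-add).
      old = prev;   // Failure: another thread won the race; retry.
    }
  }

The loop body corresponds to the mov/add/adc + lock cmpxchg8b + jne sequence
documented in expandAtomicRMWAsCmpxchg in IceTargetLoweringX8632.cpp below.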
diff --git a/crosstest/crosstest.py b/crosstest/crosstest.py
index a37b10f..be6c54c 100755
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -57,6 +57,11 @@
metavar='PATH',
help='Path to LLVM executables like llc ' +
'(defaults to $LLVM_BIN_PATH)')
+ argparser.add_argument('--crosstest-bitcode', required=False,
+ default=1, type=int,
+ help='Compile non-subzero crosstest object file ' +
+ 'from the same bitcode as the subzero object. ' +
+ 'If 0, then compile it straight from source.')
args = argparser.parse_args()
objs = []
@@ -113,7 +118,9 @@
# failures. This behavior can be inspected by switching
# use_llc between True and False.
use_llc = False
- if use_llc:
+ if not args.crosstest_bitcode:
+ objs.append(arg)
+ elif use_llc:
shellcmd([os.path.join(llvm_bin_path, 'llc'),
'-filetype=obj',
'-o=' + obj_llc,
@@ -125,4 +132,4 @@
linker = 'clang' if os.path.splitext(args.driver)[1] == '.c' else 'clang++'
shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32', args.driver] +
objs +
- ['-lm', '-o', os.path.join(args.dir, args.output)])
+ ['-lm', '-lpthread', '-o', os.path.join(args.dir, args.output)])
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index cf821e2..bba53d7 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -64,6 +64,17 @@
--driver=test_icmp_main.cpp \
--output=test_icmp_O${optlevel}
+ # Compile the non-subzero object files straight from source
+ # since the native LLVM backend does not understand how to
+ # lower NaCl-specific intrinsics.
+ ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+ --dir="${OUTDIR}" \
+ --llvm-bin-path="${LLVM_BIN_PATH}" \
+ --test=test_sync_atomic.cpp \
+ --crosstest-bitcode=0 \
+ --driver=test_sync_atomic_main.cpp \
+ --output=test_sync_atomic_O${optlevel}
+
done
for optlevel in ${OPTLEVELS} ; do
@@ -74,4 +85,5 @@
"${OUTDIR}"/test_fcmp_O${optlevel}
"${OUTDIR}"/test_global_O${optlevel}
"${OUTDIR}"/test_icmp_O${optlevel}
+ "${OUTDIR}"/test_sync_atomic_O${optlevel}
done
diff --git a/crosstest/test_sync_atomic.cpp b/crosstest/test_sync_atomic.cpp
new file mode 100644
index 0000000..05d0336
--- /dev/null
+++ b/crosstest/test_sync_atomic.cpp
@@ -0,0 +1,63 @@
+//===- subzero/crosstest/test_sync_atomic.cpp - Implementation for tests --===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This aims to test that all the atomic RMW instructions and compare and swap
+// work across the allowed atomic types. This uses the __sync_* builtins
+// to test the atomic operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#include <cstdlib>
+
+#include "test_sync_atomic.h"
+
+#define X(inst, type) \
+ type test_##inst(bool fetch_first, volatile type *ptr, type a) { \
+ if (fetch_first) { \
+ return __sync_fetch_and_##inst(ptr, a); \
+ } else { \
+ return __sync_##inst##_and_fetch(ptr, a); \
+ } \
+ } \
+ type test_alloca_##inst(bool fetch, volatile type *ptr, type a) { \
+ const size_t buf_size = 8; \
+ type buf[buf_size]; \
+ for (size_t i = 0; i < buf_size; ++i) { \
+ if (fetch) { \
+ buf[i] = __sync_fetch_and_##inst(ptr, a); \
+ } else { \
+ buf[i] = __sync_##inst##_and_fetch(ptr, a); \
+ } \
+ } \
+ type sum = 0; \
+ for (size_t i = 0; i < buf_size; ++i) { \
+ sum += buf[i]; \
+ } \
+ return sum; \
+ } \
+ type test_const_##inst(bool fetch, volatile type *ptr, type ign) { \
+ if (fetch) { \
+ return __sync_fetch_and_##inst(ptr, 42); \
+ } else { \
+ return __sync_##inst##_and_fetch(ptr, 99); \
+ } \
+ }
+
+FOR_ALL_RMWOP_TYPES(X)
+#undef X
+
+#define X(type) \
+ type test_val_cmp_swap(volatile type *ptr, type oldval, type newval) { \
+ return __sync_val_compare_and_swap(ptr, oldval, newval); \
+ }
+
+ATOMIC_TYPE_TABLE
+#undef X
diff --git a/crosstest/test_sync_atomic.def b/crosstest/test_sync_atomic.def
new file mode 100644
index 0000000..f84afde
--- /dev/null
+++ b/crosstest/test_sync_atomic.def
@@ -0,0 +1,50 @@
+//===- subzero/crosstest/test_sync_atomic.def - macros for tests -*- C++ -*-===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for testing atomic intrinsics (via sync builtins).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_SYNC_ATOMIC_DEF
+#define TEST_SYNC_ATOMIC_DEF
+
+#define STR(s) #s
+
+#define RMWOP_TABLE \
+ /* inst */ \
+ X(add) \
+ X(sub) \
+ X(or) \
+ X(and) \
+ X(xor)
+//#define X(inst)
+
+#define ATOMIC_TYPE_TABLE \
+ /* type */ \
+ X(uint8_t) \
+ X(uint16_t) \
+ X(uint32_t) \
+ X(uint64_t)
+//#define X(type)
+
+#define FOR_ALL_RMWTYPES_INST(F, inst) \
+ F(inst, uint8_t) \
+ F(inst, uint16_t) \
+ F(inst, uint32_t) \
+ F(inst, uint64_t)
+
+#define FOR_ALL_RMWOP_TYPES(X) \
+ FOR_ALL_RMWTYPES_INST(X, add) \
+ FOR_ALL_RMWTYPES_INST(X, sub) \
+ FOR_ALL_RMWTYPES_INST(X, or) \
+ FOR_ALL_RMWTYPES_INST(X, and) \
+ FOR_ALL_RMWTYPES_INST(X, xor)
+//#define X(inst, type)
+
+#endif // TEST_SYNC_ATOMIC_DEF
diff --git a/crosstest/test_sync_atomic.h b/crosstest/test_sync_atomic.h
new file mode 100644
index 0000000..a88cd73
--- /dev/null
+++ b/crosstest/test_sync_atomic.h
@@ -0,0 +1,29 @@
+//===- subzero/crosstest/test_sync_atomic.h - Test prototypes ---*- C++ -*-===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for cross testing atomic
+// intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "test_sync_atomic.def"
+
+#define X(inst, type) \
+ type test_##inst(bool fetch_first, volatile type *ptr, type a); \
+ type test_alloca_##inst(bool fetch, volatile type *ptr, type a); \
+ type test_const_##inst(bool fetch, volatile type *ptr, type ignored);
+
+FOR_ALL_RMWOP_TYPES(X)
+#undef X
+
+#define X(type) \
+ type test_val_cmp_swap(volatile type *ptr, type oldval, type newval);
+
+ATOMIC_TYPE_TABLE
+#undef X
diff --git a/crosstest/test_sync_atomic_main.cpp b/crosstest/test_sync_atomic_main.cpp
new file mode 100644
index 0000000..0cae7cd
--- /dev/null
+++ b/crosstest/test_sync_atomic_main.cpp
@@ -0,0 +1,298 @@
+//===- subzero/crosstest/test_sync_atomic_main.cpp - Driver for tests -----===//
+//
+// The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing atomic intrinsics, via the sync builtins.
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_sync_atomic.cpp --crosstest-bitcode=0 \
+ --driver=test_sync_atomic_main.cpp --prefix=Subzero_ \
+ --output=test_sync_atomic */
+
+#include <pthread.h>
+#include <stdint.h>
+
+#include <cerrno>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+// Include test_sync_atomic.h twice - once normally, and once within the
+// Subzero_ namespace, corresponding to the llc and Subzero translated
+// object files, respectively.
+#include "test_sync_atomic.h"
+namespace Subzero_ {
+#include "test_sync_atomic.h"
+}
+
+volatile uint64_t Values[] = {
+ 0, 1, 0x7e,
+ 0x7f, 0x80, 0x81,
+ 0xfe, 0xff, 0x7ffe,
+ 0x7fff, 0x8000, 0x8001,
+ 0xfffe, 0xffff,
+ 0x007fffff /*Max subnormal + */,
+ 0x00800000 /*Min+ */, 0x7f7fffff /*Max+ */,
+ 0x7f800000 /*+Inf*/, 0xff800000 /*-Inf*/,
+ 0x7fa00000 /*SNaN*/, 0x7fc00000 /*QNaN*/,
+ 0x7ffffffe, 0x7fffffff, 0x80000000,
+ 0x80000001, 0xfffffffe, 0xffffffff,
+ 0x100000000ll, 0x100000001ll,
+ 0x000fffffffffffffll /*Max subnormal + */,
+ 0x0010000000000000ll /*Min+ */,
+ 0x7fefffffffffffffll /*Max+ */,
+ 0x7ff0000000000000ll /*+Inf*/,
+ 0xfff0000000000000ll /*-Inf*/,
+ 0x7ff0000000000001ll /*SNaN*/,
+ 0x7ff8000000000000ll /*QNaN*/,
+ 0x7ffffffffffffffell, 0x7fffffffffffffffll, 0x8000000000000000ll,
+ 0x8000000000000001ll, 0xfffffffffffffffell, 0xffffffffffffffffll };
+
+const static size_t NumValues = sizeof(Values) / sizeof(*Values);
+
+struct {
+ volatile uint8_t l8;
+ volatile uint16_t l16;
+ volatile uint32_t l32;
+ volatile uint64_t l64;
+} AtomicLocs;
+
+template <typename Type>
+void testAtomicRMW(volatile Type *AtomicLoc,
+ size_t &TotalTests, size_t &Passes, size_t &Failures) {
+ typedef Type (*FuncType)(bool, volatile Type*, Type);
+ static struct {
+ const char *Name;
+ FuncType FuncLlc;
+ FuncType FuncSz;
+ } Funcs[] = {
+#define X(inst) \
+ { \
+ STR(inst), test_##inst, Subzero_::test_##inst \
+ }, \
+ { \
+ STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst \
+ }, \
+ { \
+ STR(inst) "_const", test_const_##inst, Subzero_::test_const_##inst \
+ },
+ RMWOP_TABLE
+#undef X
+ };
+ const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+ for (size_t f = 0; f < NumFuncs; ++f) {
+ for (size_t i = 0; i < NumValues; ++i) {
+ Type Value1 = static_cast<Type>(Values[i]);
+ for (size_t j = 0; j < NumValues; ++j) {
+ Type Value2 = static_cast<Type>(Values[j]);
+ for (size_t k = 0; k < 2; ++k) {
+ bool fetch_first = k;
+ ++TotalTests;
+ *AtomicLoc = Value1;
+ Type ResultSz1 = Funcs[f].FuncSz(
+ fetch_first, AtomicLoc, Value2);
+ Type ResultSz2 = *AtomicLoc;
+ *AtomicLoc = Value1;
+ Type ResultLlc1 = Funcs[f].FuncLlc(
+ fetch_first, AtomicLoc, Value2);
+ Type ResultLlc2 = *AtomicLoc;
+ if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) {
+ ++Passes;
+ } else {
+ ++Failures;
+ std::cout << "test_" << Funcs[f].Name
+ << (CHAR_BIT * sizeof(Type)) << "("
+ << static_cast<uint64_t>(Value1) << ", "
+ << static_cast<uint64_t>(Value2)
+ << "): sz1=" << static_cast<uint64_t>(ResultSz1)
+ << " llc1=" << static_cast<uint64_t>(ResultLlc1)
+ << " sz2=" << static_cast<uint64_t>(ResultSz2)
+ << " llc2=" << static_cast<uint64_t>(ResultLlc2)
+ << "\n";
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename Type>
+void testValCompareAndSwap(volatile Type *AtomicLoc, size_t &TotalTests,
+ size_t &Passes, size_t &Failures) {
+ for (size_t i = 0; i < NumValues; ++i) {
+ Type Value1 = static_cast<Type>(Values[i]);
+ for (size_t j = 0; j < NumValues; ++j) {
+ Type Value2 = static_cast<Type>(Values[j]);
+ for (size_t f = 0; f < 2; ++f) {
+ bool flip = f;
+ ++TotalTests;
+ *AtomicLoc = Value1;
+ Type ResultSz1 = Subzero_::test_val_cmp_swap(
+ AtomicLoc, flip ? Value2 : Value1, Value2);
+ Type ResultSz2 = *AtomicLoc;
+ *AtomicLoc = Value1;
+ Type ResultLlc1 = test_val_cmp_swap(
+ AtomicLoc, flip ? Value2 : Value1, Value2);
+ Type ResultLlc2 = *AtomicLoc;
+ if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) {
+ ++Passes;
+ } else {
+ ++Failures;
+ std::cout << "test_val_cmp_swap" << (CHAR_BIT * sizeof(Type)) << "("
+ << static_cast<uint64_t>(Value1) << ", "
+ << static_cast<uint64_t>(Value2)
+ << "): sz1=" << static_cast<uint64_t>(ResultSz1)
+ << " llc1=" << static_cast<uint64_t>(ResultLlc1)
+ << " sz2=" << static_cast<uint64_t>(ResultSz2)
+ << " llc2=" << static_cast<uint64_t>(ResultLlc2)
+ << "\n";
+ }
+ }
+ }
+ }
+}
+
+template <typename Type>
+struct ThreadData {
+ Type (*FuncPtr)(bool, volatile Type*, Type);
+ bool Fetch;
+ volatile Type *Ptr;
+ Type Adjustment;
+};
+
+template <typename Type>
+void *threadWrapper(void *Data) {
+ const size_t NumReps = 8000;
+ ThreadData<Type> *TData = reinterpret_cast<ThreadData<Type>*>(Data);
+ for (size_t i = 0; i < NumReps; ++i) {
+ (void)TData->FuncPtr(TData->Fetch, TData->Ptr, TData->Adjustment);
+ }
+ return NULL;
+}
+
+template <typename Type>
+void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
+ size_t &Passes, size_t &Failures) {
+ typedef Type (*FuncType)(bool, volatile Type*, Type);
+ static struct {
+ const char *Name;
+ FuncType FuncLlc;
+ FuncType FuncSz;
+ } Funcs[] = {
+#define X(inst) \
+ { \
+ STR(inst), test_##inst, Subzero_::test_##inst \
+ }, \
+ { \
+ STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst \
+ },
+ RMWOP_TABLE
+#undef X
+ };
+ const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+ // Just test a few values, otherwise it takes a *really* long time.
+ volatile uint64_t ValuesSubset[] = { 1, 0x7e, 0x000fffffffffffffffll };
+ const size_t NumValuesSubset = sizeof(ValuesSubset) / sizeof(*ValuesSubset);
+
+ for (size_t f = 0; f < NumFuncs; ++f) {
+ for (size_t i = 0; i < NumValuesSubset; ++i) {
+ Type Value1 = static_cast<Type>(ValuesSubset[i]);
+ for (size_t j = 0; j < NumValuesSubset; ++j) {
+ Type Value2 = static_cast<Type>(ValuesSubset[j]);
+ bool fetch_first = true;
+ ThreadData<Type> TDataSz = {
+ Funcs[f].FuncSz, fetch_first, AtomicLoc, Value2 };
+ ThreadData<Type> TDataLlc = {
+ Funcs[f].FuncLlc, fetch_first, AtomicLoc, Value2 };
+ ++TotalTests;
+ const size_t NumThreads = 4;
+ pthread_t t[NumThreads];
+
+ // Try N threads w/ just Llc.
+ *AtomicLoc = Value1;
+ for (size_t m = 0; m < NumThreads; ++m) {
+ pthread_create(&t[m], NULL, &threadWrapper<Type>,
+ reinterpret_cast<void *>(&TDataLlc));
+ }
+ for (size_t m = 0; m < NumThreads; ++m) {
+ pthread_join(t[m], NULL);
+ }
+ Type ResultLlc = *AtomicLoc;
+
+ // Try N threads w/ both Sz and Llc.
+ *AtomicLoc = Value1;
+ for (size_t m = 0; m < NumThreads; ++m) {
+ if (pthread_create(&t[m], NULL, &threadWrapper<Type>,
+ m % 2 == 0
+ ? reinterpret_cast<void *>(&TDataLlc)
+ : reinterpret_cast<void *>(&TDataSz)) != 0) {
+ ++Failures;
+ std::cout << "pthread_create failed w/ " << strerror(errno) << "\n";
+ abort();
+ }
+ }
+ for (size_t m = 0; m < NumThreads; ++m) {
+ if (pthread_join(t[m], NULL) != 0) {
+ ++Failures;
+ std::cout << "pthread_join failed w/ " << strerror(errno) << "\n";
+ abort();
+ }
+ }
+ Type ResultMixed = *AtomicLoc;
+
+ if (ResultLlc == ResultMixed) {
+ ++Passes;
+ } else {
+ ++Failures;
+ std::cout << "test_with_threads_" << Funcs[f].Name
+ << (8 * sizeof(Type)) << "("
+ << static_cast<uint64_t>(Value1) << ", "
+ << static_cast<uint64_t>(Value2)
+ << "): llc=" << static_cast<uint64_t>(ResultLlc)
+ << " mixed=" << static_cast<uint64_t>(ResultMixed)
+ << "\n";
+ }
+ }
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ size_t TotalTests = 0;
+ size_t Passes = 0;
+ size_t Failures = 0;
+
+ testAtomicRMW<uint8_t>(&AtomicLocs.l8, TotalTests, Passes, Failures);
+ testAtomicRMW<uint16_t>(&AtomicLocs.l16, TotalTests, Passes, Failures);
+ testAtomicRMW<uint32_t>(&AtomicLocs.l32, TotalTests, Passes, Failures);
+ testAtomicRMW<uint64_t>(&AtomicLocs.l64, TotalTests, Passes, Failures);
+ testValCompareAndSwap<uint8_t>(
+ &AtomicLocs.l8, TotalTests, Passes, Failures);
+ testValCompareAndSwap<uint16_t>(
+ &AtomicLocs.l16, TotalTests, Passes, Failures);
+ testValCompareAndSwap<uint32_t>(
+ &AtomicLocs.l32, TotalTests, Passes, Failures);
+ testValCompareAndSwap<uint64_t>(
+ &AtomicLocs.l64, TotalTests, Passes, Failures);
+ testAtomicRMWThreads<uint8_t>(
+ &AtomicLocs.l8, TotalTests, Passes, Failures);
+ testAtomicRMWThreads<uint16_t>(
+ &AtomicLocs.l16, TotalTests, Passes, Failures);
+ testAtomicRMWThreads<uint32_t>(
+ &AtomicLocs.l32, TotalTests, Passes, Failures);
+ testAtomicRMWThreads<uint64_t>(
+ &AtomicLocs.l64, TotalTests, Passes, Failures);
+
+ std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+ << " Failures=" << Failures << "\n";
+ return Failures;
+}
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index cd5095f..c0e8c8d 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -51,9 +51,8 @@
llvm::array_lengthof(TypeX8632Attributes);
const char *InstX8632SegmentRegNames[] = {
-#define X(val, name) \
- name,
- SEG_REGX8632_TABLE
+#define X(val, name) name,
+ SEG_REGX8632_TABLE
#undef X
};
const size_t InstX8632SegmentRegNamesSize =
@@ -140,6 +139,33 @@
addSource(Source);
}
+InstX8632Cmpxchg::InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr,
+ Variable *Eax, Variable *Desired,
+ bool Locked)
+ : InstX8632Lockable(Func, InstX8632::Cmpxchg, 3,
+ llvm::dyn_cast<Variable>(DestOrAddr), Locked) {
+ assert(Eax->getRegNum() == TargetX8632::Reg_eax);
+ addSource(DestOrAddr);
+ addSource(Eax);
+ addSource(Desired);
+}
+
+InstX8632Cmpxchg8b::InstX8632Cmpxchg8b(Cfg *Func, OperandX8632 *Addr,
+ Variable *Edx, Variable *Eax,
+ Variable *Ecx, Variable *Ebx,
+ bool Locked)
+ : InstX8632Lockable(Func, InstX8632::Cmpxchg8b, 5, NULL, Locked) {
+ assert(Edx->getRegNum() == TargetX8632::Reg_edx);
+ assert(Eax->getRegNum() == TargetX8632::Reg_eax);
+ assert(Ecx->getRegNum() == TargetX8632::Reg_ecx);
+ assert(Ebx->getRegNum() == TargetX8632::Reg_ebx);
+ addSource(Addr);
+ addSource(Edx);
+ addSource(Eax);
+ addSource(Ecx);
+ addSource(Ebx);
+}
+
InstX8632Cvt::InstX8632Cvt(Cfg *Func, Variable *Dest, Operand *Source)
: InstX8632(Func, InstX8632::Cvt, 1, Dest) {
addSource(Source);
@@ -284,9 +310,14 @@
InstX8632Xadd::InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source,
bool Locked)
- : InstX8632(Func, InstX8632::Xadd, 2, llvm::dyn_cast<Variable>(Dest)),
- Locked(Locked) {
- HasSideEffects = Locked;
+ : InstX8632Lockable(Func, InstX8632::Xadd, 2,
+ llvm::dyn_cast<Variable>(Dest), Locked) {
+ addSource(Dest);
+ addSource(Source);
+}
+
+InstX8632Xchg::InstX8632Xchg(Cfg *Func, Operand *Dest, Variable *Source)
+ : InstX8632(Func, InstX8632::Xchg, 2, llvm::dyn_cast<Variable>(Dest)) {
addSource(Dest);
addSource(Source);
}
@@ -398,6 +429,7 @@
Str << "\n";
}
+template <> const char *InstX8632Neg::Opcode = "neg";
template <> const char *InstX8632Add::Opcode = "add";
template <> const char *InstX8632Addps::Opcode = "addps";
template <> const char *InstX8632Adc::Opcode = "adc";
@@ -554,6 +586,48 @@
dumpSources(Func);
}
+void InstX8632Cmpxchg::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 3);
+ if (Locked) {
+ Str << "\tlock";
+ }
+ Str << "\tcmpxchg\t";
+ getSrc(0)->emit(Func);
+ Str << ", ";
+ getSrc(2)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Cmpxchg::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ if (Locked) {
+ Str << "lock ";
+ }
+ Str << "cmpxchg." << getSrc(0)->getType() << " ";
+ dumpSources(Func);
+}
+
+void InstX8632Cmpxchg8b::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 5);
+ if (Locked) {
+ Str << "\tlock";
+ }
+ Str << "\tcmpxchg8b\t";
+ getSrc(0)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Cmpxchg8b::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ if (Locked) {
+ Str << "lock ";
+ }
+ Str << "cmpxchg8b ";
+ dumpSources(Func);
+}
+
void InstX8632Cvt::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 1);
@@ -955,10 +1029,9 @@
void InstX8632Xadd::emit(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrEmit();
if (Locked) {
- Str << "\tlock xadd ";
- } else {
- Str << "\txadd\t";
+ Str << "\tlock";
}
+ Str << "\txadd\t";
getSrc(0)->emit(Func);
Str << ", ";
getSrc(1)->emit(Func);
@@ -975,6 +1048,22 @@
dumpSources(Func);
}
+void InstX8632Xchg::emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ Str << "\txchg\t";
+ getSrc(0)->emit(Func);
+ Str << ", ";
+ getSrc(1)->emit(Func);
+ Str << "\n";
+}
+
+void InstX8632Xchg::dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ Type Ty = getSrc(0)->getType();
+ Str << "xchg." << Ty << " ";
+ dumpSources(Func);
+}
+
void OperandX8632::dump(const Cfg *Func) const {
Ostream &Str = Func->getContext()->getStrDump();
Str << "<OperandX8632>";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index baf072a..25beb6d 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -54,9 +54,8 @@
public:
enum SegmentRegisters {
DefaultSegment = -1,
-#define X(val, name) \
- val,
- SEG_REGX8632_TABLE
+#define X(val, name) val,
+ SEG_REGX8632_TABLE
#undef X
SegReg_NUM
};
@@ -142,6 +141,8 @@
Br,
Call,
Cdq,
+ Cmpxchg,
+ Cmpxchg8b,
Cvt,
Div,
Divps,
@@ -162,6 +163,7 @@
Mul,
Mulps,
Mulss,
+ Neg,
Or,
Pop,
Push,
@@ -183,6 +185,7 @@
Ucomiss,
UD2,
Xadd,
+ Xchg,
Xor
};
static const char *getWidthString(Type Ty);
@@ -328,6 +331,41 @@
virtual ~InstX8632Call() {}
};
+template <InstX8632::InstKindX8632 K>
+class InstX8632Unaryop : public InstX8632 {
+public:
+ // Create a unary-op instruction like neg.
+ // The source and dest are the same variable.
+ static InstX8632Unaryop *create(Cfg *Func, Operand *SrcDest) {
+ return new (Func->allocate<InstX8632Unaryop>())
+ InstX8632Unaryop(Func, SrcDest);
+ }
+ virtual void emit(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrEmit();
+ assert(getSrcSize() == 1);
+ Str << "\t" << Opcode << "\t";
+ getSrc(0)->emit(Func);
+ Str << "\n";
+ }
+ virtual void dump(const Cfg *Func) const {
+ Ostream &Str = Func->getContext()->getStrDump();
+ dumpDest(Func);
+ Str << " = " << Opcode << "." << getDest()->getType() << " ";
+ dumpSources(Func);
+ }
+ static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+ InstX8632Unaryop(Cfg *Func, Operand *SrcDest)
+ : InstX8632(Func, K, 1, llvm::dyn_cast<Variable>(SrcDest)) {
+ addSource(SrcDest);
+ }
+ InstX8632Unaryop(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
+ InstX8632Unaryop &operator=(const InstX8632Unaryop &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Unaryop() {}
+ static const char *Opcode;
+};
+
// See the definition of emitTwoAddress() for a description of
// ShiftHack.
void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
@@ -400,6 +438,7 @@
static const char *Opcode;
};
+typedef InstX8632Unaryop<InstX8632::Neg> InstX8632Neg;
typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
typedef InstX8632Binop<InstX8632::Adc> InstX8632Adc;
@@ -423,6 +462,28 @@
typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
+// Base class for a lockable x86-32 instruction (emits a locked prefix).
+class InstX8632Lockable : public InstX8632 {
+public:
+ virtual void emit(const Cfg *Func) const = 0;
+ virtual void dump(const Cfg *Func) const;
+
+protected:
+ bool Locked;
+
+ InstX8632Lockable(Cfg *Func, InstKindX8632 Kind, SizeT Maxsrcs,
+ Variable *Dest, bool Locked)
+ : InstX8632(Func, Kind, Maxsrcs, Dest), Locked(Locked) {
+ // Assume that such instructions are used for Atomics and be careful
+ // with optimizations.
+ HasSideEffects = Locked;
+ }
+
+private:
+ InstX8632Lockable(const InstX8632Lockable &) LLVM_DELETED_FUNCTION;
+ InstX8632Lockable &operator=(const InstX8632Lockable &) LLVM_DELETED_FUNCTION;
+};
+
// Mul instruction - unsigned multiply.
class InstX8632Mul : public InstX8632 {
public:
@@ -502,6 +563,57 @@
virtual ~InstX8632Cdq() {}
};
+// Cmpxchg instruction - cmpxchg <dest>, <desired> compares <dest> with eax.
+// If they are equal, ZF is set and <desired> is stored in <dest>.
+// If not, ZF is cleared and <dest> is copied to eax (or a subregister).
+// <dest> can be a register or memory, while <desired> must be a register.
+// It is the user's responsibility to mark eax with a FakeDef.
+class InstX8632Cmpxchg : public InstX8632Lockable {
+public:
+ static InstX8632Cmpxchg *create(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+ Variable *Desired, bool Locked) {
+ return new (Func->allocate<InstX8632Cmpxchg>())
+ InstX8632Cmpxchg(Func, DestOrAddr, Eax, Desired, Locked);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpxchg); }
+
+private:
+ InstX8632Cmpxchg(Cfg *Func, Operand *DestOrAddr, Variable *Eax,
+ Variable *Desired, bool Locked);
+ InstX8632Cmpxchg(const InstX8632Cmpxchg &) LLVM_DELETED_FUNCTION;
+ InstX8632Cmpxchg &operator=(const InstX8632Cmpxchg &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Cmpxchg() {}
+};
+
+// Cmpxchg8b instruction - cmpxchg8b <m64> compares <m64> with edx:eax.
+// If they are equal, ZF is set and ecx:ebx is stored in <m64>.
+// If not, ZF is cleared and <m64> is copied to edx:eax.
+// The caller is responsible for inserting FakeDefs to mark edx
+// and eax as modified.
+// <m64> must be a memory operand.
+class InstX8632Cmpxchg8b : public InstX8632Lockable {
+public:
+ static InstX8632Cmpxchg8b *create(Cfg *Func, OperandX8632 *Dest,
+ Variable *Edx, Variable *Eax, Variable *Ecx,
+ Variable *Ebx, bool Locked) {
+ return new (Func->allocate<InstX8632Cmpxchg8b>())
+ InstX8632Cmpxchg8b(Func, Dest, Edx, Eax, Ecx, Ebx, Locked);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Cmpxchg8b); }
+
+private:
+ InstX8632Cmpxchg8b(Cfg *Func, OperandX8632 *Dest, Variable *Edx,
+ Variable *Eax, Variable *Ecx, Variable *Ebx, bool Locked);
+ InstX8632Cmpxchg8b(const InstX8632Cmpxchg8b &) LLVM_DELETED_FUNCTION;
+ InstX8632Cmpxchg8b &
+ operator=(const InstX8632Cmpxchg8b &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Cmpxchg8b() {}
+};
+
// Cvt instruction - wrapper for cvtsX2sY where X and Y are in {s,d,i}
// as appropriate. s=float, d=double, i=int. X and Y are determined
// from dest/src types. Sign and zero extension on the integer
@@ -861,7 +973,7 @@
//
// Both the dest and source are updated. The caller should then insert a
// FakeDef to reflect the second update.
-class InstX8632Xadd : public InstX8632 {
+class InstX8632Xadd : public InstX8632Lockable {
public:
static InstX8632Xadd *create(Cfg *Func, Operand *Dest, Variable *Source,
bool Locked) {
@@ -873,14 +985,35 @@
static bool classof(const Inst *Inst) { return isClassof(Inst, Xadd); }
private:
- bool Locked;
-
InstX8632Xadd(Cfg *Func, Operand *Dest, Variable *Source, bool Locked);
InstX8632Xadd(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
InstX8632Xadd &operator=(const InstX8632Xadd &) LLVM_DELETED_FUNCTION;
virtual ~InstX8632Xadd() {}
};
+// Exchange instruction. Exchanges the first operand (destination
+// operand) with the second operand (source operand). At least one of
+// the operands must be a register (and the other can be reg or mem).
+// Both the Dest and Source are updated. If there is a memory operand,
+// then the instruction is automatically "locked" without the need for
+// a lock prefix.
+class InstX8632Xchg : public InstX8632 {
+public:
+ static InstX8632Xchg *create(Cfg *Func, Operand *Dest, Variable *Source) {
+ return new (Func->allocate<InstX8632Xchg>())
+ InstX8632Xchg(Func, Dest, Source);
+ }
+ virtual void emit(const Cfg *Func) const;
+ virtual void dump(const Cfg *Func) const;
+ static bool classof(const Inst *Inst) { return isClassof(Inst, Xchg); }
+
+private:
+ InstX8632Xchg(Cfg *Func, Operand *Dest, Variable *Source);
+ InstX8632Xchg(const InstX8632Xchg &) LLVM_DELETED_FUNCTION;
+ InstX8632Xchg &operator=(const InstX8632Xchg &) LLVM_DELETED_FUNCTION;
+ virtual ~InstX8632Xchg() {}
+};
+
} // end of namespace Ice
#endif // SUBZERO_SRC_ICEINSTX8632_H
diff --git a/src/IceIntrinsics.cpp b/src/IceIntrinsics.cpp
index 02562b5..b83513f 100644
--- a/src/IceIntrinsics.cpp
+++ b/src/IceIntrinsics.cpp
@@ -46,7 +46,7 @@
"nacl.atomic.fence" },
{ { { Intrinsics::AtomicFenceAll, true }, { IceType_void }, 1 },
"nacl.atomic.fence.all" },
- { { { Intrinsics::AtomicIsLockFree, true },
+ { { { Intrinsics::AtomicIsLockFree, false },
{ IceType_i1, IceType_i32, IceType_i32 }, 3 },
"nacl.atomic.is.lock.free" },
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index f1b8c25..bf11573 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -1968,7 +1968,7 @@
void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
switch (Instr->getIntrinsicInfo().ID) {
- case Intrinsics::AtomicCmpxchg:
+ case Intrinsics::AtomicCmpxchg: {
if (!Intrinsics::VerifyMemoryOrder(
llvm::cast<ConstantInteger>(Instr->getArg(3))->getValue())) {
Func->setError("Unexpected memory ordering (success) for AtomicCmpxchg");
@@ -1979,9 +1979,18 @@
Func->setError("Unexpected memory ordering (failure) for AtomicCmpxchg");
return;
}
- // TODO(jvoung): fill it in.
- Func->setError("Unhandled intrinsic");
+ Variable *DestPrev = Instr->getDest();
+ Operand *PtrToMem = Instr->getArg(0);
+ Operand *Expected = Instr->getArg(1);
+ Operand *Desired = Instr->getArg(2);
+ lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
+ // TODO(jvoung): If we peek ahead a few instructions and see how
+ // DestPrev is used (typically via another compare and branch),
+ // we may be able to optimize. If the result truly is used by a
+ // compare + branch, and the comparison is for equality, then we can
+ // optimize out the later compare, and fuse with the later branch.
return;
+ }
case Intrinsics::AtomicFence:
if (!Intrinsics::VerifyMemoryOrder(
llvm::cast<ConstantInteger>(Instr->getArg(0))->getValue())) {
@@ -2183,18 +2192,54 @@
return;
}
+void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
+ Operand *Expected, Operand *Desired) {
+ if (Expected->getType() == IceType_i64) {
+ // Reserve the pre-colored registers first, before adding any more
+ // infinite-weight variables from FormMemoryOperand's legalization.
+ Variable *T_edx = makeReg(IceType_i32, Reg_edx);
+ Variable *T_eax = makeReg(IceType_i32, Reg_eax);
+ Variable *T_ecx = makeReg(IceType_i32, Reg_ecx);
+ Variable *T_ebx = makeReg(IceType_i32, Reg_ebx);
+ _mov(T_eax, loOperand(Expected));
+ _mov(T_edx, hiOperand(Expected));
+ _mov(T_ebx, loOperand(Desired));
+ _mov(T_ecx, hiOperand(Desired));
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Expected->getType());
+ const bool Locked = true;
+ _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+ Variable *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
+ Variable *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
+ _mov(DestLo, T_eax);
+ _mov(DestHi, T_edx);
+ return;
+ }
+ Variable *T_eax = makeReg(Expected->getType(), Reg_eax);
+ _mov(T_eax, Expected);
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Expected->getType());
+ Variable *DesiredReg = legalizeToVar(Desired);
+ const bool Locked = true;
+ _cmpxchg(Addr, T_eax, DesiredReg, Locked);
+ _mov(DestPrev, T_eax);
+}
+
void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
Operand *Ptr, Operand *Val) {
+ bool NeedsCmpxchg = false;
+ LowerBinOp Op_Lo = NULL;
+ LowerBinOp Op_Hi = NULL;
switch (Operation) {
default:
Func->setError("Unknown AtomicRMW operation");
return;
case Intrinsics::AtomicAdd: {
if (Dest->getType() == IceType_i64) {
- // Do a nasty cmpxchg8b loop. Factor this into a function.
- // TODO(jvoung): fill it in.
- Func->setError("Unhandled AtomicRMW operation");
- return;
+ // All the fall-through paths must set this to true, but use this
+ // for asserting.
+ NeedsCmpxchg = true;
+ Op_Lo = &TargetX8632::_add;
+ Op_Hi = &TargetX8632::_adc;
+ break;
}
OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
const bool Locked = true;
@@ -2206,26 +2251,160 @@
}
case Intrinsics::AtomicSub: {
if (Dest->getType() == IceType_i64) {
- // Do a nasty cmpxchg8b loop.
- // TODO(jvoung): fill it in.
- Func->setError("Unhandled AtomicRMW operation");
- return;
+ NeedsCmpxchg = true;
+ Op_Lo = &TargetX8632::_sub;
+ Op_Hi = &TargetX8632::_sbb;
+ break;
}
- // Generate a memory operand from Ptr.
- // neg...
- // Then do the same as AtomicAdd.
- // TODO(jvoung): fill it in.
- Func->setError("Unhandled AtomicRMW operation");
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
+ const bool Locked = true;
+ Variable *T = NULL;
+ _mov(T, Val);
+ _neg(T);
+ _xadd(Addr, T, Locked);
+ _mov(Dest, T);
return;
}
case Intrinsics::AtomicOr:
+ // TODO(jvoung): If Dest is null or dead, then some of these
+ // operations do not need an "exchange", but just a locked op.
+ // That appears to be "worth" it for sub, or, and, and xor.
+ // xadd is probably fine vs lock add for add, and xchg is fine
+ // vs an atomic store.
+ NeedsCmpxchg = true;
+ Op_Lo = &TargetX8632::_or;
+ Op_Hi = &TargetX8632::_or;
+ break;
case Intrinsics::AtomicAnd:
+ NeedsCmpxchg = true;
+ Op_Lo = &TargetX8632::_and;
+ Op_Hi = &TargetX8632::_and;
+ break;
case Intrinsics::AtomicXor:
+ NeedsCmpxchg = true;
+ Op_Lo = &TargetX8632::_xor;
+ Op_Hi = &TargetX8632::_xor;
+ break;
case Intrinsics::AtomicExchange:
- // TODO(jvoung): fill it in.
- Func->setError("Unhandled AtomicRMW operation");
+ if (Dest->getType() == IceType_i64) {
+ NeedsCmpxchg = true;
+ // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
+ // just need to be moved to the ecx and ebx registers.
+ Op_Lo = NULL;
+ Op_Hi = NULL;
+ break;
+ }
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Dest->getType());
+ Variable *T = NULL;
+ _mov(T, Val);
+ _xchg(Addr, T);
+ _mov(Dest, T);
return;
}
+ // Otherwise, we need a cmpxchg loop.
+ assert(NeedsCmpxchg);
+ expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
+}
+
+void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
+ Variable *Dest, Operand *Ptr,
+ Operand *Val) {
+ // Expand a more complex RMW operation as a cmpxchg loop:
+ // For 64-bit:
+ // mov eax, [ptr]
+ // mov edx, [ptr + 4]
+ // .LABEL:
+ // mov ebx, eax
+ // <Op_Lo> ebx, <desired_adj_lo>
+ // mov ecx, edx
+ // <Op_Hi> ecx, <desired_adj_hi>
+ // lock cmpxchg8b [ptr]
+ // jne .LABEL
+ // mov <dest_lo>, eax
+ // mov <dest_hi>, edx
+ //
+ // For 32-bit:
+ // mov eax, [ptr]
+ // .LABEL:
+ // mov <reg>, eax
+ // op <reg>, [desired_adj]
+ // lock cmpxchg [ptr], <reg>
+ // jne .LABEL
+ // mov <dest>, eax
+ //
+ // If Op_{Lo,Hi} are NULL, then just copy the value.
+ Val = legalize(Val);
+ Type Ty = Val->getType();
+ if (Ty == IceType_i64) {
+ Variable *T_edx = makeReg(IceType_i32, Reg_edx);
+ Variable *T_eax = makeReg(IceType_i32, Reg_eax);
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Ty);
+ _mov(T_eax, loOperand(Addr));
+ _mov(T_edx, hiOperand(Addr));
+ Variable *T_ecx = makeReg(IceType_i32, Reg_ecx);
+ Variable *T_ebx = makeReg(IceType_i32, Reg_ebx);
+ InstX8632Label *Label = InstX8632Label::create(Func, this);
+ const bool IsXchg8b = Op_Lo == NULL && Op_Hi == NULL;
+ if (!IsXchg8b) {
+ Context.insert(Label);
+ _mov(T_ebx, T_eax);
+ (this->*Op_Lo)(T_ebx, loOperand(Val));
+ _mov(T_ecx, T_edx);
+ (this->*Op_Hi)(T_ecx, hiOperand(Val));
+ } else {
+ // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
+ // It just needs the Val loaded into ebx and ecx.
+ // That can also be done before the loop.
+ _mov(T_ebx, loOperand(Val));
+ _mov(T_ecx, hiOperand(Val));
+ Context.insert(Label);
+ }
+ const bool Locked = true;
+ _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
+ _br(InstX8632Br::Br_ne, Label);
+ if (!IsXchg8b) {
+ // If Val is a variable, model the extended live range of Val through
+ // the end of the loop, since it will be re-used by the loop.
+ if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+ Variable *ValLo = llvm::cast<Variable>(loOperand(ValVar));
+ Variable *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
+ Context.insert(InstFakeUse::create(Func, ValLo));
+ Context.insert(InstFakeUse::create(Func, ValHi));
+ }
+ } else {
+ // For xchg, the loop is slightly smaller and ebx/ecx are used.
+ Context.insert(InstFakeUse::create(Func, T_ebx));
+ Context.insert(InstFakeUse::create(Func, T_ecx));
+ }
+ // The address base is also reused in the loop.
+ Context.insert(InstFakeUse::create(Func, Addr->getBase()));
+ Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+ Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+ _mov(DestLo, T_eax);
+ _mov(DestHi, T_edx);
+ return;
+ }
+ OperandX8632Mem *Addr = FormMemoryOperand(Ptr, Ty);
+ Variable *T_eax = makeReg(Ty, Reg_eax);
+ _mov(T_eax, Addr);
+ InstX8632Label *Label = InstX8632Label::create(Func, this);
+ Context.insert(Label);
+ // We want to pick a different register for T than Eax, so don't use
+ // _mov(T == NULL, T_eax).
+ Variable *T = makeReg(Ty);
+ _mov(T, T_eax);
+ (this->*Op_Lo)(T, Val);
+ const bool Locked = true;
+ _cmpxchg(Addr, T_eax, T, Locked);
+ _br(InstX8632Br::Br_ne, Label);
+ // If Val is a variable, model the extended live range of Val through
+ // the end of the loop, since it will be re-used by the loop.
+ if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
+ Context.insert(InstFakeUse::create(Func, ValVar));
+ }
+ // The address base is also reused in the loop.
+ Context.insert(InstFakeUse::create(Func, Addr->getBase()));
+ _mov(Dest, T_eax);
}
namespace {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 001f4e6..4953ffc 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -95,9 +95,15 @@
virtual void doAddressOptLoad();
virtual void doAddressOptStore();
+ void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
+ Operand *Desired);
void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
Operand *Val);
+ typedef void (TargetX8632::*LowerBinOp)(Variable *, Operand *);
+ void expandAtomicRMWAsCmpxchg(LowerBinOp op_lo, LowerBinOp op_hi,
+ Variable *Dest, Operand *Ptr, Operand *Val);
+
// Operand legalization helpers. To deal with address mode
// constraints, the helpers will create a new Operand and emit
// instructions that guarantee that the Operand kind is one of those
@@ -177,6 +183,22 @@
void _cmp(Operand *Src0, Operand *Src1) {
Context.insert(InstX8632Icmp::create(Func, Src0, Src1));
}
+ void _cmpxchg(Operand *DestOrAddr, Variable *Eax, Variable *Desired,
+ bool Locked) {
+ Context.insert(
+ InstX8632Cmpxchg::create(Func, DestOrAddr, Eax, Desired, Locked));
+ // Mark eax as possibly modified by cmpxchg.
+ Context.insert(
+ InstFakeDef::create(Func, Eax, llvm::dyn_cast<Variable>(DestOrAddr)));
+ }
+ void _cmpxchg8b(OperandX8632 *Addr, Variable *Edx, Variable *Eax,
+ Variable *Ecx, Variable *Ebx, bool Locked) {
+ Context.insert(
+ InstX8632Cmpxchg8b::create(Func, Addr, Edx, Eax, Ecx, Ebx, Locked));
+ // Mark edx and eax as possibly modified by cmpxchg8b.
+ Context.insert(InstFakeDef::create(Func, Edx));
+ Context.insert(InstFakeDef::create(Func, Eax));
+ }
void _cvt(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Cvt::create(Func, Dest, Src0));
}
@@ -232,6 +254,9 @@
void _mulss(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Mulss::create(Func, Dest, Src0));
}
+ void _neg(Variable *SrcDest) {
+ Context.insert(InstX8632Neg::create(Func, SrcDest));
+ }
void _or(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Or::create(Func, Dest, Src0));
}
@@ -294,7 +319,14 @@
Context.insert(InstX8632Xadd::create(Func, Dest, Src, Locked));
// The xadd exchanges Dest and Src (modifying Src).
// Model that update with a FakeDef.
- Context.insert(InstFakeDef::create(Func, Src));
+ Context.insert(
+ InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
+ }
+ void _xchg(Operand *Dest, Variable *Src) {
+ Context.insert(InstX8632Xchg::create(Func, Dest, Src));
+ // The xchg modifies Dest and Src -- model that update with a FakeDef.
+ Context.insert(
+ InstFakeDef::create(Func, Src, llvm::dyn_cast<Variable>(Dest)));
}
void _xor(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Xor::create(Func, Dest, Src0));
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
index 8dfcc61..9885b88 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -2,6 +2,7 @@
; size allowed.
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
@@ -28,6 +29,11 @@
declare void @llvm.nacl.atomic.fence.all()
declare i1 @llvm.nacl.atomic.is.lock.free(i32, i8*)
+; NOTE: The LLC equivalents for 16-bit atomic operations are expanded
+; as 32-bit operations. For Subzero, assume that real 16-bit operations
+; will be usable (the validator will be fixed):
+; https://code.google.com/p/nativeclient/issues/detail?id=2981
+
;;; Load
; x86 guarantees load/store to be atomic if naturally aligned.
@@ -107,7 +113,6 @@
; CHECK: movq x{{.*}}, qword
; CHECK: movq qword {{.*}}, x{{.*}}
-
;;; Store
define void @test_atomic_store_8(i32 %iptr, i32 %v) {
@@ -169,6 +174,8 @@
;;; RMW
+;; add
+
define i32 @test_atomic_rmw_add_8(i32 %iptr, i32 %v) {
entry:
%trunc = trunc i32 %v to i8
@@ -180,7 +187,7 @@
}
; CHECK-LABEL: test_atomic_rmw_add_8
; CHECK: lock xadd byte {{.*}}, [[REG:.*]]
-; CHECK: mov {{.*}}, {{.*}}[[REG]]
+; CHECK: mov {{.*}}, [[REG]]
define i32 @test_atomic_rmw_add_16(i32 %iptr, i32 %v) {
entry:
@@ -192,7 +199,7 @@
}
; CHECK-LABEL: test_atomic_rmw_add_16
; CHECK: lock xadd word {{.*}}, [[REG:.*]]
-; CHECK: mov {{.*}}, {{.*}}[[REG]]
+; CHECK: mov {{.*}}, [[REG]]
define i32 @test_atomic_rmw_add_32(i32 %iptr, i32 %v) {
entry:
@@ -202,16 +209,61 @@
}
; CHECK-LABEL: test_atomic_rmw_add_32
; CHECK: lock xadd dword {{.*}}, [[REG:.*]]
-; CHECK: mov {{.*}}, {{.*}}[[REG]]
+; CHECK: mov {{.*}}, [[REG]]
-;define i64 @test_atomic_rmw_add_64(i32 %iptr, i64 %v) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i64*
-; %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
-; ret i64 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_add_64
-; CHECKLATER: uh need a... cmpxchg8b loop.
+define i64 @test_atomic_rmw_add_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
+ ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_add_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; RHS of add cannot be any of the e[abcd]x regs because they are
+; clobbered in the loop, and the RHS needs to remain live.
+; CHECK: add ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: adc ecx, {{.*e.[^x]}}
+; Ptr cannot be eax, ebx, ecx, or edx (used up for the expected and desired).
+; It can be esi, edi, or ebp though, for example (so we need to be careful
+; about rejecting eb* and ed*.)
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+; Test with some more register pressure. When we have an alloca, ebp is
+; used to manage the stack frame, so it cannot be used as a register either.
+declare void @use_ptr(i32 %iptr)
+
+define i64 @test_atomic_rmw_add_64_alloca(i32 %iptr, i64 %v) {
+entry:
+ %alloca_ptr = alloca i8, i32 16, align 16
+ %ptr = inttoptr i32 %iptr to i64*
+ %old = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v, i32 6)
+ store i8 0, i8* %alloca_ptr, align 1
+ store i8 1, i8* %alloca_ptr, align 1
+ store i8 2, i8* %alloca_ptr, align 1
+ store i8 3, i8* %alloca_ptr, align 1
+ %__5 = ptrtoint i8* %alloca_ptr to i32
+ call void @use_ptr(i32 %__5)
+ ret i64 %old
+}
+; CHECK-LABEL: test_atomic_rmw_add_64_alloca
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; Ptr cannot be eax, ebx, ecx, or edx (used up for the expected and desired).
+; It also cannot be ebp since we use that for alloca. Also make sure it's
+; not esp, since that's the stack pointer and mucking with it will break
+; the later use_ptr function call.
+; That pretty much leaves esi, or edi as the only viable registers.
+; CHECK: lock cmpxchg8b qword ptr [e{{[ds]}}i]
+; CHECK: call use_ptr
define i32 @test_atomic_rmw_add_32_ignored(i32 %iptr, i32 %v) {
entry:
@@ -219,129 +271,562 @@
%ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %v, i32 6)
ret i32 %v
}
+; Technically this could use "lock add" instead of "lock xadd", if liveness
+; tells us that the destination variable is dead.
; CHECK-LABEL: test_atomic_rmw_add_32_ignored
; CHECK: lock xadd dword {{.*}}, [[REG:.*]]
-;define i32 @test_atomic_rmw_sub_32(i32 %iptr, i32 %v) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i32*
-; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %v, i32 6)
-; ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_sub_32
-; CHECKLATER: neg
-; CHECKLATER: lock
-; CHECKLATER: xadd
+; Atomic RMW 64 needs to be expanded into its own loop.
+; Make sure that works w/ non-trivial function bodies.
+define i64 @test_atomic_rmw_add_64_loop(i32 %iptr, i64 %v) {
+entry:
+ %x = icmp ult i64 %v, 100
+ br i1 %x, label %err, label %loop
-;define i32 @test_atomic_rmw_or_32(i32 %iptr, i32 %v) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i32*
-; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %v, i32 6)
-; ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_or_32
-; Need a cmpxchg loop.
+loop:
+ %v_next = phi i64 [ %v, %entry ], [ %next, %loop ]
+ %ptr = inttoptr i32 %iptr to i64*
+ %next = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %v_next, i32 6)
+ %success = icmp eq i64 %next, 100
+ br i1 %success, label %done, label %loop
-;define i32 @test_atomic_rmw_and_32(i32 %iptr, i32 %v) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i32*
-; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %v, i32 6)
-; ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_and_32
-; Also a cmpxchg loop.
+done:
+ ret i64 %next
-;define i32 @test_atomic_rmw_xor_32(i32 %iptr, i32 %v) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i32*
-; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %v, i32 6)
-; ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_xor_32
-; Also a cmpxchg loop.
+err:
+ ret i64 0
+}
+; CHECK-LABEL: test_atomic_rmw_add_64_loop
+; CHECK: push ebx
+; CHECK-LABEL: .Ltest_atomic_rmw_add_64_loop{{.*}}loop
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: add ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: adc ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+; CHECK-LABEL: .Ltest_atomic_rmw_add_64_loop{{.*}}done
-;define i32 @test_atomic_rmw_xchg_32(i32 %iptr, i32 %v) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i32*
-; %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %v, i32 6)
-; ret i32 %a
-;}
-; CHECKLATER-LABEL: test_atomic_rmw_xchg_32
+;; sub
+
+define i32 @test_atomic_rmw_sub_8(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 2, i8* %ptr, i8 %trunc, i32 6)
+ %a_ext = zext i8 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_sub_8
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd byte {{.*}}, [[REG]]
+; CHECK: mov {{.*}}, [[REG]]
+
+define i32 @test_atomic_rmw_sub_16(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 2, i16* %ptr, i16 %trunc, i32 6)
+ %a_ext = zext i16 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_sub_16
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd word {{.*}}, [[REG]]
+; CHECK: mov {{.*}}, [[REG]]
+
+define i32 @test_atomic_rmw_sub_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_sub_32
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd dword {{.*}}, [[REG]]
+; CHECK: mov {{.*}}, [[REG]]
+
+define i64 @test_atomic_rmw_sub_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 2, i64* %ptr, i64 %v, i32 6)
+ ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_sub_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: sub ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: sbb ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+
+define i32 @test_atomic_rmw_sub_32_ignored(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %v, i32 6)
+ ret i32 %v
+}
+; Could use "lock sub" instead of "neg; lock xadd"
+; CHECK-LABEL: test_atomic_rmw_sub_32_ignored
+; CHECK: neg [[REG:.*]]
+; CHECK: lock xadd dword {{.*}}, [[REG]]
+
+;; or
+
+define i32 @test_atomic_rmw_or_8(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 3, i8* %ptr, i8 %trunc, i32 6)
+ %a_ext = zext i8 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_or_8
+; CHECK: mov al, byte ptr
+; CHECK: .L[[LABEL:.*]]:
+; Dest cannot be eax here, because eax is used for the old value. Also want
+; to make sure that cmpxchg's source is the same register.
+; CHECK: or [[REG:[^a].]]
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_or_16(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 3, i16* %ptr, i16 %trunc, i32 6)
+ %a_ext = zext i16 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_or_16
+; CHECK: mov ax, word ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: or [[REG:[^a].]]
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_or_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_or_32
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: or [[REG:e[^a].]]
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i64 @test_atomic_rmw_or_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 3, i64* %ptr, i64 %v, i32 6)
+ ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_or_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: or ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: or ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_or_32_ignored(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %v, i32 6)
+ ret i32 %v
+}
+; CHECK-LABEL: test_atomic_rmw_or_32_ignored
+; Could just "lock or", if we inspect the liveness information first.
+; Would also need a way to introduce "lock"'edness to binary
+; operators without introducing overhead on the more common binary ops.
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: or [[REG:e[^a].]]
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+;; and
+
+define i32 @test_atomic_rmw_and_8(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 4, i8* %ptr, i8 %trunc, i32 6)
+ %a_ext = zext i8 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_and_8
+; CHECK: mov al, byte ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and [[REG:[^a].]]
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_and_16(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 4, i16* %ptr, i16 %trunc, i32 6)
+ %a_ext = zext i16 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_and_16
+; CHECK: mov ax, word ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_and_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_and_32
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+define i64 @test_atomic_rmw_and_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 4, i64* %ptr, i64 %v, i32 6)
+ ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_and_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: and ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: and ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_and_32_ignored(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %v, i32 6)
+ ret i32 %v
+}
+; CHECK-LABEL: test_atomic_rmw_and_32_ignored
+; Could just "lock and"
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: and
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+;; xor
+
+define i32 @test_atomic_rmw_xor_8(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 5, i8* %ptr, i8 %trunc, i32 6)
+ %a_ext = zext i8 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xor_8
+; CHECK: mov al, byte ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor [[REG:[^a].]]
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], [[REG]]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_xor_16(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 5, i16* %ptr, i16 %trunc, i32 6)
+ %a_ext = zext i16 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xor_16
+; CHECK: mov ax, word ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+
+define i32 @test_atomic_rmw_xor_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xor_32
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+define i64 @test_atomic_rmw_xor_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 5, i64* %ptr, i64 %v, i32 6)
+ ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xor_64
+; CHECK: push ebx
+; CHECK: mov eax, dword ptr [{{.*}}]
+; CHECK: mov edx, dword ptr [{{.*}}+4]
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: mov ebx, eax
+; CHECK: xor ebx, {{.*e.[^x]}}
+; CHECK: mov ecx, edx
+; CHECK: xor ecx, {{.*e.[^x]}}
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_xor_32_ignored(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %v, i32 6)
+ ret i32 %v
+}
+; CHECK-LABEL: test_atomic_rmw_xor_32_ignored
+; CHECK: mov eax, dword ptr
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: xor
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+; CHECK: jne .L[[LABEL]]
+
+;; exchange
+
+define i32 @test_atomic_rmw_xchg_8(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ %a = call i8 @llvm.nacl.atomic.rmw.i8(i32 6, i8* %ptr, i8 %trunc, i32 6)
+ %a_ext = zext i8 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_8
+; CHECK: xchg byte ptr {{.*}}, [[REG:.*]]
+
+define i32 @test_atomic_rmw_xchg_16(i32 %iptr, i32 %v) {
+entry:
+ %trunc = trunc i32 %v to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %a = call i16 @llvm.nacl.atomic.rmw.i16(i32 6, i16* %ptr, i16 %trunc, i32 6)
+ %a_ext = zext i16 %a to i32
+ ret i32 %a_ext
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_16
+; CHECK: xchg word ptr {{.*}}, [[REG:.*]]
+
+define i32 @test_atomic_rmw_xchg_32(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %a = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %v, i32 6)
+ ret i32 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_32
+; CHECK: xchg dword ptr {{.*}}, [[REG:.*]]
+
+define i64 @test_atomic_rmw_xchg_64(i32 %iptr, i64 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %a = call i64 @llvm.nacl.atomic.rmw.i64(i32 6, i64* %ptr, i64 %v, i32 6)
+ ret i64 %a
+}
+; CHECK-LABEL: test_atomic_rmw_xchg_64
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: lock cmpxchg8b qword ptr [{{e.[^x]}}]
+; CHECK: jne .L[[LABEL]]
+
+define i32 @test_atomic_rmw_xchg_32_ignored(i32 %iptr, i32 %v) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %v, i32 6)
+ ret i32 %v
+}
+; In this case, ignoring the return value doesn't help: the exchange must
+; still store the new value atomically, so the xchg cannot be elided.
+; CHECK-LABEL: test_atomic_rmw_xchg_32_ignored
+; CHECK: xchg dword ptr {{.*}}, [[REG:.*]]
;;;; Cmpxchg
-;define i32 @test_atomic_cmpxchg_8(i32 %iptr, i32 %expected, i32 %desired) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i8*
-; %trunc_exp = trunc i32 %expected to i8
-; %trunc_des = trunc i32 %desired to i8
-; %old = call i8 @llvm.nacl.atomic.cmpxchg.i8(i8* %ptr, i8 %trunc_exp,
-; i8 %trunc_des, i32 6, i32 6)
-; %old_ext = zext i8 %old to i32
-; ret i32 %old_ext
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_8
-; CHECKLATER: lock cmpxchg byte
+define i32 @test_atomic_cmpxchg_8(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+ %trunc_exp = trunc i32 %expected to i8
+ %trunc_des = trunc i32 %desired to i8
+ %ptr = inttoptr i32 %iptr to i8*
+ %old = call i8 @llvm.nacl.atomic.cmpxchg.i8(i8* %ptr, i8 %trunc_exp,
+ i8 %trunc_des, i32 6, i32 6)
+ %old_ext = zext i8 %old to i32
+ ret i32 %old_ext
+}
+; CHECK-LABEL: test_atomic_cmpxchg_8
+; CHECK: mov al, {{.*}}
+; Need to check that eax isn't used as the address register or as the desired
+; value register, since it is already used as the *expected* register
+; (see the sketch of the cmpxchg semantics below).
+; CHECK: lock cmpxchg byte ptr [e{{[^a].}}], {{[^a]}}
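+;
+; For reference (and the reason for the constraint above), lock cmpxchg
+; behaves roughly as:
+;   if (accumulator == [mem]) { ZF = 1; [mem] = src; }
+;   else                      { ZF = 0; accumulator = [mem]; }
+; where the accumulator is al/ax/eax depending on the operand width, so the
+; address and the desired (src) operand have to live in other registers.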
-;define i32 @test_atomic_cmpxchg_16(i32 %iptr, i32 %expected, i32 %desired) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i16*
-; %trunc_exp = trunc i32 %expected to i16
-; %trunc_des = trunc i32 %desired to i16
-; %old = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %trunc_exp,
-; i16 %trunc_des, i32 6, i32 6)
-; %old_ext = zext i16 %old to i32
-; ret i32 %old_ext
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_16
-; This one is a bit gross for NaCl right now.
-; https://code.google.com/p/nativeclient/issues/detail?id=2981
-; But we'll assume that NaCl will have it fixed...
-; CHECKLATER: lock cmpxchg word
+define i32 @test_atomic_cmpxchg_16(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+ %trunc_exp = trunc i32 %expected to i16
+ %trunc_des = trunc i32 %desired to i16
+ %ptr = inttoptr i32 %iptr to i16*
+ %old = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %trunc_exp,
+ i16 %trunc_des, i32 6, i32 6)
+ %old_ext = zext i16 %old to i32
+ ret i32 %old_ext
+}
+; CHECK-LABEL: test_atomic_cmpxchg_16
+; CHECK: mov ax, {{.*}}
+; CHECK: lock cmpxchg word ptr [e{{[^a].}}], {{[^a]}}
-;define i32 @test_atomic_cmpxchg_32(i32 %iptr, i32 %expected, i32 %desired) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i32*
-; %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
-; i32 %desired, i32 6, i32 6)
-; ret i32 %old
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_32
-; CHECKLATER: mov eax
-; CHECKLATER: mov ecx
-; CHECKLATER: lock cmpxchg dword
+define i32 @test_atomic_cmpxchg_32(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
+ i32 %desired, i32 6, i32 6)
+ ret i32 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_32
+; CHECK: mov eax, {{.*}}
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
-;define i64 @test_atomic_cmpxchg_64(i32 %iptr, i64 %expected, i64 %desired) {
-;entry:
-; %ptr = inttoptr i32 %iptr to i64*
-; %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
-; i64 %desired, i32 6, i32 6)
-; ret i64 %old
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_64
-; CHECKLATER: mov eax
-; CHECKLATER: mov edx
-; CHECKLATER: mov ebx
-; CHECKLATER: mov ecx
-; CHECKLATER: lock cmpxchg8b qword
+define i64 @test_atomic_cmpxchg_64(i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+ i64 %desired, i32 6, i32 6)
+ ret i64 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; edx and eax are already the return registers, so they don't actually
+; need to be reshuffled via movs. The next test stores the result
+; somewhere, so in that case they do need to be mov'ed.
-;define i32 @test_atomic_cmpxchg_32_loop(i32 %iptr,
-; i32 %expected, i32 %desired) {
-;entry:
-; br label %loop
-;
-;loop:
-; %cmp = phi i32 [ %expected, %entry], [%old, %loop]
-; %ptr = inttoptr i32 %iptr to i32*
-; %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %cmp,
-; i32 %desired, i32 6, i32 6)
-; %success = icmp eq i32 %cmp, %old
-; br i1 %success, label %done, label %loop
-;
-;done:
-; ret i32 %old
-;}
-; CHECKLATER-LABEL: test_atomic_cmpxchg_32_loop
+; Test a case where %old really does need to be copied out of edx:eax.
+define void @test_atomic_cmpxchg_64_store(i32 %ret_iptr, i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+ i64 %desired, i32 6, i32 6)
+ %__6 = inttoptr i32 %ret_iptr to i64*
+ store i64 %old, i64* %__6, align 1
+ ret void
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64_store
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+; CHECK: mov {{.*}}, edx
+; CHECK: mov {{.*}}, eax
+
+; Test with some more register pressure. When we have an alloca, ebp is
+; used to manage the stack frame, so it is not available to the register
+; allocator either.
+define i64 @test_atomic_cmpxchg_64_alloca(i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+ %alloca_ptr = alloca i8, i32 16, align 16
+ %ptr = inttoptr i32 %iptr to i64*
+ %old = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+ i64 %desired, i32 6, i32 6)
+ store i8 0, i8* %alloca_ptr, align 1
+ store i8 1, i8* %alloca_ptr, align 1
+ store i8 2, i8* %alloca_ptr, align 1
+ store i8 3, i8* %alloca_ptr, align 1
+ %__6 = ptrtoint i8* %alloca_ptr to i32
+ call void @use_ptr(i32 %__6)
+ ret i64 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64_alloca
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; Ptr cannot be eax, ebx, ecx, or edx (used up for the expected and desired).
+; It also cannot be ebp since we use that for alloca. Also make sure it's
+; not esp, since that's the stack pointer and mucking with it will break
+; the later use_ptr function call.
+; That pretty much leaves esi or edi as the only viable registers (see below).
+; CHECK: lock cmpxchg8b qword ptr [e{{[ds]}}i]
+; CHECK: call use_ptr
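+;
+; A rough tally of why esi/edi are what's left (assuming the usual cmpxchg8b
+; operands):
+;   edx:eax  expected value / returned old value
+;   ecx:ebx  desired value
+;   ebp      frame pointer (because of the alloca)
+;   esp      stack pointer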
+
+define i32 @test_atomic_cmpxchg_32_ignored(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i32*
+ %ignored = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected,
+ i32 %desired, i32 6, i32 6)
+ ret i32 0
+}
+; CHECK-LABEL: test_atomic_cmpxchg_32_ignored
+; CHECK: mov eax, {{.*}}
+; CHECK: lock cmpxchg dword ptr [e{{[^a].}}]
+
+define i64 @test_atomic_cmpxchg_64_ignored(i32 %iptr, i64 %expected, i64 %desired) {
+entry:
+ %ptr = inttoptr i32 %iptr to i64*
+ %ignored = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %expected,
+ i64 %desired, i32 6, i32 6)
+ ret i64 0
+}
+; CHECK-LABEL: test_atomic_cmpxchg_64_ignored
+; CHECK: push ebx
+; CHECK-DAG: mov edx
+; CHECK-DAG: mov eax
+; CHECK-DAG: mov ecx
+; CHECK-DAG: mov ebx
+; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
+
+define i32 @test_atomic_cmpxchg_32_loop(i32 %iptr, i32 %expected, i32 %desired) {
+entry:
+ br label %loop
+
+loop:
+ %cmp = phi i32 [ %expected, %entry ], [ %old, %loop ]
+ %ptr = inttoptr i32 %iptr to i32*
+ %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %cmp,
+ i32 %desired, i32 6, i32 6)
+ %success = icmp eq i32 %cmp, %old
+ br i1 %success, label %done, label %loop
+
+done:
+ ret i32 %old
+}
+; CHECK-LABEL: test_atomic_cmpxchg_32_loop
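+; The IR above roughly corresponds to a source-level retry loop like the
+; following sketch (illustrative only, not taken from the crosstest sources):
+;   cmp = expected;
+;   do {
+;     old = __sync_val_compare_and_swap(ptr, cmp, desired);
+;     success = (old == cmp);
+;     cmp = old;
+;   } while (!success);
+;   return old;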
;;;; Fence and is-lock-free.
@@ -381,6 +866,19 @@
; CHECK-LABEL: test_not_lock_free
; CHECK: mov {{.*}}, 0
+define i32 @test_atomic_is_lock_free_ignored(i32 %iptr) {
+entry:
+ %ptr = inttoptr i32 %iptr to i8*
+ %ignored = call i1 @llvm.nacl.atomic.is.lock.free(i32 4, i8* %ptr)
+ ret i32 0
+}
+; CHECK-LABEL: test_atomic_is_lock_free_ignored
+; CHECK: mov {{.*}}, 0
+; This can get optimized out, because it's side-effect-free.
+; CHECKO2REM-LABEL: test_atomic_is_lock_free_ignored
+; CHECKO2REM-NOT: mov {{.*}}, 1
+; CHECKO2REM: mov {{.*}}, 0
+
; TODO(jvoung): at some point we can take advantage of the
; fact that nacl.atomic.is.lock.free will resolve to a constant
; (which adds DCE opportunities). Once we optimize, the test expectations