Subzero: lower the rest of the atomic operations.

64-bit ops are expanded via a cmpxchg8b loop.

64/32-bit and/or/xor are also expanded into a cmpxchg /
cmpxchg8b loop.

Add a cross test for atomic RMW operations and
compare and swap.

Misc: Test that atomic.is.lock.free can be optimized out if result is ignored.

TODO:
* optimize compare and swap with compare+branch further down
instruction stream.

* optimize atomic RMW when the return value is ignored
(adds a locked field to binary ops though).

* We may want to do some actual target-dependent basic
block splitting + expansion (the instructions inserted by
the expansion must reference the pre-colored registers,
etc.). Otherwise, we are currently getting by with modeling
the extended liveness of the variables used in the loops
using fake uses.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=jfb@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/362463002
diff --git a/crosstest/crosstest.py b/crosstest/crosstest.py
index a37b10f..be6c54c 100755
--- a/crosstest/crosstest.py
+++ b/crosstest/crosstest.py
@@ -57,6 +57,11 @@
                            metavar='PATH',
                            help='Path to LLVM executables like llc ' +
                                 '(defaults to $LLVM_BIN_PATH)')
+    argparser.add_argument('--crosstest-bitcode', required=False,
+                           default=1, type=int,
+                           help='Compile non-subzero crosstest object file ' +
+                           'from the same bitcode as the subzero object. ' +
+                           'If 0, then compile it straight from source.')
     args = argparser.parse_args()
 
     objs = []
@@ -113,7 +118,9 @@
         # failures.  This behavior can be inspected by switching
         # use_llc between True and False.
         use_llc = False
-        if use_llc:
+        if not args.crosstest_bitcode:
+            objs.append(arg)
+        elif use_llc:
             shellcmd([os.path.join(llvm_bin_path, 'llc'),
                       '-filetype=obj',
                       '-o=' + obj_llc,
@@ -125,4 +132,4 @@
     linker = 'clang' if os.path.splitext(args.driver)[1] == '.c' else 'clang++'
     shellcmd([os.path.join(llvm_bin_path, linker), '-g', '-m32', args.driver] +
              objs +
-             ['-lm', '-o', os.path.join(args.dir, args.output)])
+             ['-lm', '-lpthread', '-o', os.path.join(args.dir, args.output)])
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index cf821e2..bba53d7 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -64,6 +64,17 @@
         --driver=test_icmp_main.cpp \
         --output=test_icmp_O${optlevel}
 
+    # Compile the non-subzero object files straight from source
+    # since the native LLVM backend does not understand how to
+    # lower NaCl-specific intrinsics.
+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+       --dir="${OUTDIR}" \
+       --llvm-bin-path="${LLVM_BIN_PATH}" \
+       --test=test_sync_atomic.cpp \
+       --crosstest-bitcode=0 \
+       --driver=test_sync_atomic_main.cpp \
+       --output=test_sync_atomic_O${optlevel}
+
 done
 
 for optlevel in ${OPTLEVELS} ; do
@@ -74,4 +85,5 @@
     "${OUTDIR}"/test_fcmp_O${optlevel}
     "${OUTDIR}"/test_global_O${optlevel}
     "${OUTDIR}"/test_icmp_O${optlevel}
+    "${OUTDIR}"/test_sync_atomic_O${optlevel}
 done
diff --git a/crosstest/test_sync_atomic.cpp b/crosstest/test_sync_atomic.cpp
new file mode 100644
index 0000000..05d0336
--- /dev/null
+++ b/crosstest/test_sync_atomic.cpp
@@ -0,0 +1,63 @@
+//===- subzero/crosstest/test_sync_atomic.cpp - Implementation for tests --===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This aims to test that all the atomic RMW instructions and compare and swap
+// work across the allowed atomic types. This uses the __sync_* builtins
+// to test the atomic operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#include <cstdlib>
+
+#include "test_sync_atomic.h"
+
+#define X(inst, type)                                                   \
+  type test_##inst(bool fetch_first, volatile type *ptr, type a) {      \
+    if (fetch_first) {                                                  \
+      return __sync_fetch_and_##inst(ptr, a);                           \
+    } else {                                                            \
+      return __sync_##inst##_and_fetch(ptr, a);                         \
+    }                                                                   \
+  }                                                                     \
+  type test_alloca_##inst(bool fetch, volatile type *ptr, type a) {     \
+    const size_t buf_size = 8;                                          \
+    type buf[buf_size];                                                 \
+    for (size_t i = 0; i < buf_size; ++i) {                             \
+      if (fetch) {                                                      \
+        buf[i] = __sync_fetch_and_##inst(ptr, a);                       \
+      } else {                                                          \
+        buf[i] = __sync_##inst##_and_fetch(ptr, a);                     \
+      }                                                                 \
+    }                                                                   \
+    type sum = 0;                                                       \
+    for (size_t i = 0; i < buf_size; ++i) {                             \
+      sum += buf[i];                                                    \
+    }                                                                   \
+    return sum;                                                         \
+  }                                                                     \
+  type test_const_##inst(bool fetch, volatile type *ptr, type ign) {    \
+    if (fetch) {                                                        \
+      return __sync_fetch_and_##inst(ptr, 42);                          \
+    } else {                                                            \
+      return __sync_##inst##_and_fetch(ptr, 99);                        \
+    }                                                                   \
+  }
+
+FOR_ALL_RMWOP_TYPES(X)
+#undef X
+
+#define X(type)                                                          \
+  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval) { \
+    return __sync_val_compare_and_swap(ptr, oldval, newval);             \
+  }
+
+ATOMIC_TYPE_TABLE
+#undef X
diff --git a/crosstest/test_sync_atomic.def b/crosstest/test_sync_atomic.def
new file mode 100644
index 0000000..f84afde
--- /dev/null
+++ b/crosstest/test_sync_atomic.def
@@ -0,0 +1,50 @@
+//===- subzero/crosstest/test_sync_atomic.def - macros for tests -*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines macros for testing atomic intrinsics (via sync builtins).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_SYNC_ATOMIC_DEF
+#define TEST_SYNC_ATOMIC_DEF
+
+#define STR(s) #s
+
+#define RMWOP_TABLE  \
+  /* inst */         \
+  X(add)             \
+  X(sub)             \
+  X(or)              \
+  X(and)             \
+  X(xor)
+// Callers define X(inst) before expanding RMWOP_TABLE: one X per RMW op name.
+
+#define ATOMIC_TYPE_TABLE \
+  /* type */              \
+  X(uint8_t)              \
+  X(uint16_t)             \
+  X(uint32_t)             \
+  X(uint64_t)
+// Callers define X(type) before expanding ATOMIC_TYPE_TABLE: one X per type.
+
+#define FOR_ALL_RMWTYPES_INST(F, inst) \
+  F(inst, uint8_t)                     \
+  F(inst, uint16_t)                    \
+  F(inst, uint32_t)                    \
+  F(inst, uint64_t)
+
+#define FOR_ALL_RMWOP_TYPES(X)      \
+  FOR_ALL_RMWTYPES_INST(X, add)     \
+  FOR_ALL_RMWTYPES_INST(X, sub)     \
+  FOR_ALL_RMWTYPES_INST(X, or)      \
+  FOR_ALL_RMWTYPES_INST(X, and)     \
+  FOR_ALL_RMWTYPES_INST(X, xor)
+// Callers define X(inst, type): expands once per (RMW op, type) pair.
+
+#endif // TEST_SYNC_ATOMIC_DEF
diff --git a/crosstest/test_sync_atomic.h b/crosstest/test_sync_atomic.h
new file mode 100644
index 0000000..a88cd73
--- /dev/null
+++ b/crosstest/test_sync_atomic.h
@@ -0,0 +1,29 @@
+//===- subzero/crosstest/test_sync_atomic.h - Test prototypes ---*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function prototypes for cross testing atomic
+// intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "test_sync_atomic.def"
+
+#define X(inst, type)                                                   \
+  type test_##inst(bool fetch_first, volatile type *ptr, type a);       \
+  type test_alloca_##inst(bool fetch, volatile type *ptr, type a);      \
+  type test_const_##inst(bool fetch, volatile type *ptr, type ignored);
+
+FOR_ALL_RMWOP_TYPES(X)
+#undef X
+
+#define X(type)   \
+  type test_val_cmp_swap(volatile type *ptr, type oldval, type newval);
+
+ATOMIC_TYPE_TABLE
+#undef X
diff --git a/crosstest/test_sync_atomic_main.cpp b/crosstest/test_sync_atomic_main.cpp
new file mode 100644
index 0000000..0cae7cd
--- /dev/null
+++ b/crosstest/test_sync_atomic_main.cpp
@@ -0,0 +1,298 @@
+//===- subzero/crosstest/test_sync_atomic_main.cpp - Driver for tests -----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Driver for cross testing atomic intrinsics, via the sync builtins.
+//
+//===----------------------------------------------------------------------===//
+
+/* crosstest.py --test=test_sync_atomic.cpp --crosstest-bitcode=0 \
+   --driver=test_sync_atomic_main.cpp --prefix=Subzero_ \
+   --output=test_sync_atomic */
+
+#include <pthread.h>
+#include <stdint.h>
+
+#include <cerrno>
+#include <climits>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+// Include test_sync_atomic.h twice - once normally, and once within the
+// Subzero_ namespace, corresponding to the llc and Subzero translated
+// object files, respectively.
+#include "test_sync_atomic.h"
+namespace Subzero_ {
+#include "test_sync_atomic.h"
+}
+
+volatile uint64_t Values[] = {
+    0,                    1,                    0x7e,
+    0x7f,                 0x80,                 0x81,
+    0xfe,                 0xff,                 0x7ffe,
+    0x7fff,               0x8000,               0x8001,
+    0xfffe,               0xffff,
+    0x007fffff /*Max subnormal + */,
+    0x00800000 /*Min+ */, 0x7f7fffff /*Max+ */,
+    0x7f800000 /*+Inf*/,  0xff800000 /*-Inf*/,
+    0x7fa00000 /*SNaN*/,  0x7fc00000 /*QNaN*/,
+    0x7ffffffe,           0x7fffffff,           0x80000000,
+    0x80000001,           0xfffffffe,           0xffffffff,
+    0x100000000ll,        0x100000001ll,
+    0x000fffffffffffffll /*Max subnormal + */,
+    0x0010000000000000ll /*Min+ */,
+    0x7fefffffffffffffll /*Max+ */,
+    0x7ff0000000000000ll /*+Inf*/,
+    0xfff0000000000000ll /*-Inf*/,
+    0x7ff0000000000001ll /*SNaN*/,
+    0x7ff8000000000000ll /*QNaN*/,
+    0x7ffffffffffffffell, 0x7fffffffffffffffll, 0x8000000000000000ll,
+    0x8000000000000001ll, 0xfffffffffffffffell, 0xffffffffffffffffll };
+
+const static size_t NumValues = sizeof(Values) / sizeof(*Values);
+
+struct {
+  volatile uint8_t l8;
+  volatile uint16_t l16;
+  volatile uint32_t l32;
+  volatile uint64_t l64;
+} AtomicLocs;
+
+template <typename Type>
+void testAtomicRMW(volatile Type *AtomicLoc,
+                   size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef Type (*FuncType)(bool, volatile Type*, Type);
+  static struct {
+    const char *Label;
+    FuncType LlcFunc;
+    FuncType SzFunc;
+  } Funcs[] = {
+#define X(inst)                                                             \
+  {                                                                         \
+    STR(inst), test_##inst, Subzero_::test_##inst                           \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst   \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_const", test_const_##inst, Subzero_::test_const_##inst      \
+  },
+      RMWOP_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+  for (size_t fn = 0; fn < NumFuncs; ++fn) {
+    for (size_t vi = 0; vi < NumValues; ++vi) {
+      Type InitVal = static_cast<Type>(Values[vi]);
+      for (size_t vj = 0; vj < NumValues; ++vj) {
+        Type Operand = static_cast<Type>(Values[vj]);
+        for (size_t variant = 0; variant < 2; ++variant) {
+          const bool FetchFirst = (variant != 0);
+          ++TotalTests;
+          *AtomicLoc = InitVal;
+          Type SzRet = Funcs[fn].SzFunc(
+              FetchFirst, AtomicLoc, Operand);
+          Type SzMem = *AtomicLoc;
+          *AtomicLoc = InitVal;
+          Type LlcRet = Funcs[fn].LlcFunc(
+              FetchFirst, AtomicLoc, Operand);
+          Type LlcMem = *AtomicLoc;
+          if (SzRet == LlcRet && SzMem == LlcMem) {
+            ++Passes;
+          } else {
+            ++Failures;
+            std::cout << "test_" << Funcs[fn].Label
+                      << (CHAR_BIT * sizeof(Type)) << "("
+                      << static_cast<uint64_t>(InitVal) << ", "
+                      << static_cast<uint64_t>(Operand)
+                      << "): sz1=" << static_cast<uint64_t>(SzRet)
+                      << " llc1=" << static_cast<uint64_t>(LlcRet)
+                      << " sz2=" << static_cast<uint64_t>(SzMem)
+                      << " llc2=" << static_cast<uint64_t>(LlcMem)
+                      << "\n";
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename Type>
+void testValCompareAndSwap(volatile Type *AtomicLoc, size_t &TotalTests,
+                           size_t &Passes, size_t &Failures) {
+  for (size_t vi = 0; vi < NumValues; ++vi) {
+    Type InitVal = static_cast<Type>(Values[vi]);
+    for (size_t vj = 0; vj < NumValues; ++vj) {
+      Type NewVal = static_cast<Type>(Values[vj]);
+      for (size_t trial = 0; trial < 2; ++trial) {
+        const bool UseNewAsOld = (trial != 0);
+        ++TotalTests;
+        *AtomicLoc = InitVal;
+        Type SzRet = Subzero_::test_val_cmp_swap(
+            AtomicLoc, UseNewAsOld ? NewVal : InitVal, NewVal);
+        Type SzMem = *AtomicLoc;
+        *AtomicLoc = InitVal;
+        Type LlcRet = test_val_cmp_swap(
+            AtomicLoc, UseNewAsOld ? NewVal : InitVal, NewVal);
+        Type LlcMem = *AtomicLoc;
+        if (SzRet == LlcRet && SzMem == LlcMem) {
+          ++Passes;
+        } else {
+          ++Failures;
+          std::cout << "test_val_cmp_swap" << (CHAR_BIT * sizeof(Type)) << "("
+                    << static_cast<uint64_t>(InitVal) << ", "
+                    << static_cast<uint64_t>(NewVal)
+                    << "): sz1=" << static_cast<uint64_t>(SzRet)
+                    << " llc1=" << static_cast<uint64_t>(LlcRet)
+                    << " sz2=" << static_cast<uint64_t>(SzMem)
+                    << " llc2=" << static_cast<uint64_t>(LlcMem)
+                    << "\n";
+        }
+      }
+    }
+  }
+}
+
+template <typename Type>
+struct ThreadData {
+  Type (*FuncPtr)(bool, volatile Type*, Type);
+  bool Fetch;
+  volatile Type *Ptr;
+  Type Adjustment;
+};
+
+template <typename Type>
+void *threadWrapper(void *RawArgs) {
+  const size_t kIterations = 8000;  // Hammer the shared location repeatedly.
+  ThreadData<Type> *Args = static_cast<ThreadData<Type>*>(RawArgs);
+  for (size_t Rep = 0; Rep < kIterations; ++Rep) {
+    (void)Args->FuncPtr(Args->Fetch, Args->Ptr, Args->Adjustment);
+  }
+  return NULL;
+}
+
+template <typename Type>
+void testAtomicRMWThreads(volatile Type *AtomicLoc, size_t &TotalTests,
+                          size_t &Passes, size_t &Failures) {
+  typedef Type (*FuncType)(bool, volatile Type*, Type);
+  static struct {
+    const char *Name;
+    FuncType FuncLlc;
+    FuncType FuncSz;
+  } Funcs[] = {
+#define X(inst)                                                             \
+  {                                                                         \
+    STR(inst), test_##inst, Subzero_::test_##inst                           \
+  },                                                                        \
+  {                                                                         \
+    STR(inst) "_alloca", test_alloca_##inst, Subzero_::test_alloca_##inst   \
+  },
+      RMWOP_TABLE
+#undef X
+  };
+  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
+
+  // Just test a few values (from the Values table), otherwise it's too slow.
+  volatile uint64_t ValuesSubset[] = { 1, 0x7e, 0x000fffffffffffffll };
+  const size_t NumValuesSubset = sizeof(ValuesSubset) / sizeof(*ValuesSubset);
+
+  for (size_t f = 0; f < NumFuncs; ++f) {
+    for (size_t i = 0; i < NumValuesSubset; ++i) {
+      Type Value1 = static_cast<Type>(ValuesSubset[i]);
+      for (size_t j = 0; j < NumValuesSubset; ++j) {
+        Type Value2 = static_cast<Type>(ValuesSubset[j]);
+        bool fetch_first = true;
+        ThreadData<Type> TDataSz = {
+          Funcs[f].FuncSz, fetch_first, AtomicLoc, Value2 };
+        ThreadData<Type> TDataLlc = {
+          Funcs[f].FuncLlc, fetch_first, AtomicLoc, Value2 };
+        ++TotalTests;
+        const size_t NumThreads = 4;
+        pthread_t t[NumThreads];
+
+        // Try N threads w/ just Llc.
+        *AtomicLoc = Value1;
+        for (size_t m = 0; m < NumThreads; ++m) {
+          pthread_create(&t[m], NULL, &threadWrapper<Type>,
+                         reinterpret_cast<void *>(&TDataLlc));
+        }
+        for (size_t m = 0; m < NumThreads; ++m) {
+          pthread_join(t[m], NULL);
+        }
+        Type ResultLlc = *AtomicLoc;
+
+        // Try N threads w/ both Sz and Llc.
+        *AtomicLoc = Value1;
+        for (size_t m = 0; m < NumThreads; ++m) {
+          if (pthread_create(&t[m], NULL, &threadWrapper<Type>,
+                             m % 2 == 0
+                             ? reinterpret_cast<void *>(&TDataLlc)
+                             : reinterpret_cast<void *>(&TDataSz)) != 0) {
+            ++Failures;
+            std::cout << "pthread_create failed w/ " << strerror(errno) << "\n";
+            abort();
+          }
+        }
+        for (size_t m = 0; m < NumThreads; ++m) {
+          if (pthread_join(t[m], NULL) != 0) {
+            ++Failures;
+            std::cout << "pthread_join failed w/ " << strerror(errno) << "\n";
+            abort();
+          }
+        }
+        Type ResultMixed = *AtomicLoc;
+
+        if (ResultLlc == ResultMixed) {
+          ++Passes;
+        } else {
+          ++Failures;
+          std::cout << "test_with_threads_" << Funcs[f].Name
+                    << (CHAR_BIT * sizeof(Type)) << "("
+                    << static_cast<uint64_t>(Value1) << ", "
+                    << static_cast<uint64_t>(Value2)
+                    << "): llc=" << static_cast<uint64_t>(ResultLlc)
+                    << " mixed=" << static_cast<uint64_t>(ResultMixed)
+                    << "\n";
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  size_t TestCount = 0;
+  size_t PassCount = 0;
+  size_t FailCount = 0;
+
+  testAtomicRMW<uint8_t>(&AtomicLocs.l8, TestCount, PassCount, FailCount);
+  testAtomicRMW<uint16_t>(&AtomicLocs.l16, TestCount, PassCount, FailCount);
+  testAtomicRMW<uint32_t>(&AtomicLocs.l32, TestCount, PassCount, FailCount);
+  testAtomicRMW<uint64_t>(&AtomicLocs.l64, TestCount, PassCount, FailCount);
+  testValCompareAndSwap<uint8_t>(
+      &AtomicLocs.l8, TestCount, PassCount, FailCount);
+  testValCompareAndSwap<uint16_t>(
+      &AtomicLocs.l16, TestCount, PassCount, FailCount);
+  testValCompareAndSwap<uint32_t>(
+      &AtomicLocs.l32, TestCount, PassCount, FailCount);
+  testValCompareAndSwap<uint64_t>(
+      &AtomicLocs.l64, TestCount, PassCount, FailCount);
+  testAtomicRMWThreads<uint8_t>(
+      &AtomicLocs.l8, TestCount, PassCount, FailCount);
+  testAtomicRMWThreads<uint16_t>(
+      &AtomicLocs.l16, TestCount, PassCount, FailCount);
+  testAtomicRMWThreads<uint32_t>(
+      &AtomicLocs.l32, TestCount, PassCount, FailCount);
+  testAtomicRMWThreads<uint64_t>(
+      &AtomicLocs.l64, TestCount, PassCount, FailCount);
+
+  std::cout << "TotalTests=" << TestCount << " Passes=" << PassCount
+            << " Failures=" << FailCount << "\n";
+  return FailCount;
+}