Add a few Subzero intrinsics (not the atomic ones yet).

Handle:
* mem{cpy,move,set} (without optimizations for known lengths)
* nacl.read.tp
* setjmp, longjmp
* trap

Mostly see if the dispatching/organization is okay.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/321993002
diff --git a/Makefile.standalone b/Makefile.standalone
index beaf9b6..9aa2fdb 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -39,6 +39,7 @@
 	IceGlobalContext.cpp \
 	IceInst.cpp \
 	IceInstX8632.cpp \
+	IceIntrinsics.cpp \
 	IceLiveness.cpp \
 	IceOperand.cpp \
 	IceRegAlloc.cpp \
diff --git a/crosstest/mem_intrin.cpp b/crosstest/mem_intrin.cpp
new file mode 100644
index 0000000..baeb06d
--- /dev/null
+++ b/crosstest/mem_intrin.cpp
@@ -0,0 +1,97 @@
+/*
+ * Simple sanity test of memcpy, memmove, and memset intrinsics.
+ * (fixed length buffers, variable length buffers, etc.)
+ */
+
+#include <stdint.h> /* cstdint requires -std=c++0x or higher */
+#include <cstdlib>
+#include <cstring>
+
+#include "mem_intrin.h"
+
+typedef int elem_t;
+
+/*
+ * Reset buf to the sequence of bytes: init, init+1, ... init+length-1
+ */
+static void __attribute__((noinline)) reset_buf(uint8_t *buf,
+                                                uint8_t init,
+                                                size_t length) {
+  size_t i;
+  size_t v = init;
+  for (i = 0; i < length; ++i)
+    buf[i] = v++;
+}
+
+/* Do a fletcher-16 checksum so that the order of the values matter.
+ * (Not doing a fletcher-32 checksum, since we are working with
+ * smaller buffers, whose total won't approach 2**16).
+ */
+static int __attribute__((noinline)) fletcher_checksum(uint8_t *buf,
+                                                       size_t length) {
+  size_t i;
+  int sum = 0;
+  int sum_of_sums = 0;
+  const int kModulus = 255;
+  for (i = 0; i < length; ++i) {
+    sum = (sum + buf[i]) % kModulus;
+    sum_of_sums = (sum_of_sums + sum) % kModulus;
+  }
+  return (sum_of_sums << 8) | sum;
+}
+
+#define NWORDS 32
+#define BYTE_LENGTH (NWORDS * sizeof(elem_t))
+
+int memcpy_test_fixed_len(uint8_t init) {
+  elem_t buf[NWORDS];
+  elem_t buf2[NWORDS];
+  reset_buf((uint8_t *)buf, init, BYTE_LENGTH);
+  memcpy((void *)buf2, (void *)buf, BYTE_LENGTH);
+  return fletcher_checksum((uint8_t*)buf2, BYTE_LENGTH);
+}
+
+int memmove_test_fixed_len(uint8_t init) {
+  elem_t buf[NWORDS];
+  reset_buf((uint8_t *)buf, init, BYTE_LENGTH);
+  memmove((void *)(buf + 4), (void *)buf, BYTE_LENGTH - (4 * sizeof(elem_t)));
+  return fletcher_checksum((uint8_t*)buf + 4, BYTE_LENGTH - 4);
+}
+
+int memset_test_fixed_len(uint8_t init) {
+  elem_t buf[NWORDS];
+  memset((void *)buf, init, BYTE_LENGTH);
+  return fletcher_checksum((uint8_t*)buf, BYTE_LENGTH);
+}
+
+int memcpy_test(uint8_t *buf, void *buf2, uint8_t init, size_t length) {
+  reset_buf(buf, init, length);
+  memcpy(buf2, (void *)buf, length);
+  return fletcher_checksum((uint8_t *)buf2, length);
+}
+
+int memmove_test(uint8_t *buf, void *buf2, uint8_t init, size_t length) {
+  int sum1;
+  int sum2;
+  const int overlap_bytes = 4 * sizeof(elem_t);
+  if (length <= overlap_bytes)
+    return 0;
+  uint8_t *overlap_buf = buf + overlap_bytes;
+  size_t reduced_length = length - overlap_bytes;
+  reset_buf(buf, init, length);
+
+  /* Test w/ overlap. */
+  memmove((void *)overlap_buf, (void *)buf, reduced_length);
+  sum1 = fletcher_checksum(overlap_buf, reduced_length);
+  /* Test w/out overlap. */
+  memmove(buf2, (void *)buf, length);
+  sum2 = fletcher_checksum((uint8_t *)buf2, length);
+  return sum1 + sum2;
+}
+
+int memset_test(uint8_t *buf, void *buf2, uint8_t init, size_t length) {
+  memset((void *)buf, init, length);
+  memset(buf2, init + 4, length);
+  return fletcher_checksum(buf, length) +
+      fletcher_checksum((uint8_t *)buf2, length);
+}
diff --git a/crosstest/mem_intrin.h b/crosstest/mem_intrin.h
new file mode 100644
index 0000000..97e6dcc
--- /dev/null
+++ b/crosstest/mem_intrin.h
@@ -0,0 +1,19 @@
+/*
+ * Simple sanity test of memcpy, memmove, and memset intrinsics.
+ * (fixed length buffers, variable length buffers, etc.).
+ * There is no include guard since this will be included multiple times,
+ * under different namespaces.
+ */
+
+/* Declare first buf as uint8_t * and second as void *, to avoid C++
+ * name mangling's use of substitutions. Otherwise Subzero's name
+ * mangling injection will need to bump each substitution sequence ID
+ * up by one (e.g., from S_ to S0_ and S1_ to S2_).
+ */
+int memcpy_test(uint8_t *buf, void *buf2, uint8_t init, size_t length);
+int memmove_test(uint8_t *buf, void *buf2, uint8_t init, size_t length);
+int memset_test(uint8_t *buf, void *buf2, uint8_t init, size_t length);
+
+int memcpy_test_fixed_len(uint8_t init);
+int memmove_test_fixed_len(uint8_t init);
+int memset_test_fixed_len(uint8_t init);
diff --git a/crosstest/mem_intrin_main.cpp b/crosstest/mem_intrin_main.cpp
new file mode 100644
index 0000000..76df66a
--- /dev/null
+++ b/crosstest/mem_intrin_main.cpp
@@ -0,0 +1,69 @@
+/* crosstest.py --test=mem_intrin.cpp --driver=mem_intrin_main.cpp \
+   --prefix=Subzero_ --output=mem_intrin */
+
+#include <stdint.h> /* cstdint requires -std=c++0x or higher */
+#include <cstdio>
+
+#include "mem_intrin.h"
+namespace Subzero_ {
+#include "mem_intrin.h"
+}
+
+#define XSTR(s) STR(s)
+#define STR(s) #s
+
+void testFixedLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+#define do_test_fixed(test_func)                                        \
+  for (uint8_t init_val = 0; init_val < 100; ++init_val) {              \
+    ++TotalTests;                                                       \
+    int llc_result = test_func(init_val);                               \
+    int sz_result = Subzero_::test_func(init_val);                      \
+    if (llc_result == sz_result) {                                      \
+      ++Passes;                                                         \
+    } else {                                                            \
+      ++Failures;                                                       \
+      printf("Failure (%s): init_val=%d, llc=%d, sz=%d\n",              \
+             STR(test_func), init_val, llc_result, sz_result);          \
+    }                                                                   \
+  }
+
+  do_test_fixed(memcpy_test_fixed_len)
+  do_test_fixed(memmove_test_fixed_len)
+  do_test_fixed(memset_test_fixed_len)
+#undef do_test_fixed
+}
+
+void testVariableLen(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  uint8_t buf[256];
+  uint8_t buf2[256];
+#define do_test_variable(test_func)                                     \
+  for (size_t len = 4; len < 128; ++len) {                              \
+    for (uint8_t init_val = 0; init_val < 100; ++init_val) {            \
+      ++TotalTests;                                                     \
+      int llc_result = test_func(buf, (void *)buf2, init_val, len);     \
+      int sz_result = Subzero_::test_func(buf, (void *)buf2, init_val, len); \
+      if (llc_result == sz_result) {                                    \
+        ++Passes;                                                       \
+      } else {                                                          \
+        ++Failures;                                                     \
+        printf("Failure (%s): init_val=%d, len=%zu, llc=%d, sz=%d\n",   \
+               STR(test_func), init_val, len, llc_result, sz_result);   \
+      }                                                                 \
+    }                                                                   \
+  }
+
+  do_test_variable(memcpy_test)
+  do_test_variable(memmove_test)
+  do_test_variable(memset_test)
+#undef do_test_variable
+}
+
+int main(int argc, char **argv) {
+  size_t TotalTests = 0;
+  size_t Passes = 0;
+  size_t Failures = 0;
+  testFixedLen(TotalTests, Passes, Failures);
+  testVariableLen(TotalTests, Passes, Failures);
+  printf("TotalTests=%zu Passes=%zu Failures=%zu\n", TotalTests, Passes, Failures);
+  return Failures;
+}
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index 4ba208f..d89e1b9 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -23,6 +23,20 @@
     ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
         --dir="${OUTDIR}" \
         --llvm-bin-path="${LLVM_BIN_PATH}" \
+        --test=mem_intrin.cpp \
+        --driver=mem_intrin_main.cpp \
+        --output=mem_intrin_O${optlevel}
+
+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+        --dir="${OUTDIR}" \
+        --llvm-bin-path="${LLVM_BIN_PATH}" \
+        --test=test_arith.cpp --test=test_arith_frem.ll \
+        --driver=test_arith_main.cpp \
+        --output=test_arith_O${optlevel}
+
+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+        --dir="${OUTDIR}" \
+        --llvm-bin-path="${LLVM_BIN_PATH}" \
         --test=test_cast.cpp --test=test_cast_to_u1.ll \
         --driver=test_cast_main.cpp \
         --output=test_cast_O${optlevel}
@@ -41,19 +55,13 @@
         --driver=test_icmp_main.cpp \
         --output=test_icmp_O${optlevel}
 
-    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
-        --dir="${OUTDIR}" \
-        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_arith.cpp --test=test_arith_frem.ll \
-        --driver=test_arith_main.cpp \
-        --output=test_arith_O${optlevel}
-
 done
 
 for optlevel in ${OPTLEVELS} ; do
     "${OUTDIR}"/simple_loop_O${optlevel}
+    "${OUTDIR}"/mem_intrin_O${optlevel}
+    "${OUTDIR}"/test_arith_O${optlevel}
     "${OUTDIR}"/test_cast_O${optlevel}
     "${OUTDIR}"/test_fcmp_O${optlevel}
     "${OUTDIR}"/test_icmp_O${optlevel}
-    "${OUTDIR}"/test_arith_O${optlevel}
 done
diff --git a/src/IceGlobalContext.h b/src/IceGlobalContext.h
index 088421f..c46d7d45 100644
--- a/src/IceGlobalContext.h
+++ b/src/IceGlobalContext.h
@@ -20,6 +20,7 @@
 #include "llvm/Support/raw_ostream.h"
 
 #include "IceDefs.h"
+#include "IceIntrinsics.h"
 #include "IceTypes.h"
 
 namespace Ice {
@@ -88,6 +89,8 @@
   // Allocate data of type T using the global allocator.
   template <typename T> T *allocate() { return Allocator.Allocate<T>(); }
 
+  const Intrinsics &getIntrinsicsInfo() const { return IntrinsicsInfo; }
+
 private:
   Ostream StrDump; // Stream for dumping / diagnostics
   Ostream StrEmit; // Stream for code emission
@@ -95,6 +98,7 @@
   llvm::BumpPtrAllocator Allocator;
   VerboseMask VMask;
   llvm::OwningPtr<class ConstantPool> ConstPool;
+  Intrinsics IntrinsicsInfo;
   const TargetArch Arch;
   const OptLevel Opt;
   const IceString TestPrefix;
diff --git a/src/IceInst.h b/src/IceInst.h
index a465eda..0397e02 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -18,6 +18,7 @@
 
 #include "IceDefs.h"
 #include "IceInst.def"
+#include "IceIntrinsics.h"
 #include "IceTypes.h"
 
 // TODO: The Cfg structure, and instructions in particular, need to be
@@ -42,6 +43,7 @@
     Cast,
     Fcmp,
     Icmp,
+    IntrinsicCall,
     Load,
     Phi,
     Ret,
@@ -286,8 +288,13 @@
 public:
   static InstCall *create(Cfg *Func, SizeT NumArgs, Variable *Dest,
                           Operand *CallTarget) {
+    // Set HasSideEffects to true so that the call instruction can't be
+    // dead-code eliminated. IntrinsicCalls can override this if the
+    // particular intrinsic is deletable and has no side-effects.
+    const bool HasSideEffects = true;
+    const InstKind Kind = Inst::Call;
     return new (Func->allocateInst<InstCall>())
-        InstCall(Func, NumArgs, Dest, CallTarget);
+        InstCall(Func, NumArgs, Dest, CallTarget, HasSideEffects, Kind);
   }
   void addArg(Operand *Arg) { addSource(Arg); }
   Operand *getCallTarget() const { return getSrc(0); }
@@ -296,18 +303,18 @@
   virtual void dump(const Cfg *Func) const;
   static bool classof(const Inst *Inst) { return Inst->getKind() == Call; }
 
-private:
-  InstCall(Cfg *Func, SizeT NumArgs, Variable *Dest, Operand *CallTarget)
-      : Inst(Func, Inst::Call, NumArgs + 1, Dest) {
-    // Set HasSideEffects so that the call instruction can't be
-    // dead-code eliminated.  Don't set this for a deletable intrinsic
-    // call.
-    HasSideEffects = true;
+protected:
+  InstCall(Cfg *Func, SizeT NumArgs, Variable *Dest, Operand *CallTarget,
+           bool HasSideEff, InstKind Kind)
+      : Inst(Func, Kind, NumArgs + 1, Dest) {
+    HasSideEffects = HasSideEff;
     addSource(CallTarget);
   }
+  virtual ~InstCall() {}
+
+private:
   InstCall(const InstCall &) LLVM_DELETED_FUNCTION;
   InstCall &operator=(const InstCall &) LLVM_DELETED_FUNCTION;
-  virtual ~InstCall() {}
 };
 
 // Cast instruction (a.k.a. conversion operation).
@@ -395,6 +402,34 @@
   const ICond Condition;
 };
 
+// Call to an intrinsic function.  The call target is captured as getSrc(0),
+// and arg I is captured as getSrc(I+1).
+class InstIntrinsicCall : public InstCall {
+public:
+  static InstIntrinsicCall *create(Cfg *Func, SizeT NumArgs, Variable *Dest,
+                                   Operand *CallTarget,
+                                   const Intrinsics::IntrinsicInfo &Info) {
+    return new (Func->allocateInst<InstIntrinsicCall>())
+        InstIntrinsicCall(Func, NumArgs, Dest, CallTarget, Info);
+  }
+  static bool classof(const Inst *Inst) {
+    return Inst->getKind() == IntrinsicCall;
+  }
+
+  Intrinsics::IntrinsicInfo getIntrinsicInfo() const { return Info; }
+
+private:
+  InstIntrinsicCall(Cfg *Func, SizeT NumArgs, Variable *Dest,
+                    Operand *CallTarget, const Intrinsics::IntrinsicInfo &Info)
+      : InstCall(Func, NumArgs, Dest, CallTarget, Info.HasSideEffects,
+                 Inst::IntrinsicCall),
+        Info(Info) {}
+  InstIntrinsicCall(const InstIntrinsicCall &) LLVM_DELETED_FUNCTION;
+  InstIntrinsicCall &operator=(const InstIntrinsicCall &) LLVM_DELETED_FUNCTION;
+  virtual ~InstIntrinsicCall() {}
+  const Intrinsics::IntrinsicInfo Info;
+};
+
 // Load instruction.  The source address is captured in getSrc(0).
 class InstLoad : public Inst {
 public:
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 17e5712..6477683 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -50,6 +50,15 @@
 const size_t TypeX8632AttributesSize =
     llvm::array_lengthof(TypeX8632Attributes);
 
+const char *InstX8632SegmentRegNames[] = {
+#define X(val, name)                                                           \
+  name,
+    SEG_REGX8632_TABLE
+#undef X
+};
+const size_t InstX8632SegmentRegNamesSize =
+    llvm::array_lengthof(InstX8632SegmentRegNames);
+
 } // end of anonymous namespace
 
 const char *InstX8632::getWidthString(Type Ty) {
@@ -58,9 +67,9 @@
 
 OperandX8632Mem::OperandX8632Mem(Cfg *Func, Type Ty, Variable *Base,
                                  Constant *Offset, Variable *Index,
-                                 uint32_t Shift)
+                                 uint16_t Shift, SegmentRegisters SegmentReg)
     : OperandX8632(kMem, Ty), Base(Base), Offset(Offset), Index(Index),
-      Shift(Shift) {
+      Shift(Shift), SegmentReg(SegmentReg) {
   assert(Shift <= 3);
   Vars = NULL;
   NumVars = 0;
@@ -148,6 +157,9 @@
   addSource(Src1);
 }
 
+InstX8632UD2::InstX8632UD2(Cfg *Func)
+    : InstX8632(Func, InstX8632::UD2, 0, NULL) {}
+
 InstX8632Test::InstX8632Test(Cfg *Func, Operand *Src1, Operand *Src2)
     : InstX8632(Func, InstX8632::Test, 2, NULL) {
   addSource(Src1);
@@ -525,6 +537,17 @@
   dumpSources(Func);
 }
 
+void InstX8632UD2::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 0);
+  Str << "\tud2\n";
+}
+
+void InstX8632UD2::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "ud2\n";
+}
+
 void InstX8632Test::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 2);
@@ -758,6 +781,11 @@
 void OperandX8632Mem::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   Str << TypeX8632Attributes[getType()].WidthString << " ";
+  if (SegmentReg != DefaultSegment) {
+    assert(SegmentReg >= 0 &&
+           static_cast<size_t>(SegmentReg) < InstX8632SegmentRegNamesSize);
+    Str << InstX8632SegmentRegNames[SegmentReg] << ":";
+  }
   // TODO: The following is an almost verbatim paste of dump().
   bool Dumped = false;
   Str << "[";
@@ -782,11 +810,14 @@
     OffsetIsZero = (CI->getValue() == 0);
     OffsetIsNegative = (static_cast<int64_t>(CI->getValue()) < 0);
   }
-  if (!OffsetIsZero) { // Suppress if Offset is known to be 0
-    if (Dumped) {
+  if (Dumped) {
+    if (!OffsetIsZero) {     // Suppress if Offset is known to be 0
       if (!OffsetIsNegative) // Suppress if Offset is known to be negative
         Str << "+";
+      Offset->emit(Func);
     }
+  } else {
+    // There is only the offset.
     Offset->emit(Func);
   }
   Str << "]";
@@ -794,6 +825,11 @@
 
 void OperandX8632Mem::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
+  if (SegmentReg != DefaultSegment) {
+    assert(SegmentReg >= 0 &&
+           static_cast<size_t>(SegmentReg) < InstX8632SegmentRegNamesSize);
+    Str << InstX8632SegmentRegNames[SegmentReg] << ":";
+  }
   bool Dumped = false;
   Str << "[";
   if (Base) {
@@ -817,11 +853,14 @@
     OffsetIsZero = (CI->getValue() == 0);
     OffsetIsNegative = (static_cast<int64_t>(CI->getValue()) < 0);
   }
-  if (!OffsetIsZero) { // Suppress if Offset is known to be 0
-    if (Dumped) {
+  if (Dumped) {
+    if (!OffsetIsZero) {     // Suppress if Offset is known to be 0
       if (!OffsetIsNegative) // Suppress if Offset is known to be negative
         Str << "+";
+      Offset->dump(Func);
     }
+  } else {
+    // There is only the offset.
     Offset->dump(Func);
   }
   Str << "]";
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index d5e99c3..47650e1 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -38,6 +38,16 @@
 //#define X(val, init, name, name16, name8, scratch, preserved, stackptr,
 //          frameptr, isI8, isInt, isFP)
 
+// X86 segment registers.
+#define SEG_REGX8632_TABLE  \
+  /* enum value, name */    \
+  X(SegReg_CS, "cs")        \
+  X(SegReg_DS, "ds")        \
+  X(SegReg_ES, "es")        \
+  X(SegReg_SS, "ss")        \
+  X(SegReg_FS, "fs")        \
+  X(SegReg_GS, "gs")        \
+//#define X(val, name)
 
 #define ICEINSTX8632BR_TABLE   \
   /* enum value, dump, emit */ \
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 8a6f14a..7e6e199 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -52,16 +52,26 @@
 // value for the index register.
 class OperandX8632Mem : public OperandX8632 {
 public:
+  enum SegmentRegisters {
+    DefaultSegment = -1,
+#define X(val, name)                                                           \
+    val,
+      SEG_REGX8632_TABLE
+#undef X
+        SegReg_NUM
+  };
   static OperandX8632Mem *create(Cfg *Func, Type Ty, Variable *Base,
                                  Constant *Offset, Variable *Index = NULL,
-                                 uint32_t Shift = 0) {
+                                 uint16_t Shift = 0,
+                                 SegmentRegisters SegmentReg = DefaultSegment) {
     return new (Func->allocate<OperandX8632Mem>())
-        OperandX8632Mem(Func, Ty, Base, Offset, Index, Shift);
+        OperandX8632Mem(Func, Ty, Base, Offset, Index, Shift, SegmentReg);
   }
   Variable *getBase() const { return Base; }
   Constant *getOffset() const { return Offset; }
   Variable *getIndex() const { return Index; }
-  uint32_t getShift() const { return Shift; }
+  uint16_t getShift() const { return Shift; }
+  SegmentRegisters getSegmentRegister() const { return SegmentReg; }
   virtual void emit(const Cfg *Func) const;
   virtual void dump(const Cfg *Func) const;
 
@@ -71,14 +81,15 @@
 
 private:
   OperandX8632Mem(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
-                  Variable *Index, uint32_t Shift);
+                  Variable *Index, uint16_t Shift, SegmentRegisters SegmentReg);
   OperandX8632Mem(const OperandX8632Mem &) LLVM_DELETED_FUNCTION;
   OperandX8632Mem &operator=(const OperandX8632Mem &) LLVM_DELETED_FUNCTION;
   virtual ~OperandX8632Mem() {}
   Variable *Base;
   Constant *Offset;
   Variable *Index;
-  uint32_t Shift;
+  uint16_t Shift;
+  SegmentRegisters SegmentReg : 16;
 };
 
 // VariableSplit is a way to treat an f64 memory location as a pair
@@ -160,6 +171,7 @@
     Subss,
     Test,
     Ucomiss,
+    UD2,
     Xor
   };
   static const char *getWidthString(Type Ty);
@@ -531,6 +543,23 @@
   virtual ~InstX8632Ucomiss() {}
 };
 
+// UD2 instruction.
+class InstX8632UD2 : public InstX8632 {
+public:
+  static InstX8632UD2 *create(Cfg *Func) {
+    return new (Func->allocate<InstX8632UD2>()) InstX8632UD2(Func);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, UD2); }
+
+private:
+  InstX8632UD2(Cfg *Func);
+  InstX8632UD2(const InstX8632UD2 &) LLVM_DELETED_FUNCTION;
+  InstX8632UD2 &operator=(const InstX8632UD2 &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632UD2() {}
+};
+
 // Test instruction.
 class InstX8632Test : public InstX8632 {
 public:
diff --git a/src/IceIntrinsics.cpp b/src/IceIntrinsics.cpp
new file mode 100644
index 0000000..dbf79cf
--- /dev/null
+++ b/src/IceIntrinsics.cpp
@@ -0,0 +1,202 @@
+//===- subzero/src/IceIntrinsics.cpp - Functions related to intrinsics ----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Intrinsics utilities for matching and
+// then dispatching by name.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceIntrinsics.h"
+#include "IceLiveness.h"
+#include "IceOperand.h"
+
+#include <utility>
+
+namespace Ice {
+
+namespace {
+
+const struct IceIntrinsicsEntry_ {
+  Intrinsics::FullIntrinsicInfo Info;
+  const char *IntrinsicName;
+} IceIntrinsicsTable[] = {
+#define AtomicCmpxchgInit(Overload, NameSuffix)                                \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::AtomicCmpxchg, true },                                     \
+      { Overload, IceType_i32, Overload, Overload, IceType_i32, IceType_i32 }, \
+      6                                                                        \
+    }                                                                          \
+    , "nacl.atomic.cmpxchg." NameSuffix                                        \
+  }
+    AtomicCmpxchgInit(IceType_i8, "i8"),
+    AtomicCmpxchgInit(IceType_i16, "i16"),
+    AtomicCmpxchgInit(IceType_i32, "i32"),
+    AtomicCmpxchgInit(IceType_i64, "i64"),
+#undef AtomicCmpxchgInit
+    { { { Intrinsics::AtomicFence, true }, { IceType_void, IceType_i32 }, 2 },
+      "nacl.atomic.fence" },
+    { { { Intrinsics::AtomicFenceAll, true }, { IceType_void }, 1 },
+      "nacl.atomic.fence.all" },
+    { { { Intrinsics::AtomicIsLockFree, true },
+        { IceType_i1, IceType_i32, IceType_i32 }, 3 },
+      "nacl.atomic.is.lock.free" },
+
+#define AtomicLoadInit(Overload, NameSuffix)                                   \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::AtomicLoad, true }                                         \
+      , { Overload, IceType_i32, IceType_i32 }, 3                              \
+    }                                                                          \
+    , "nacl.atomic.load." NameSuffix                                           \
+  }
+    AtomicLoadInit(IceType_i8, "i8"),
+    AtomicLoadInit(IceType_i16, "i16"),
+    AtomicLoadInit(IceType_i32, "i32"),
+    AtomicLoadInit(IceType_i64, "i64"),
+#undef AtomicLoadInit
+
+#define AtomicRMWInit(Overload, NameSuffix)                                    \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::AtomicRMW, true }                                          \
+      , { Overload, IceType_i32, IceType_i32, Overload, IceType_i32 }, 5       \
+    }                                                                          \
+    , "nacl.atomic.rmw." NameSuffix                                            \
+  }
+    AtomicRMWInit(IceType_i8, "i8"),
+    AtomicRMWInit(IceType_i16, "i16"),
+    AtomicRMWInit(IceType_i32, "i32"),
+    AtomicRMWInit(IceType_i64, "i64"),
+#undef AtomicRMWInit
+
+#define AtomicStoreInit(Overload, NameSuffix)                                  \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::AtomicStore, true }                                        \
+      , { IceType_void, Overload, IceType_i32, IceType_i32 }, 4                \
+    }                                                                          \
+    , "nacl.atomic.store." NameSuffix                                          \
+  }
+    AtomicStoreInit(IceType_i8, "i8"),
+    AtomicStoreInit(IceType_i16, "i16"),
+    AtomicStoreInit(IceType_i32, "i32"),
+    AtomicStoreInit(IceType_i64, "i64"),
+#undef AtomicStoreInit
+
+#define BswapInit(Overload, NameSuffix)                                        \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::Bswap, false }                                             \
+      , { Overload, Overload }, 2                                              \
+    }                                                                          \
+    , "bswap." NameSuffix                                                      \
+  }
+    BswapInit(IceType_i16, "i16"),
+    BswapInit(IceType_i32, "i32"),
+    BswapInit(IceType_i64, "i64"),
+#undef BswapInit
+
+#define CtlzInit(Overload, NameSuffix)                                         \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::Ctlz, false }                                              \
+      , { Overload, Overload, IceType_i1 }, 3                                  \
+    }                                                                          \
+    , "ctlz." NameSuffix                                                       \
+  }
+    CtlzInit(IceType_i32, "i32"),
+    CtlzInit(IceType_i64, "i64"),
+#undef CtlzInit
+
+#define CtpopInit(Overload, NameSuffix)                                        \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::Ctpop, false }                                             \
+      , { Overload, Overload }, 2                                              \
+    }                                                                          \
+    , "ctpop." NameSuffix                                                      \
+  }
+    CtpopInit(IceType_i32, "i32"),
+    CtpopInit(IceType_i64, "i64"),
+#undef CtpopInit
+
+#define CttzInit(Overload, NameSuffix)                                         \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::Cttz, false }                                              \
+      , { Overload, Overload, IceType_i1 }, 3                                  \
+    }                                                                          \
+    , "cttz." NameSuffix                                                       \
+  }
+    CttzInit(IceType_i32, "i32"),
+    CttzInit(IceType_i64, "i64"),
+#undef CttzInit
+    { { { Intrinsics::Longjmp, true },
+        { IceType_void, IceType_i32, IceType_i32 }, 3 },
+      "nacl.longjmp" },
+    { { { Intrinsics::Memcpy, true }, { IceType_void, IceType_i32, IceType_i32,
+                                        IceType_i32,  IceType_i32, IceType_i1 },
+        6 },
+      "memcpy.p0i8.p0i8.i32" },
+    { { { Intrinsics::Memmove, true },
+        { IceType_void, IceType_i32, IceType_i32,
+          IceType_i32,  IceType_i32, IceType_i1 },
+        6 },
+      "memmove.p0i8.p0i8.i32" },
+    { { { Intrinsics::Memset, true }, { IceType_void, IceType_i32, IceType_i8,
+                                        IceType_i32,  IceType_i32, IceType_i1 },
+        6 },
+      "memset.p0i8.i32" },
+    { { { Intrinsics::NaClReadTP, false }, { IceType_i32 }, 1 },
+      "nacl.read.tp" },
+    { { { Intrinsics::Setjmp, true }, { IceType_i32, IceType_i32 }, 2 },
+      "nacl.setjmp" },
+
+#define SqrtInit(Overload, NameSuffix)                                         \
+  {                                                                            \
+    {                                                                          \
+      { Intrinsics::Sqrt, false }                                              \
+      , { Overload, Overload }, 2                                              \
+    }                                                                          \
+    , "sqrt." NameSuffix                                                       \
+  }
+    SqrtInit(IceType_f32, "f32"),
+    SqrtInit(IceType_f64, "f64"),
+#undef SqrtInit
+    { { { Intrinsics::Stacksave, true }, { IceType_i32 }, 1 }, "stacksave" },
+    { { { Intrinsics::Stackrestore, true }, { IceType_void, IceType_i32 }, 2 },
+      "stackrestore" },
+    { { { Intrinsics::Trap, true }, { IceType_void }, 1 }, "trap" }
+  };
+const size_t IceIntrinsicsTableSize = llvm::array_lengthof(IceIntrinsicsTable);
+
+} // end of namespace
+
+Intrinsics::Intrinsics() {
+  for (size_t I = 0; I < IceIntrinsicsTableSize; ++I) {
+    const struct IceIntrinsicsEntry_ &Entry = IceIntrinsicsTable[I];
+    assert(Entry.Info.NumTypes <= kMaxIntrinsicParameters);
+    map.insert(std::make_pair(IceString(Entry.IntrinsicName), Entry.Info));
+  }
+}
+
+Intrinsics::~Intrinsics() {}
+
+const Intrinsics::FullIntrinsicInfo *
+Intrinsics::find(const IceString &Name) const {
+  IntrinsicMap::const_iterator it = map.find(Name);
+  if (it == map.end())
+    return NULL;
+  return &it->second;
+}
+
+} // end of namespace Ice
diff --git a/src/IceIntrinsics.h b/src/IceIntrinsics.h
new file mode 100644
index 0000000..4f9f7de
--- /dev/null
+++ b/src/IceIntrinsics.h
@@ -0,0 +1,94 @@
+//===- subzero/src/IceIntrinsics.h - List of Ice Intrinsics -----*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the kinds of intrinsics supported by PNaCl.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICEINTRINSICS_H
+#define SUBZERO_SRC_ICEINTRINSICS_H
+
+#include "IceDefs.h"
+
+namespace Ice {
+
+static const size_t kMaxIntrinsicParameters = 6;
+
+class Intrinsics {
+public:
+  Intrinsics();
+  ~Intrinsics();
+
+  // Some intrinsics allow overloading by type. This enum collapses all
+  // overloads into a single ID, but the type can still be recovered by the
+  // type of the intrinsic function call's return value and parameters.
+  enum IntrinsicID {
+    UnknownIntrinsic = 0,
+    // Arbitrary (alphabetical) order.
+    AtomicCmpxchg,
+    AtomicFence,
+    AtomicFenceAll,
+    AtomicIsLockFree,
+    AtomicLoad,
+    AtomicRMW,
+    AtomicStore,
+    Bswap,
+    Ctlz,
+    Ctpop,
+    Cttz,
+    Longjmp,
+    Memcpy,
+    Memmove,
+    Memset,
+    NaClReadTP,
+    Setjmp,
+    Sqrt,
+    Stacksave,
+    Stackrestore,
+    Trap
+  };
+
+  // Basic attributes related to each intrinsic that are relevant to
+  // code generation. We will want to have more attributes (e.g., Setjmp
+  // returns twice, which affects stack coloring) once the lowering
+  // cares about such attributes. Perhaps the attributes representation
+  // can be shared with general function calls, though most functions
+  // will be opaque.
+  struct IntrinsicInfo {
+    IntrinsicID ID : 31;
+    bool HasSideEffects : 1;
+  };
+
+  // The complete set of information about an intrinsic.
+  struct FullIntrinsicInfo {
+    struct IntrinsicInfo Info; // Information that CodeGen would care about.
+
+    // Sanity check during parsing.
+    Type Signature[kMaxIntrinsicParameters];
+    uint8_t NumTypes;
+  };
+
+  // Find the information about a given intrinsic, based on function name.
+  // The function name is expected to have the common "llvm." prefix
+  // stripped. If found, returns a pointer to a FullIntrinsicInfo entry
+  // (valid for the lifetime of the map). Otherwise returns null.
+  const FullIntrinsicInfo *find(const IceString &Name) const;
+
+private:
+  // TODO(jvoung): May want to switch to something like LLVM's StringMap.
+  typedef std::map<IceString, FullIntrinsicInfo> IntrinsicMap;
+  IntrinsicMap map;
+
+  Intrinsics(const Intrinsics &) LLVM_DELETED_FUNCTION;
+  Intrinsics &operator=(const Intrinsics &) LLVM_DELETED_FUNCTION;
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICEINTRINSICS_H
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 72a3e8c..877f717 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -116,6 +116,9 @@
   case Inst::Icmp:
     lowerIcmp(llvm::dyn_cast<InstIcmp>(Inst));
     break;
+  case Inst::IntrinsicCall:
+    lowerIntrinsicCall(llvm::dyn_cast<InstIntrinsicCall>(Inst));
+    break;
   case Inst::Load:
     lowerLoad(llvm::dyn_cast<InstLoad>(Inst));
     break;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 7f798a8..dbb9a42 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -167,6 +167,7 @@
   virtual void lowerCast(const InstCast *Inst) = 0;
   virtual void lowerFcmp(const InstFcmp *Inst) = 0;
   virtual void lowerIcmp(const InstIcmp *Inst) = 0;
+  virtual void lowerIntrinsicCall(const InstIntrinsicCall *Inst) = 0;
   virtual void lowerLoad(const InstLoad *Inst) = 0;
   virtual void lowerPhi(const InstPhi *Inst) = 0;
   virtual void lowerRet(const InstRet *Inst) = 0;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 0edcab5..449e413 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -431,6 +431,9 @@
   InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
 }
 
+// static
+Type TargetX8632::stackSlotType() { return IceType_i32; }
+
 void TargetX8632::addProlog(CfgNode *Node) {
   // If SimpleCoalescing is false, each variable without a register
   // gets its own unique stack slot, which leads to large stack
@@ -760,7 +763,7 @@
   if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
     return OperandX8632Mem::create(Func, IceType_i32, Mem->getBase(),
                                    Mem->getOffset(), Mem->getIndex(),
-                                   Mem->getShift());
+                                   Mem->getShift(), Mem->getSegmentRegister());
   }
   llvm_unreachable("Unsupported operand type");
   return NULL;
@@ -790,7 +793,8 @@
                                    SymOffset->getName());
     }
     return OperandX8632Mem::create(Func, IceType_i32, Mem->getBase(), Offset,
-                                   Mem->getIndex(), Mem->getShift());
+                                   Mem->getIndex(), Mem->getShift(),
+                                   Mem->getSegmentRegister());
   }
   llvm_unreachable("Unsupported operand type");
   return NULL;
@@ -1774,6 +1778,91 @@
   Context.insert(Label);
 }
 
+void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
+  switch (Instr->getIntrinsicInfo().ID) {
+  case Intrinsics::AtomicCmpxchg:
+  case Intrinsics::AtomicFence:
+  case Intrinsics::AtomicFenceAll:
+  case Intrinsics::AtomicIsLockFree:
+  case Intrinsics::AtomicLoad:
+  case Intrinsics::AtomicRMW:
+  case Intrinsics::AtomicStore:
+  case Intrinsics::Bswap:
+  case Intrinsics::Ctlz:
+  case Intrinsics::Ctpop:
+  case Intrinsics::Cttz:
+    Func->setError("Unhandled intrinsic");
+    return;
+  case Intrinsics::Longjmp: {
+    InstCall *Call = makeHelperCall("longjmp", NULL, 2);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    lowerCall(Call);
+    break;
+  }
+  case Intrinsics::Memcpy: {
+    // In the future, we could potentially emit an inline memcpy/memset, etc.
+    // for intrinsic calls w/ a known length.
+    InstCall *Call = makeHelperCall("memcpy", NULL, 3);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    Call->addArg(Instr->getArg(2));
+    lowerCall(Call);
+    break;
+  }
+  case Intrinsics::Memmove: {
+    InstCall *Call = makeHelperCall("memmove", NULL, 3);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(Instr->getArg(1));
+    Call->addArg(Instr->getArg(2));
+    lowerCall(Call);
+    break;
+  }
+  case Intrinsics::Memset: {
+    // The value operand needs to be extended to a stack slot size
+    // because the "push" instruction only works for a specific operand size.
+    Operand *ValOp = Instr->getArg(1);
+    assert(ValOp->getType() == IceType_i8);
+    Variable *ValExt = makeReg(stackSlotType());
+    _movzx(ValExt, ValOp);
+    InstCall *Call = makeHelperCall("memset", NULL, 3);
+    Call->addArg(Instr->getArg(0));
+    Call->addArg(ValExt);
+    Call->addArg(Instr->getArg(2));
+    lowerCall(Call);
+    break;
+  }
+  case Intrinsics::NaClReadTP: {
+    Constant *Zero = Ctx->getConstantInt(IceType_i32, 0);
+    Operand *Src = OperandX8632Mem::create(Func, IceType_i32, NULL, Zero, NULL,
+                                           0, OperandX8632Mem::SegReg_GS);
+    Variable *Dest = Instr->getDest();
+    Variable *T = NULL;
+    _mov(T, Src);
+    _mov(Dest, T);
+    break;
+  }
+  case Intrinsics::Setjmp: {
+    InstCall *Call = makeHelperCall("setjmp", Instr->getDest(), 1);
+    Call->addArg(Instr->getArg(0));
+    lowerCall(Call);
+    break;
+  }
+  case Intrinsics::Sqrt:
+  case Intrinsics::Stacksave:
+  case Intrinsics::Stackrestore:
+    Func->setError("Unhandled intrinsic");
+    return;
+  case Intrinsics::Trap:
+    _ud2();
+    break;
+  case Intrinsics::UnknownIntrinsic:
+    Func->setError("Should not be lowering UnknownIntrinsic");
+    return;
+  }
+  return;
+}
+
 namespace {
 
 bool isAdd(const Inst *Inst) {
@@ -1784,7 +1873,7 @@
   return false;
 }
 
-void computeAddressOpt(Variable *&Base, Variable *&Index, int32_t &Shift,
+void computeAddressOpt(Variable *&Base, Variable *&Index, uint16_t &Shift,
                        int32_t &Offset) {
   (void)Offset; // TODO: pattern-match for non-zero offsets.
   if (Base == NULL)
@@ -1965,14 +2054,20 @@
   Variable *Dest = Inst->getDest();
   Operand *Addr = Inst->getSrc(0);
   Variable *Index = NULL;
-  int32_t Shift = 0;
+  uint16_t Shift = 0;
   int32_t Offset = 0; // TODO: make Constant
+  // Vanilla ICE load instructions should not use the segment registers,
+  // and computeAddressOpt only works at the level of Variables and Constants,
+  // not other OperandX8632Mem, so there should be no mention of segment
+  // registers there either.
+  const OperandX8632Mem::SegmentRegisters SegmentReg =
+      OperandX8632Mem::DefaultSegment;
   Variable *Base = llvm::dyn_cast<Variable>(Addr);
   computeAddressOpt(Base, Index, Shift, Offset);
   if (Base && Addr != Base) {
     Constant *OffsetOp = Ctx->getConstantInt(IceType_i32, Offset);
     Addr = OperandX8632Mem::create(Func, Dest->getType(), Base, OffsetOp, Index,
-                                   Shift);
+                                   Shift, SegmentReg);
     Inst->setDeleted();
     Context.insert(InstLoad::create(Func, Dest, Addr));
   }
@@ -2081,14 +2176,20 @@
   Operand *Data = Inst->getData();
   Operand *Addr = Inst->getAddr();
   Variable *Index = NULL;
-  int32_t Shift = 0;
+  uint16_t Shift = 0;
   int32_t Offset = 0; // TODO: make Constant
   Variable *Base = llvm::dyn_cast<Variable>(Addr);
+  // Vanilla ICE store instructions should not use the segment registers,
+  // and computeAddressOpt only works at the level of Variables and Constants,
+  // not other OperandX8632Mem, so there should be no mention of segment
+  // registers there either.
+  const OperandX8632Mem::SegmentRegisters SegmentReg =
+      OperandX8632Mem::DefaultSegment;
   computeAddressOpt(Base, Index, Shift, Offset);
   if (Base && Addr != Base) {
     Constant *OffsetOp = Ctx->getConstantInt(IceType_i32, Offset);
     Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
-                                   Shift);
+                                   Shift, SegmentReg);
     Inst->setDeleted();
     Context.insert(InstStore::create(Func, Data, Addr));
   }
@@ -2147,9 +2248,9 @@
       RegIndex = legalizeToVar(Index, true);
     }
     if (Base != RegBase || Index != RegIndex) {
-      From =
-          OperandX8632Mem::create(Func, Mem->getType(), RegBase,
-                                  Mem->getOffset(), RegIndex, Mem->getShift());
+      From = OperandX8632Mem::create(
+          Func, Mem->getType(), RegBase, Mem->getOffset(), RegIndex,
+          Mem->getShift(), Mem->getSegmentRegister());
     }
 
     if (!(Allowed & Legal_Mem)) {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 3ca9ca3..7902136 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -83,6 +83,7 @@
   virtual void lowerCast(const InstCast *Inst);
   virtual void lowerFcmp(const InstFcmp *Inst);
   virtual void lowerIcmp(const InstIcmp *Inst);
+  virtual void lowerIntrinsicCall(const InstIntrinsicCall *Inst);
   virtual void lowerLoad(const InstLoad *Inst);
   virtual void lowerPhi(const InstPhi *Inst);
   virtual void lowerRet(const InstRet *Inst);
@@ -123,6 +124,7 @@
     InstCall *Call = InstCall::create(Func, MaxSrcs, Dest, CallTarget);
     return Call;
   }
+  static Type stackSlotType();
 
   // The following are helpers that insert lowered x86 instructions
   // with minimal syntactic overhead, so that the lowering code can
@@ -246,6 +248,7 @@
   void _ucomiss(Operand *Src0, Operand *Src1) {
     Context.insert(InstX8632Ucomiss::create(Func, Src0, Src1));
   }
+  void _ud2() { Context.insert(InstX8632UD2::create(Func)); }
   void _xor(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Xor::create(Func, Dest, Src0));
   }
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index bce8c94..2b323f7 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -528,11 +528,37 @@
     unsigned NumArgs = Inst->getNumArgOperands();
     // Note: Subzero doesn't (yet) do anything special with the Tail
     // flag in the bitcode, i.e. CallInst::isTailCall().
-    Ice::InstCall *NewInst =
-        Ice::InstCall::create(Func, NumArgs, Dest, CallTarget);
+    Ice::InstCall *NewInst = NULL;
+    const Ice::Intrinsics::FullIntrinsicInfo *Info = NULL;
+
+    if (Ice::ConstantRelocatable *Target =
+            llvm::dyn_cast<Ice::ConstantRelocatable>(CallTarget)) {
+      // Check if this direct call is to an Intrinsic (starts with "llvm.")
+      static const char LLVMPrefix[] = "llvm.";
+      const size_t LLVMPrefixLen = strlen(LLVMPrefix);
+      Ice::IceString Name = Target->getName();
+      if (Name.substr(0, LLVMPrefixLen) == LLVMPrefix) {
+        Ice::IceString NameSuffix = Name.substr(LLVMPrefixLen);
+        Info = Ctx->getIntrinsicsInfo().find(NameSuffix);
+        if (!Info) {
+          report_fatal_error(std::string("Invalid PNaCl intrinsic call: ") +
+                             LLVMObjectAsString(Inst));
+        }
+        NewInst = Ice::InstIntrinsicCall::create(Func, NumArgs, Dest,
+                                                 CallTarget, Info->Info);
+      }
+    }
+
+    // Not an intrinsic call.
+    if (NewInst == NULL) {
+      NewInst = Ice::InstCall::create(Func, NumArgs, Dest, CallTarget);
+    }
     for (unsigned i = 0; i < NumArgs; ++i) {
       NewInst->addArg(convertOperand(Inst, i));
     }
+    if (Info) {
+      validateIntrinsicCall(NewInst, Info);
+    }
     return NewInst;
   }
 
@@ -559,6 +585,31 @@
     return Node;
   }
 
+  void validateIntrinsicCall(const Ice::InstCall *Call,
+                             const Ice::Intrinsics::FullIntrinsicInfo *I) {
+    assert(I->NumTypes >= 1);
+    if (I->Signature[0] == Ice::IceType_void) {
+      if (Call->getDest() != NULL) {
+        report_fatal_error(
+            "Return value for intrinsic func w/ void return type.");
+      }
+    } else {
+      if (I->Signature[0] != Call->getDest()->getType()) {
+        report_fatal_error("Mismatched return types.");
+      }
+    }
+    if (Call->getNumArgs() + 1 != I->NumTypes) {
+      std::cerr << "Call->getNumArgs() " << (int)Call->getNumArgs()
+                << " I->NumTypes " << (int)I->NumTypes << "\n";
+      report_fatal_error("Mismatched # of args.");
+    }
+    for (size_t i = 1; i < I->NumTypes; ++i) {
+      if (Call->getArg(i - 1)->getType() != I->Signature[i]) {
+        report_fatal_error("Mismatched argument type.");
+      }
+    }
+  }
+
 private:
   // Data
   Ice::GlobalContext *Ctx;
diff --git a/szdiff.py b/szdiff.py
index 9b8d613..046982e 100755
--- a/szdiff.py
+++ b/szdiff.py
@@ -43,14 +43,23 @@
     tail_call = re.compile(' tail call ');
     trailing_comment = re.compile(';.*')
     ignore_pattern = re.compile('^ *$|^declare|^@')
+    prev_line = None
     for line in bitcode:
+        if prev_line:
+            line = prev_line + line
+            prev_line = None
         # Convert tail call into regular (non-tail) call.
         line = tail_call.sub(' call ', line)
         # Remove trailing comments and spaces.
         line = trailing_comment.sub('', line).rstrip()
         # Ignore blanks lines, forward declarations, and variable definitions.
-        if not ignore_pattern.search(line):
-            llc_out.append(line)
+        if ignore_pattern.search(line):
+            continue
+        # SZ doesn't break up long lines, but LLVM does. Normalize to SZ.
+        if line.endswith(','):
+            prev_line = line
+            continue
+        llc_out.append(line)
 
     # Compare sz_out and llc_out line by line, but ignore pairs of
     # lines where the llc line matches a certain pattern.
@@ -61,6 +70,8 @@
         '|'.join([' -[0-9]',                 # negative constants
                   ' (float|double) [-0-9]',  # FP constants
                   ' (float|double) %\w+, [-0-9]',
+                  ' @llvm\..*i\d+\*',        # intrinsic calls w/ pointer args
+                  ' i\d+\* @llvm\.',         # intrinsic calls w/ pointer ret
                   ' inttoptr ',              # inttoptr pointer types
                   ' ptrtoint ',              # ptrtoint pointer types
                   ' bitcast .*\* .* to .*\*' # bitcast pointer types
@@ -72,8 +83,8 @@
         if llc_line and ignore_pattern.search(llc_line):
             lines_diff += 1
             continue
-        if sz_line: print 'SZ>' + sz_line
-        if llc_line: print 'LL>' + llc_line
+        if sz_line: print 'SZ (%d)> %s' % (lines_total, sz_line)
+        if llc_line: print 'LL (%d)> %s' % (lines_total, llc_line)
         return_code = 1
 
     if return_code == 0:
diff --git a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
new file mode 100644
index 0000000..15f9a65
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -0,0 +1,177 @@
+; This tests the NaCl intrinsics not related to atomic operations.
+
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+declare i8* @llvm.nacl.read.tp()
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
+declare void @llvm.nacl.longjmp(i8*, i32)
+declare i32 @llvm.nacl.setjmp(i8*)
+declare void @llvm.trap()
+
+define i32 @test_nacl_read_tp() {
+entry:
+  %ptr = call i8* @llvm.nacl.read.tp()
+  %__1 = ptrtoint i8* %ptr to i32
+  ret i32 %__1
+}
+; CHECK-LABEL: test_nacl_read_tp
+; CHECK: mov e{{.*}}, dword ptr gs:[0]
+; CHECKO2REM-LABEL: test_nacl_read_tp
+; CHECKO2REM: mov e{{.*}}, dword ptr gs:[0]
+
+define i32 @test_nacl_read_tp_more_addressing() {
+entry:
+  %ptr = call i8* @llvm.nacl.read.tp()
+  %__1 = ptrtoint i8* %ptr to i32
+  %x = add i32 %__1, %__1
+  %__3 = inttoptr i32 %x to i32*
+  %v = load i32* %__3, align 1
+  %ptr2 = call i8* @llvm.nacl.read.tp()
+  %__6 = ptrtoint i8* %ptr2 to i32
+  %y = add i32 %__6, 4
+  %__8 = inttoptr i32 %y to i32*
+  store i32 %v, i32* %__8, align 1
+  ret i32 %v
+}
+; CHECK-LABEL: test_nacl_read_tp_more_addressing
+; CHECK: mov e{{.*}}, dword ptr gs:[0]
+; CHECK: mov e{{.*}}, dword ptr gs:[0]
+; CHECKO2REM-LABEL: test_nacl_read_tp_more_addressing
+; CHECKO2REM: mov e{{.*}}, dword ptr gs:[0]
+; CHECKO2REM: mov e{{.*}}, dword ptr gs:[0]
+
+define i32 @test_nacl_read_tp_dead(i32 %a) {
+entry:
+  %ptr = call i8* @llvm.nacl.read.tp()
+  ; Not actually using the result of nacl read tp call.
+  ; In O2 mode this should be DCE'ed.
+  ret i32 %a
+}
+; Consider nacl.read.tp side-effect free, so it can be eliminated.
+; CHECKO2REM-LABEL: test_nacl_read_tp_dead
+; CHECKO2REM-NOT: mov e{{.*}}, dword ptr gs:[0]
+
+define void @test_memcpy(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 %len, i32 1, i1 0)
+  ret void
+}
+; CHECK-LABEL: test_memcpy
+; CHECK: call memcpy
+
+; TODO(jvoung) -- if we want to be clever, we can do this and the memmove,
+; memset without a function call.
+define void @test_memcpy_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                       i32 8, i32 1, i1 0)
+  ret void
+}
+; CHECK-LABEL: test_memcpy_const_len_align
+; CHECK: call memcpy
+
+define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                        i32 %len, i32 1, i1 0)
+  ret void
+}
+; CHECK-LABEL: test_memmove
+; CHECK: call memmove
+
+define void @test_memmove_const_len_align(i32 %iptr_dst, i32 %iptr_src) {
+entry:
+  %dst = inttoptr i32 %iptr_dst to i8*
+  %src = inttoptr i32 %iptr_src to i8*
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src,
+                                        i32 8, i32 1, i1 0)
+  ret void
+}
+; CHECK-LABEL: test_memmove_const_len_align
+; CHECK: call memmove
+
+define void @test_memset(i32 %iptr_dst, i32 %wide_val, i32 %len) {
+entry:
+  %val = trunc i32 %wide_val to i8
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val,
+                                  i32 %len, i32 1, i1 0)
+  ret void
+}
+; CHECK-LABEL: test_memset
+; CHECK: call memset
+
+define void @test_memset_const_len_align(i32 %iptr_dst, i32 %wide_val) {
+entry:
+  %val = trunc i32 %wide_val to i8
+  %dst = inttoptr i32 %iptr_dst to i8*
+  call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val,
+                                  i32 8, i32 1, i1 0)
+  ret void
+}
+; CHECK-LABEL: test_memset_const_len_align
+; CHECK: call memset
+
+define i32 @test_setjmplongjmp(i32 %iptr_env) {
+entry:
+  %env = inttoptr i32 %iptr_env to i8*
+  %i = call i32 @llvm.nacl.setjmp(i8* %env)
+  %r1 = icmp eq i32 %i, 0
+  br i1 %r1, label %Zero, label %NonZero
+Zero:
+  ; Redundant inttoptr, to make --pnacl cast-eliding/re-insertion happy.
+  %env2 = inttoptr i32 %iptr_env to i8*
+  call void @llvm.nacl.longjmp(i8* %env2, i32 1)
+  ret i32 0
+NonZero:
+  ret i32 1
+}
+; CHECK-LABEL: test_setjmplongjmp
+; CHECK: call setjmp
+; CHECK: call longjmp
+; CHECKO2REM-LABEL: test_setjmplongjmp
+; CHECKO2REM: call setjmp
+; CHECKO2REM: call longjmp
+
+define i32 @test_setjmp_unused(i32 %iptr_env, i32 %i_other) {
+entry:
+  %env = inttoptr i32 %iptr_env to i8*
+  %i = call i32 @llvm.nacl.setjmp(i8* %env)
+  ret i32 %i_other
+}
+; Don't consider setjmp side-effect free, so it's not eliminated if
+; result unused.
+; CHECKO2REM-LABEL: test_setjmp_unused
+; CHECKO2REM: call setjmp
+
+define i32 @test_trap(i32 %br) {
+entry:
+  %r1 = icmp eq i32 %br, 0
+  br i1 %r1, label %Zero, label %NonZero
+Zero:
+  call void @llvm.trap()
+  unreachable
+NonZero:
+  ret i32 1
+}
+; CHECK-LABEL: test_trap
+; CHECK: ud2
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ