Lower insertelement and extractelement.

Use instructions that do the operations in registers and that are
available in SSE2. Spill to memory to perform the operation in the
absence of any other reasonable options (v16i8 and v16i1).

Unfortunately, SSE2 has no single natural class of instructions to
which insertelement / extractelement can be lowered for all vector
types (though pinsr[bwd] and pextr[bwd] are available in SSE4.1). In
some cases there is a large number of possible lowerings, and I have
not yet looked into which choices are best, beyond using LLVM output
as a guide.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/401523003
diff --git a/crosstest/runtests.sh b/crosstest/runtests.sh
index 913cc5b..2073fb5 100755
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -82,6 +82,13 @@
        --driver=test_sync_atomic_main.cpp \
        --output=test_sync_atomic_O${optlevel}
 
+    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
+        --dir="${OUTDIR}" \
+        --llvm-bin-path="${LLVM_BIN_PATH}" \
+        --test=test_vector_ops.ll \
+        --driver=test_vector_ops_main.cpp \
+        --output=test_vector_ops_O${optlevel}
+
 done
 
 for optlevel in ${OPTLEVELS} ; do
@@ -94,4 +101,5 @@
     "${OUTDIR}"/test_global_O${optlevel}
     "${OUTDIR}"/test_icmp_O${optlevel}
     "${OUTDIR}"/test_sync_atomic_O${optlevel}
+    "${OUTDIR}"/test_vector_ops_O${optlevel}
 done
diff --git a/crosstest/test_vector_ops.def b/crosstest/test_vector_ops.def
new file mode 100644
index 0000000..422ed54
--- /dev/null
+++ b/crosstest/test_vector_ops.def
@@ -0,0 +1,27 @@
+/* X-macro tables of the vector types covered by the test_vector_ops
+   crosstest. A user defines X(...) with the arity documented in each
+   table, expands the table, and then #undefs X. */
+
+#ifndef TEST_VECTOR_OPS_DEF
+#define TEST_VECTOR_OPS_DEF
+
+/* Vector types that have a native element type. */
+#define VECTOR_TYPE_TABLE                 \
+/* typename, element type,  cast type */  \
+X(v16si8,          int8_t,  int64_t)      \
+X(v16ui8,         uint8_t,  int64_t)      \
+X(v8si16,         int16_t,  int64_t)      \
+X(v8ui16,        uint16_t,  int64_t)      \
+X(v4si32,         int32_t,  int64_t)      \
+X(v4ui32,        uint32_t,  int64_t)      \
+X(v4f32,            float,    float)      \
+
+/* i1 vector types, each paired with the vector type it is expanded to
+   and its number of elements. */
+#define I1_VECTOR_TYPE_TABLE              \
+/* typename, expanded type, # elements */ \
+X(v4i1,             v4ui32,          4)   \
+X(v8i1,             v8ui16,          8)   \
+X(v16i1,            v16ui8,         16)   \
+
+#endif /* TEST_VECTOR_OPS_DEF */
diff --git a/crosstest/test_vector_ops.ll b/crosstest/test_vector_ops.ll
new file mode 100644
index 0000000..07011b2
--- /dev/null
+++ b/crosstest/test_vector_ops.ll
@@ -0,0 +1,717 @@
+target triple = "i686-pc-linux-gnu"
+
+define <4 x float> @insertelement_v4f32(<4 x float> %vec, float %elt, i32 %idx) {
+  switch i32 %idx, label %abort [ ; one block per lane, so every insertelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  ]
+idx0:
+  %res0 = insertelement <4 x float> %vec, float %elt, i32 0
+  ret <4 x float> %res0
+idx1:
+  %res1 = insertelement <4 x float> %vec, float %elt, i32 1
+  ret <4 x float> %res1
+idx2:
+  %res2 = insertelement <4 x float> %vec, float %elt, i32 2
+  ret <4 x float> %res2
+idx3:
+  %res3 = insertelement <4 x float> %vec, float %elt, i32 3
+  ret <4 x float> %res3
+abort: ; taken when %idx is not in 0..3
+  unreachable
+}
+
+define <4 x i32> @insertelement_v4i1(<4 x i32> %arg_vec, i64 %elt_arg, i32 %idx) {
+  %vec = trunc <4 x i32> %arg_vec to <4 x i1> ; the vector arrives widened to i32 lanes; narrow it to the i1 type under test
+  %elt = trunc i64 %elt_arg to i1 ; the element arrives as i64; narrow it to i1
+  switch i32 %idx, label %abort [ ; one block per lane, so every insertelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  ]
+idx0:
+  %res0_i1 = insertelement <4 x i1> %vec, i1 %elt, i32 0
+  %res0 = zext <4 x i1> %res0_i1 to <4 x i32> ; widen the result back to i32 lanes (each 0 or 1) for the return
+  ret <4 x i32> %res0
+idx1:
+  %res1_i1 = insertelement <4 x i1> %vec, i1 %elt, i32 1
+  %res1 = zext <4 x i1> %res1_i1 to <4 x i32>
+  ret <4 x i32> %res1
+idx2:
+  %res2_i1 = insertelement <4 x i1> %vec, i1 %elt, i32 2
+  %res2 = zext <4 x i1> %res2_i1 to <4 x i32>
+  ret <4 x i32> %res2
+idx3:
+  %res3_i1 = insertelement <4 x i1> %vec, i1 %elt, i32 3
+  %res3 = zext <4 x i1> %res3_i1 to <4 x i32>
+  ret <4 x i32> %res3
+abort: ; taken when %idx is not in 0..3
+  unreachable
+}
+
+define <8 x i16> @insertelement_v8i1(<8 x i16> %arg_vec, i64 %elt_arg, i32 %idx) {
+  %vec = trunc <8 x i16> %arg_vec to <8 x i1> ; the vector arrives widened to i16 lanes; narrow it to the i1 type under test
+  %elt = trunc i64 %elt_arg to i1 ; the element arrives as i64; narrow it to i1
+  switch i32 %idx, label %abort [ ; one block per lane, so every insertelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  ]
+idx0:
+  %res0_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 0
+  %res0 = zext <8 x i1> %res0_i1 to <8 x i16> ; widen the result back to i16 lanes (each 0 or 1) for the return
+  ret <8 x i16> %res0
+idx1:
+  %res1_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 1
+  %res1 = zext <8 x i1> %res1_i1 to <8 x i16>
+  ret <8 x i16> %res1
+idx2:
+  %res2_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 2
+  %res2 = zext <8 x i1> %res2_i1 to <8 x i16>
+  ret <8 x i16> %res2
+idx3:
+  %res3_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 3
+  %res3 = zext <8 x i1> %res3_i1 to <8 x i16>
+  ret <8 x i16> %res3
+idx4:
+  %res4_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 4
+  %res4 = zext <8 x i1> %res4_i1 to <8 x i16>
+  ret <8 x i16> %res4
+idx5:
+  %res5_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 5
+  %res5 = zext <8 x i1> %res5_i1 to <8 x i16>
+  ret <8 x i16> %res5
+idx6:
+  %res6_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 6
+  %res6 = zext <8 x i1> %res6_i1 to <8 x i16>
+  ret <8 x i16> %res6
+idx7:
+  %res7_i1 = insertelement <8 x i1> %vec, i1 %elt, i32 7
+  %res7 = zext <8 x i1> %res7_i1 to <8 x i16>
+  ret <8 x i16> %res7
+abort: ; taken when %idx is not in 0..7
+  unreachable
+}
+
+define <16 x i8> @insertelement_v16i1(<16 x i8> %arg_vec, i64 %elt_arg, i32 %idx) {
+  %vec = trunc <16 x i8> %arg_vec to <16 x i1> ; the vector arrives widened to i8 lanes; narrow it to the i1 type under test
+  %elt = trunc i64 %elt_arg to i1 ; the element arrives as i64; narrow it to i1
+  switch i32 %idx, label %abort [ ; one block per lane, so every insertelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  i32 8, label %idx8
+  i32 9, label %idx9
+  i32 10, label %idx10
+  i32 11, label %idx11
+  i32 12, label %idx12
+  i32 13, label %idx13
+  i32 14, label %idx14
+  i32 15, label %idx15
+  ]
+idx0:
+  %res0_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 0
+  %res0 = zext <16 x i1> %res0_i1 to <16 x i8> ; widen the result back to i8 lanes (each 0 or 1) for the return
+  ret <16 x i8> %res0
+idx1:
+  %res1_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 1
+  %res1 = zext <16 x i1> %res1_i1 to <16 x i8>
+  ret <16 x i8> %res1
+idx2:
+  %res2_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 2
+  %res2 = zext <16 x i1> %res2_i1 to <16 x i8>
+  ret <16 x i8> %res2
+idx3:
+  %res3_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 3
+  %res3 = zext <16 x i1> %res3_i1 to <16 x i8>
+  ret <16 x i8> %res3
+idx4:
+  %res4_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 4
+  %res4 = zext <16 x i1> %res4_i1 to <16 x i8>
+  ret <16 x i8> %res4
+idx5:
+  %res5_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 5
+  %res5 = zext <16 x i1> %res5_i1 to <16 x i8>
+  ret <16 x i8> %res5
+idx6:
+  %res6_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 6
+  %res6 = zext <16 x i1> %res6_i1 to <16 x i8>
+  ret <16 x i8> %res6
+idx7:
+  %res7_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 7
+  %res7 = zext <16 x i1> %res7_i1 to <16 x i8>
+  ret <16 x i8> %res7
+idx8:
+  %res8_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 8
+  %res8 = zext <16 x i1> %res8_i1 to <16 x i8>
+  ret <16 x i8> %res8
+idx9:
+  %res9_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 9
+  %res9 = zext <16 x i1> %res9_i1 to <16 x i8>
+  ret <16 x i8> %res9
+idx10:
+  %res10_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 10
+  %res10 = zext <16 x i1> %res10_i1 to <16 x i8>
+  ret <16 x i8> %res10
+idx11:
+  %res11_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 11
+  %res11 = zext <16 x i1> %res11_i1 to <16 x i8>
+  ret <16 x i8> %res11
+idx12:
+  %res12_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 12
+  %res12 = zext <16 x i1> %res12_i1 to <16 x i8>
+  ret <16 x i8> %res12
+idx13:
+  %res13_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 13
+  %res13 = zext <16 x i1> %res13_i1 to <16 x i8>
+  ret <16 x i8> %res13
+idx14:
+  %res14_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 14
+  %res14 = zext <16 x i1> %res14_i1 to <16 x i8>
+  ret <16 x i8> %res14
+idx15:
+  %res15_i1 = insertelement <16 x i1> %vec, i1 %elt, i32 15
+  %res15 = zext <16 x i1> %res15_i1 to <16 x i8>
+  ret <16 x i8> %res15
+abort: ; taken when %idx is not in 0..15
+  unreachable
+}
+
+define <4 x i32> @insertelement_v4si32(<4 x i32> %vec, i64 %elt_arg, i32 %idx) {
+entry:
+  %elt = trunc i64 %elt_arg to i32 ; the element arrives as i64; narrow it to the lane type
+  switch i32 %idx, label %abort [ ; one block per lane, so every insertelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  ]
+idx0:
+  %res0 = insertelement <4 x i32> %vec, i32 %elt, i32 0
+  ret <4 x i32> %res0
+idx1:
+  %res1 = insertelement <4 x i32> %vec, i32 %elt, i32 1
+  ret <4 x i32> %res1
+idx2:
+  %res2 = insertelement <4 x i32> %vec, i32 %elt, i32 2
+  ret <4 x i32> %res2
+idx3:
+  %res3 = insertelement <4 x i32> %vec, i32 %elt, i32 3
+  ret <4 x i32> %res3
+abort: ; taken when %idx is not in 0..3
+  unreachable
+}
+
+define <4 x i32> @insertelement_v4ui32(<4 x i32> %vec, i64 %elt_arg, i32 %idx) {
+entry:
+  %res = call <4 x i32> @insertelement_v4si32(<4 x i32> %vec, i64 %elt_arg, i32 %idx) ; the unsigned variant forwards to the signed one
+  ret <4 x i32> %res
+}
+
+define <8 x i16> @insertelement_v8si16(<8 x i16> %vec, i64 %elt_arg, i32 %idx) {
+entry:
+  %elt = trunc i64 %elt_arg to i16 ; the element arrives as i64; narrow it to the lane type
+  switch i32 %idx, label %abort [ ; one block per lane, so every insertelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  ]
+idx0:
+  %res0 = insertelement <8 x i16> %vec, i16 %elt, i32 0
+  ret <8 x i16> %res0
+idx1:
+  %res1 = insertelement <8 x i16> %vec, i16 %elt, i32 1
+  ret <8 x i16> %res1
+idx2:
+  %res2 = insertelement <8 x i16> %vec, i16 %elt, i32 2
+  ret <8 x i16> %res2
+idx3:
+  %res3 = insertelement <8 x i16> %vec, i16 %elt, i32 3
+  ret <8 x i16> %res3
+idx4:
+  %res4 = insertelement <8 x i16> %vec, i16 %elt, i32 4
+  ret <8 x i16> %res4
+idx5:
+  %res5 = insertelement <8 x i16> %vec, i16 %elt, i32 5
+  ret <8 x i16> %res5
+idx6:
+  %res6 = insertelement <8 x i16> %vec, i16 %elt, i32 6
+  ret <8 x i16> %res6
+idx7:
+  %res7 = insertelement <8 x i16> %vec, i16 %elt, i32 7
+  ret <8 x i16> %res7
+abort: ; taken when %idx is not in 0..7
+  unreachable
+}
+
+define <8 x i16> @insertelement_v8ui16(<8 x i16> %vec, i64 %elt_arg, i32 %idx) {
+entry:
+  %res = call <8 x i16> @insertelement_v8si16(<8 x i16> %vec, i64 %elt_arg, i32 %idx) ; the unsigned variant forwards to the signed one
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @insertelement_v16si8(<16 x i8> %vec, i64 %elt_arg, i32 %idx) {
+entry:
+  %elt = trunc i64 %elt_arg to i8 ; the element arrives as i64; narrow it to the lane type
+  switch i32 %idx, label %abort [ ; one block per lane, so every insertelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  i32 8, label %idx8
+  i32 9, label %idx9
+  i32 10, label %idx10
+  i32 11, label %idx11
+  i32 12, label %idx12
+  i32 13, label %idx13
+  i32 14, label %idx14
+  i32 15, label %idx15
+  ]
+idx0:
+  %res0 = insertelement <16 x i8> %vec, i8 %elt, i32 0
+  ret <16 x i8> %res0
+idx1:
+  %res1 = insertelement <16 x i8> %vec, i8 %elt, i32 1
+  ret <16 x i8> %res1
+idx2:
+  %res2 = insertelement <16 x i8> %vec, i8 %elt, i32 2
+  ret <16 x i8> %res2
+idx3:
+  %res3 = insertelement <16 x i8> %vec, i8 %elt, i32 3
+  ret <16 x i8> %res3
+idx4:
+  %res4 = insertelement <16 x i8> %vec, i8 %elt, i32 4
+  ret <16 x i8> %res4
+idx5:
+  %res5 = insertelement <16 x i8> %vec, i8 %elt, i32 5
+  ret <16 x i8> %res5
+idx6:
+  %res6 = insertelement <16 x i8> %vec, i8 %elt, i32 6
+  ret <16 x i8> %res6
+idx7:
+  %res7 = insertelement <16 x i8> %vec, i8 %elt, i32 7
+  ret <16 x i8> %res7
+idx8:
+  %res8 = insertelement <16 x i8> %vec, i8 %elt, i32 8
+  ret <16 x i8> %res8
+idx9:
+  %res9 = insertelement <16 x i8> %vec, i8 %elt, i32 9
+  ret <16 x i8> %res9
+idx10:
+  %res10 = insertelement <16 x i8> %vec, i8 %elt, i32 10
+  ret <16 x i8> %res10
+idx11:
+  %res11 = insertelement <16 x i8> %vec, i8 %elt, i32 11
+  ret <16 x i8> %res11
+idx12:
+  %res12 = insertelement <16 x i8> %vec, i8 %elt, i32 12
+  ret <16 x i8> %res12
+idx13:
+  %res13 = insertelement <16 x i8> %vec, i8 %elt, i32 13
+  ret <16 x i8> %res13
+idx14:
+  %res14 = insertelement <16 x i8> %vec, i8 %elt, i32 14
+  ret <16 x i8> %res14
+idx15:
+  %res15 = insertelement <16 x i8> %vec, i8 %elt, i32 15
+  ret <16 x i8> %res15
+abort: ; taken when %idx is not in 0..15
+  unreachable
+}
+
+define <16 x i8> @insertelement_v16ui8(<16 x i8> %vec, i64 %elt_arg, i32 %idx) {
+entry:
+  %res = call <16 x i8> @insertelement_v16si8(<16 x i8> %vec, i64 %elt_arg, i32 %idx) ; the unsigned variant forwards to the signed one
+  ret <16 x i8> %res
+}
+
+define float @extractelement_v4f32(<4 x float> %vec, i32 %idx) {
+  switch i32 %idx, label %abort [ ; one block per lane, so every extractelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  ]
+idx0:
+  %res0 = extractelement <4 x float> %vec, i32 0
+  ret float %res0
+idx1:
+  %res1 = extractelement <4 x float> %vec, i32 1
+  ret float %res1
+idx2:
+  %res2 = extractelement <4 x float> %vec, i32 2
+  ret float %res2
+idx3:
+  %res3 = extractelement <4 x float> %vec, i32 3
+  ret float %res3
+abort: ; taken when %idx is not in 0..3
+  unreachable
+}
+
+define i64 @extractelement_v4i1(<4 x i32> %arg_vec, i32 %idx) {
+  %vec = trunc <4 x i32> %arg_vec to <4 x i1> ; the vector arrives widened to i32 lanes; narrow it to the i1 type under test
+  switch i32 %idx, label %abort [ ; one block per lane, so every extractelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  ]
+idx0:
+  %res0_i1 = extractelement <4 x i1> %vec, i32 0
+  %res0 = zext i1 %res0_i1 to i64 ; the extracted bit is returned as an i64 holding 0 or 1
+  ret i64 %res0
+idx1:
+  %res1_i1 = extractelement <4 x i1> %vec, i32 1
+  %res1 = zext i1 %res1_i1 to i64
+  ret i64 %res1
+idx2:
+  %res2_i1 = extractelement <4 x i1> %vec, i32 2
+  %res2 = zext i1 %res2_i1 to i64
+  ret i64 %res2
+idx3:
+  %res3_i1 = extractelement <4 x i1> %vec, i32 3
+  %res3 = zext i1 %res3_i1 to i64
+  ret i64 %res3
+abort: ; taken when %idx is not in 0..3
+  unreachable
+}
+
+define i64 @extractelement_v8i1(<8 x i16> %arg_vec, i32 %idx) {
+  %vec = trunc <8 x i16> %arg_vec to <8 x i1> ; the vector arrives widened to i16 lanes; narrow it to the i1 type under test
+  switch i32 %idx, label %abort [ ; one block per lane, so every extractelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  ]
+idx0:
+  %res0_i1 = extractelement <8 x i1> %vec, i32 0
+  %res0 = zext i1 %res0_i1 to i64 ; the extracted bit is returned as an i64 holding 0 or 1
+  ret i64 %res0
+idx1:
+  %res1_i1 = extractelement <8 x i1> %vec, i32 1
+  %res1 = zext i1 %res1_i1 to i64
+  ret i64 %res1
+idx2:
+  %res2_i1 = extractelement <8 x i1> %vec, i32 2
+  %res2 = zext i1 %res2_i1 to i64
+  ret i64 %res2
+idx3:
+  %res3_i1 = extractelement <8 x i1> %vec, i32 3
+  %res3 = zext i1 %res3_i1 to i64
+  ret i64 %res3
+idx4:
+  %res4_i1 = extractelement <8 x i1> %vec, i32 4
+  %res4 = zext i1 %res4_i1 to i64
+  ret i64 %res4
+idx5:
+  %res5_i1 = extractelement <8 x i1> %vec, i32 5
+  %res5 = zext i1 %res5_i1 to i64
+  ret i64 %res5
+idx6:
+  %res6_i1 = extractelement <8 x i1> %vec, i32 6
+  %res6 = zext i1 %res6_i1 to i64
+  ret i64 %res6
+idx7:
+  %res7_i1 = extractelement <8 x i1> %vec, i32 7
+  %res7 = zext i1 %res7_i1 to i64
+  ret i64 %res7
+abort: ; taken when %idx is not in 0..7
+  unreachable
+}
+
+define i64 @extractelement_v16i1(<16 x i8> %arg_vec, i32 %idx) {
+  %vec = trunc <16 x i8> %arg_vec to <16 x i1> ; the vector arrives widened to i8 lanes; narrow it to the i1 type under test
+  switch i32 %idx, label %abort [ ; one block per lane, so every extractelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  i32 8, label %idx8
+  i32 9, label %idx9
+  i32 10, label %idx10
+  i32 11, label %idx11
+  i32 12, label %idx12
+  i32 13, label %idx13
+  i32 14, label %idx14
+  i32 15, label %idx15
+  ]
+idx0:
+  %res0_i1 = extractelement <16 x i1> %vec, i32 0
+  %res0 = zext i1 %res0_i1 to i64 ; the extracted bit is returned as an i64 holding 0 or 1
+  ret i64 %res0
+idx1:
+  %res1_i1 = extractelement <16 x i1> %vec, i32 1
+  %res1 = zext i1 %res1_i1 to i64
+  ret i64 %res1
+idx2:
+  %res2_i1 = extractelement <16 x i1> %vec, i32 2
+  %res2 = zext i1 %res2_i1 to i64
+  ret i64 %res2
+idx3:
+  %res3_i1 = extractelement <16 x i1> %vec, i32 3
+  %res3 = zext i1 %res3_i1 to i64
+  ret i64 %res3
+idx4:
+  %res4_i1 = extractelement <16 x i1> %vec, i32 4
+  %res4 = zext i1 %res4_i1 to i64
+  ret i64 %res4
+idx5:
+  %res5_i1 = extractelement <16 x i1> %vec, i32 5
+  %res5 = zext i1 %res5_i1 to i64
+  ret i64 %res5
+idx6:
+  %res6_i1 = extractelement <16 x i1> %vec, i32 6
+  %res6 = zext i1 %res6_i1 to i64
+  ret i64 %res6
+idx7:
+  %res7_i1 = extractelement <16 x i1> %vec, i32 7
+  %res7 = zext i1 %res7_i1 to i64
+  ret i64 %res7
+idx8:
+  %res8_i1 = extractelement <16 x i1> %vec, i32 8
+  %res8 = zext i1 %res8_i1 to i64
+  ret i64 %res8
+idx9:
+  %res9_i1 = extractelement <16 x i1> %vec, i32 9
+  %res9 = zext i1 %res9_i1 to i64
+  ret i64 %res9
+idx10:
+  %res10_i1 = extractelement <16 x i1> %vec, i32 10
+  %res10 = zext i1 %res10_i1 to i64
+  ret i64 %res10
+idx11:
+  %res11_i1 = extractelement <16 x i1> %vec, i32 11
+  %res11 = zext i1 %res11_i1 to i64
+  ret i64 %res11
+idx12:
+  %res12_i1 = extractelement <16 x i1> %vec, i32 12
+  %res12 = zext i1 %res12_i1 to i64
+  ret i64 %res12
+idx13:
+  %res13_i1 = extractelement <16 x i1> %vec, i32 13
+  %res13 = zext i1 %res13_i1 to i64
+  ret i64 %res13
+idx14:
+  %res14_i1 = extractelement <16 x i1> %vec, i32 14
+  %res14 = zext i1 %res14_i1 to i64
+  ret i64 %res14
+idx15:
+  %res15_i1 = extractelement <16 x i1> %vec, i32 15
+  %res15 = zext i1 %res15_i1 to i64
+  ret i64 %res15
+abort: ; taken when %idx is not in 0..15
+  unreachable
+}
+
+define i64 @extractelement_v4si32(<4 x i32> %vec, i32 %idx) {
+entry:
+  switch i32 %idx, label %abort [ ; one block per lane, so every extractelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  ]
+idx0:
+  %res0_i32 = extractelement <4 x i32> %vec, i32 0
+  %res0 = zext i32 %res0_i32 to i64 ; the lane is zero-extended (not sign-extended) to the i64 return type
+  ret i64 %res0
+idx1:
+  %res1_i32 = extractelement <4 x i32> %vec, i32 1
+  %res1 = zext i32 %res1_i32 to i64
+  ret i64 %res1
+idx2:
+  %res2_i32 = extractelement <4 x i32> %vec, i32 2
+  %res2 = zext i32 %res2_i32 to i64
+  ret i64 %res2
+idx3:
+  %res3_i32 = extractelement <4 x i32> %vec, i32 3
+  %res3 = zext i32 %res3_i32 to i64
+  ret i64 %res3
+abort: ; taken when %idx is not in 0..3
+  unreachable
+}
+
+define i64 @extractelement_v4ui32(<4 x i32> %vec, i32 %idx) {
+entry:
+  %res = call i64 @extractelement_v4si32(<4 x i32> %vec, i32 %idx) ; the unsigned variant forwards to the signed one
+  ret i64 %res
+}
+
+define i64 @extractelement_v8si16(<8 x i16> %vec, i32 %idx) {
+entry:
+  switch i32 %idx, label %abort [ ; one block per lane, so every extractelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  ]
+idx0:
+  %res0_i16 = extractelement <8 x i16> %vec, i32 0
+  %res0 = zext i16 %res0_i16 to i64 ; the lane is zero-extended (not sign-extended) to the i64 return type
+  ret i64 %res0
+idx1:
+  %res1_i16 = extractelement <8 x i16> %vec, i32 1
+  %res1 = zext i16 %res1_i16 to i64
+  ret i64 %res1
+idx2:
+  %res2_i16 = extractelement <8 x i16> %vec, i32 2
+  %res2 = zext i16 %res2_i16 to i64
+  ret i64 %res2
+idx3:
+  %res3_i16 = extractelement <8 x i16> %vec, i32 3
+  %res3 = zext i16 %res3_i16 to i64
+  ret i64 %res3
+idx4:
+  %res4_i16 = extractelement <8 x i16> %vec, i32 4
+  %res4 = zext i16 %res4_i16 to i64
+  ret i64 %res4
+idx5:
+  %res5_i16 = extractelement <8 x i16> %vec, i32 5
+  %res5 = zext i16 %res5_i16 to i64
+  ret i64 %res5
+idx6:
+  %res6_i16 = extractelement <8 x i16> %vec, i32 6
+  %res6 = zext i16 %res6_i16 to i64
+  ret i64 %res6
+idx7:
+  %res7_i16 = extractelement <8 x i16> %vec, i32 7
+  %res7 = zext i16 %res7_i16 to i64
+  ret i64 %res7
+abort: ; taken when %idx is not in 0..7
+  unreachable
+}
+
+define i64 @extractelement_v8ui16(<8 x i16> %vec, i32 %idx) {
+entry:
+  %res = call i64 @extractelement_v8si16(<8 x i16> %vec, i32 %idx) ; the unsigned variant forwards to the signed one
+  ret i64 %res
+}
+
+define i64 @extractelement_v16si8(<16 x i8> %vec, i32 %idx) {
+entry:
+  switch i32 %idx, label %abort [ ; one block per lane, so every extractelement below has a constant index
+  i32 0, label %idx0
+  i32 1, label %idx1
+  i32 2, label %idx2
+  i32 3, label %idx3
+  i32 4, label %idx4
+  i32 5, label %idx5
+  i32 6, label %idx6
+  i32 7, label %idx7
+  i32 8, label %idx8
+  i32 9, label %idx9
+  i32 10, label %idx10
+  i32 11, label %idx11
+  i32 12, label %idx12
+  i32 13, label %idx13
+  i32 14, label %idx14
+  i32 15, label %idx15
+  ]
+idx0:
+  %res0_i8 = extractelement <16 x i8> %vec, i32 0
+  %res0 = zext i8 %res0_i8 to i64 ; the lane is zero-extended (not sign-extended) to the i64 return type
+  ret i64 %res0
+idx1:
+  %res1_i8 = extractelement <16 x i8> %vec, i32 1
+  %res1 = zext i8 %res1_i8 to i64
+  ret i64 %res1
+idx2:
+  %res2_i8 = extractelement <16 x i8> %vec, i32 2
+  %res2 = zext i8 %res2_i8 to i64
+  ret i64 %res2
+idx3:
+  %res3_i8 = extractelement <16 x i8> %vec, i32 3
+  %res3 = zext i8 %res3_i8 to i64
+  ret i64 %res3
+idx4:
+  %res4_i8 = extractelement <16 x i8> %vec, i32 4
+  %res4 = zext i8 %res4_i8 to i64
+  ret i64 %res4
+idx5:
+  %res5_i8 = extractelement <16 x i8> %vec, i32 5
+  %res5 = zext i8 %res5_i8 to i64
+  ret i64 %res5
+idx6:
+  %res6_i8 = extractelement <16 x i8> %vec, i32 6
+  %res6 = zext i8 %res6_i8 to i64
+  ret i64 %res6
+idx7:
+  %res7_i8 = extractelement <16 x i8> %vec, i32 7
+  %res7 = zext i8 %res7_i8 to i64
+  ret i64 %res7
+idx8:
+  %res8_i8 = extractelement <16 x i8> %vec, i32 8
+  %res8 = zext i8 %res8_i8 to i64
+  ret i64 %res8
+idx9:
+  %res9_i8 = extractelement <16 x i8> %vec, i32 9
+  %res9 = zext i8 %res9_i8 to i64
+  ret i64 %res9
+idx10:
+  %res10_i8 = extractelement <16 x i8> %vec, i32 10
+  %res10 = zext i8 %res10_i8 to i64
+  ret i64 %res10
+idx11:
+  %res11_i8 = extractelement <16 x i8> %vec, i32 11
+  %res11 = zext i8 %res11_i8 to i64
+  ret i64 %res11
+idx12:
+  %res12_i8 = extractelement <16 x i8> %vec, i32 12
+  %res12 = zext i8 %res12_i8 to i64
+  ret i64 %res12
+idx13:
+  %res13_i8 = extractelement <16 x i8> %vec, i32 13
+  %res13 = zext i8 %res13_i8 to i64
+  ret i64 %res13
+idx14:
+  %res14_i8 = extractelement <16 x i8> %vec, i32 14
+  %res14 = zext i8 %res14_i8 to i64
+  ret i64 %res14
+idx15:
+  %res15_i8 = extractelement <16 x i8> %vec, i32 15
+  %res15 = zext i8 %res15_i8 to i64
+  ret i64 %res15
+abort: ; taken when %idx is not in 0..15
+  unreachable
+}
+
+define i64 @extractelement_v16ui8(<16 x i8> %vec, i32 %idx) {
+entry:
+  %res = call i64 @extractelement_v16si8(<16 x i8> %vec, i32 %idx) ; the unsigned variant forwards to the signed one
+  ret i64 %res
+}
diff --git a/crosstest/test_vector_ops_main.cpp b/crosstest/test_vector_ops_main.cpp
new file mode 100644
index 0000000..266450d
--- /dev/null
+++ b/crosstest/test_vector_ops_main.cpp
@@ -0,0 +1,225 @@
+/* crosstest.py --test=test_vector_ops.ll  --driver=test_vector_ops_main.cpp \
+   --prefix=Subzero_ --output=test_vector_ops */
+
+#include <stdint.h>
+#include <cstring>
+#include <sstream>
+#include <iostream>
+#include <limits>
+#include <utility>
+#include <vector>
+#include <stdlib.h>
+
+#include "test_vector_ops.def"
+
+// Typedefs of the native C++ SIMD vector types, one 16-byte vector per table row.
+#define X(ty, elty, castty) typedef elty ty __attribute__((vector_size(16)));
+VECTOR_TYPE_TABLE
+#undef X
+
+// i1 vector types are not native C++ SIMD vector types. Instead, the
+// test code expands them into native 128-bit SIMD vector types with
+// the appropriate number of elements. VectorOps<> still needs a
+// distinct type name for each i1 vector type, which these forward
+// class declarations provide.
+#define X(ty, expandedty, num_elements)                                        \
+  class ty;
+I1_VECTOR_TYPE_TABLE
+#undef X
+
+template <typename T> struct VectorOps; // only declared here; specialized per type by DECLARE_VECTOR_OPS
+
+#define DECLARE_VECTOR_OPS(TYNAME, TY, ELTY, CASTTY, NUM_ELEMENTS)             \
+  template <> struct VectorOps<TYNAME> { /* traits for one vector type */      \
+    typedef TY Ty;               /* vector type passed to the functions */     \
+    typedef ELTY ElementTy;      /* type used to generate element values */    \
+    typedef CASTTY CastTy;       /* scalar type in the function signatures */  \
+    static TY (*insertelement)(TY, CASTTY, int32_t);                           \
+    static TY (*Subzero_insertelement)(TY, CASTTY, int32_t);                   \
+    static CASTTY (*extractelement)(TY, int32_t);                              \
+    static CASTTY (*Subzero_extractelement)(TY, int32_t);                      \
+    static size_t NumElements;                                                 \
+    static const char *TypeName;                                               \
+  };                                                                           \
+  extern "C" TY insertelement_##TYNAME(TY, CASTTY, int32_t);                   \
+  extern "C" TY Subzero_insertelement_##TYNAME(TY, CASTTY, int32_t);           \
+  extern "C" CASTTY extractelement_##TYNAME(TY, int32_t);                      \
+  extern "C" CASTTY Subzero_extractelement_##TYNAME(TY, int32_t);              \
+  size_t VectorOps<TYNAME>::NumElements = NUM_ELEMENTS;                        \
+  TY (*VectorOps<TYNAME>::insertelement)(TY, CASTTY, int32_t) =                \
+      &insertelement_##TYNAME;                                                 \
+  TY (*VectorOps<TYNAME>::Subzero_insertelement)(TY, CASTTY, int32_t) =        \
+      &Subzero_insertelement_##TYNAME;                                         \
+  CASTTY (*VectorOps<TYNAME>::extractelement)(TY, int32_t) =                   \
+      &extractelement_##TYNAME;                                                \
+  CASTTY (*VectorOps<TYNAME>::Subzero_extractelement)(TY, int32_t) =           \
+      &Subzero_extractelement_##TYNAME;                                        \
+  const char *VectorOps<TYNAME>::TypeName = #TYNAME; /* stringized TYNAME */
+
+#define X(ty, elty, castty)                                                    \
+  DECLARE_VECTOR_OPS(ty, ty, elty, castty, (sizeof(ty) / sizeof(elty)))
+VECTOR_TYPE_TABLE // native vector types: the lane count is derived from the sizes
+#undef X
+
+#define X(ty, expandedty, num_elements)                                        \
+  DECLARE_VECTOR_OPS(ty, expandedty, bool, int64_t, num_elements)
+I1_VECTOR_TYPE_TABLE // i1 vectors: bool elements, int64_t cast type, lane count from the table
+#undef X
+
+// Formats Vect as its elements, converted to CastTy and separated by spaces.
+template <typename T>
+std::string vectAsString(const typename VectorOps<T>::Ty Vect) {
+  typedef VectorOps<T> Ops;
+  std::ostringstream Text;
+  const char *Separator = "";
+  for (size_t Lane = 0; Lane < Ops::NumElements; ++Lane, Separator = " ")
+    Text << Separator << (typename Ops::CastTy)Vect[Lane];
+  return Text.str();
+}
+
+// Builds the standard input vectors for type T: all-zero, counting up
+// from 0, counting down from 0, all-min and all-max lanes. They are
+// returned in a 16-byte-aligned heap array whose length is stored in
+// NumTestVectors; the caller releases the array with free().
+template <typename T>
+typename VectorOps<T>::Ty *getTestVectors(size_t &NumTestVectors) {
+  typedef typename VectorOps<T>::Ty VecTy;
+  typedef typename VectorOps<T>::ElementTy EltTy;
+  typedef std::numeric_limits<EltTy> Limits;
+  enum { ZeroVec, IncrVec, DecrVec, MinVec, MaxVec, VecCount };
+
+  VecTy Inputs[VecCount];
+  memset(&Inputs[ZeroVec], 0, sizeof(VecTy));
+
+  // The lane values are cast to EltTy before being stored because the
+  // lanes of VecTy do not necessarily have type EltTy.
+  for (int Lane = 0; Lane < VectorOps<T>::NumElements; ++Lane) {
+    Inputs[IncrVec][Lane] = (EltTy)Lane;
+    Inputs[DecrVec][Lane] = (EltTy)-Lane;
+    Inputs[MinVec][Lane] = Limits::min();
+    Inputs[MaxVec][Lane] = Limits::max();
+  }
+  NumTestVectors = VecCount;
+
+  // Hand the vectors back in heap storage that has the alignment of a
+  // 128-bit vector.
+  const size_t Alignment = 16;
+  void *Storage;
+  if (posix_memalign(&Storage, Alignment, sizeof(Inputs))) {
+    std::cerr << "memory allocation error" << std::endl;
+    abort();
+  }
+
+  memcpy(Storage, Inputs, sizeof(Inputs));
+
+  return static_cast<VecTy *>(Storage);
+}
+
+// Checks insertelement for vector type T: for each test vector, test
+// element and lane, the reference and Subzero_ results must be bit-identical.
+template <typename T>
+void testInsertElement(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef VectorOps<T> Ops;
+  typedef typename Ops::Ty VecTy;
+  typedef typename Ops::ElementTy EltTy;
+  typedef std::numeric_limits<EltTy> Limits;
+  size_t VecCount;
+  VecTy *Inputs = getTestVectors<T>(VecCount);
+  EltTy Elements[] = {0, 1, Limits::min(), Limits::max()};
+  const size_t EltCount = sizeof(Elements) / sizeof(Elements[0]);
+
+  for (size_t V = 0; V < VecCount; ++V) {
+    for (size_t E = 0; E < EltCount; ++E) {
+      for (size_t Lane = 0; Lane < Ops::NumElements; ++Lane) {
+        const VecTy Vect = Inputs[V];
+        const EltTy Elt = Elements[E];
+        const VecTy Expected = Ops::insertelement(Vect, Elt, Lane);
+        const VecTy Actual = Ops::Subzero_insertelement(Vect, Elt, Lane);
+        ++TotalTests;
+        if (memcmp(&Expected, &Actual, sizeof(Expected)) == 0) {
+          ++Passes;
+          continue;
+        }
+        ++Failures;
+        std::cout << "insertelement<" << Ops::TypeName << ">(Vect="
+                  << vectAsString<T>(Vect)
+                  << ", Element=" << (typename Ops::CastTy)Elt
+                  << ", Pos=" << Lane << ")" << std::endl;
+        std::cout << "llc=" << vectAsString<T>(Expected) << std::endl;
+        std::cout << "sz =" << vectAsString<T>(Actual) << std::endl;
+      }
+    }
+  }
+  free(Inputs);
+}
+
+// Checks extractelement for type T on every lane of every test vector.
+template <typename T>
+void testExtractElement(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef VectorOps<T> Ops;
+  typedef typename Ops::Ty VecTy;
+  typedef typename Ops::CastTy ScalarTy;
+  size_t VecCount;
+  VecTy *Inputs = getTestVectors<T>(VecCount);
+
+  for (size_t V = 0; V < VecCount; ++V) {
+    const VecTy Vect = Inputs[V];
+    for (size_t Lane = 0; Lane < Ops::NumElements; ++Lane) {
+      const ScalarTy Expected = Ops::extractelement(Vect, Lane);
+      const ScalarTy Actual = Ops::Subzero_extractelement(Vect, Lane);
+      ++TotalTests;
+      if (memcmp(&Expected, &Actual, sizeof(Expected)) == 0) {
+        ++Passes;
+        continue;
+      }
+      ++Failures;
+      std::cout << "extractelement<" << Ops::TypeName << ">(Vect="
+                << vectAsString<T>(Vect) << ", Pos=" << Lane << ")"
+                << std::endl;
+      std::cout << "llc=" << Expected << std::endl;
+      std::cout << "sz =" << Actual << std::endl;
+    }
+  }
+  free(Inputs);
+}
+
+// Runs every insertelement and extractelement check; exits with 0 iff none failed.
+int main(int argc, char *argv[]) {
+  size_t TotalTests = 0, Passes = 0, Failures = 0;
+
+  testInsertElement<v4i1>(TotalTests, Passes, Failures);
+  testInsertElement<v8i1>(TotalTests, Passes, Failures);
+  testInsertElement<v16i1>(TotalTests, Passes, Failures);
+  testInsertElement<v16si8>(TotalTests, Passes, Failures);
+  testInsertElement<v16ui8>(TotalTests, Passes, Failures);
+  testInsertElement<v8si16>(TotalTests, Passes, Failures);
+  testInsertElement<v8ui16>(TotalTests, Passes, Failures);
+  testInsertElement<v4si32>(TotalTests, Passes, Failures);
+  testInsertElement<v4ui32>(TotalTests, Passes, Failures);
+  testInsertElement<v4f32>(TotalTests, Passes, Failures);
+
+  testExtractElement<v4i1>(TotalTests, Passes, Failures);
+  testExtractElement<v8i1>(TotalTests, Passes, Failures);
+  testExtractElement<v16i1>(TotalTests, Passes, Failures);
+  testExtractElement<v16si8>(TotalTests, Passes, Failures);
+  testExtractElement<v16ui8>(TotalTests, Passes, Failures);
+  testExtractElement<v8si16>(TotalTests, Passes, Failures);
+  testExtractElement<v8ui16>(TotalTests, Passes, Failures);
+  testExtractElement<v4si32>(TotalTests, Passes, Failures);
+  testExtractElement<v4ui32>(TotalTests, Passes, Failures);
+  testExtractElement<v4f32>(TotalTests, Passes, Failures);
+
+  std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
+            << " Failures=" << Failures << "\n";
+  // Only the low 8 bits of the exit status reach the parent process, so
+  // returning the raw count would report success for 256 failures.
+  return Failures == 0 ? 0 : 1;
+}
+
+// Reports that an "unreachable" instruction was executed and aborts.
+// NOTE(review): presumably called from the Subzero-translated code; confirm.
+extern "C" void ice_unreachable(void) {
+  static const char Message[] = "\"unreachable\" instruction encountered";
+  std::cerr << Message << std::endl;
+  abort();
+}
diff --git a/src/IceConverter.cpp b/src/IceConverter.cpp
index 9ba1fea..9f4c1d8 100644
--- a/src/IceConverter.cpp
+++ b/src/IceConverter.cpp
@@ -337,6 +337,10 @@
       return convertArithInstruction(Inst, Ice::InstArithmetic::Or);
     case Instruction::Xor:
       return convertArithInstruction(Inst, Ice::InstArithmetic::Xor);
+    case Instruction::ExtractElement:
+      return convertExtractElementInstruction(cast<ExtractElementInst>(Inst));
+    case Instruction::InsertElement:
+      return convertInsertElementInstruction(cast<InsertElementInst>(Inst));
     case Instruction::Call:
       return convertCallInstruction(cast<CallInst>(Inst));
     case Instruction::Alloca:
@@ -534,6 +538,22 @@
     return Ice::InstFcmp::create(Func, Cond, Dest, Src0, Src1);
   }
 
+  Ice::Inst *convertExtractElementInstruction(const ExtractElementInst *Inst) {
+    Ice::Variable *Dest = mapValueToIceVar(Inst);
+    Ice::Operand *Source1 = convertValue(Inst->getOperand(0));
+    Ice::Operand *Source2 = convertValue(Inst->getOperand(1));
+    return Ice::InstExtractElement::create(Func, Dest, Source1, Source2);
+  }
+
+  Ice::Inst *convertInsertElementInstruction(const InsertElementInst *Inst) {
+    Ice::Variable *Dest = mapValueToIceVar(Inst);
+    Ice::Operand *Source1 = convertValue(Inst->getOperand(0));
+    Ice::Operand *Source2 = convertValue(Inst->getOperand(1));
+    Ice::Operand *Source3 = convertValue(Inst->getOperand(2));
+    return Ice::InstInsertElement::create(Func, Dest, Source1, Source2,
+                                          Source3);
+  }
+
   Ice::Inst *convertSelectInstruction(const SelectInst *Inst) {
     Ice::Variable *Dest = mapValueToIceVar(Inst);
     Ice::Operand *Cond = convertValue(Inst->getCondition());
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 12ca16c..004b555 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -267,6 +267,13 @@
   addSource(Source);
 }
 
+InstExtractElement::InstExtractElement(Cfg *Func, Variable *Dest,
+                                       Operand *Source1, Operand *Source2)
+    : Inst(Func, Inst::ExtractElement, 2, Dest) {
+  addSource(Source1);
+  addSource(Source2);
+}
+
 InstFcmp::InstFcmp(Cfg *Func, FCond Condition, Variable *Dest, Operand *Source1,
                    Operand *Source2)
     : Inst(Func, Inst::Fcmp, 2, Dest), Condition(Condition) {
@@ -281,6 +288,15 @@
   addSource(Source2);
 }
 
+InstInsertElement::InstInsertElement(Cfg *Func, Variable *Dest,
+                                     Operand *Source1, Operand *Source2,
+                                     Operand *Source3)
+    : Inst(Func, Inst::InsertElement, 3, Dest) {
+  addSource(Source1);
+  addSource(Source2);
+  addSource(Source3);
+}
+
 InstLoad::InstLoad(Cfg *Func, Variable *Dest, Operand *SourceAddr)
     : Inst(Func, Inst::Load, 1, Dest) {
   addSource(SourceAddr);
@@ -586,6 +602,31 @@
   dumpSources(Func);
 }
 
+void InstExtractElement::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = extractelement ";
+  Str << getSrc(0)->getType() << " ";
+  getSrc(0)->dump(Func);
+  Str << ", ";
+  Str << getSrc(1)->getType() << " ";
+  getSrc(1)->dump(Func);
+}
+
+void InstInsertElement::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = insertelement ";
+  Str << getSrc(0)->getType() << " ";
+  getSrc(0)->dump(Func);
+  Str << ", ";
+  Str << getSrc(1)->getType() << " ";
+  getSrc(1)->dump(Func);
+  Str << ", ";
+  Str << getSrc(2)->getType() << " ";
+  getSrc(2)->dump(Func);
+}
+
 void InstFcmp::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
diff --git a/src/IceInst.h b/src/IceInst.h
index 0397e02..0a6c61d 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -41,9 +41,11 @@
     Br,
     Call,
     Cast,
+    ExtractElement,
     Fcmp,
     Icmp,
     IntrinsicCall,
+    InsertElement,
     Load,
     Phi,
     Ret,
@@ -344,6 +346,29 @@
   const OpKind CastKind;
 };
 
+// ExtractElement instruction.  getSrc(0) is the vector, getSrc(1) the index.
+class InstExtractElement : public Inst {
+public:
+  static InstExtractElement *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                    Operand *Source2) {
+    return new (Func->allocateInst<InstExtractElement>())
+        InstExtractElement(Func, Dest, Source1, Source2);
+  }
+
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) {
+    return Inst->getKind() == ExtractElement;
+  }
+
+private:
+  InstExtractElement(Cfg *Func, Variable *Dest, Operand *Source1,
+                     Operand *Source2);
+  InstExtractElement(const InstExtractElement &) LLVM_DELETED_FUNCTION;
+  InstExtractElement &
+  operator=(const InstExtractElement &) LLVM_DELETED_FUNCTION;
+  virtual ~InstExtractElement() {}
+};
+
 // Floating-point comparison instruction.  The source operands are
 // captured in getSrc(0) and getSrc(1).
 class InstFcmp : public Inst {
@@ -402,6 +427,28 @@
   const ICond Condition;
 };
 
+// InsertElement: getSrc(0) = vector, getSrc(1) = element, getSrc(2) = index.
+class InstInsertElement : public Inst {
+public:
+  static InstInsertElement *create(Cfg *Func, Variable *Dest, Operand *Source1,
+                                   Operand *Source2, Operand *Source3) {
+    return new (Func->allocateInst<InstInsertElement>())
+        InstInsertElement(Func, Dest, Source1, Source2, Source3);
+  }
+
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) {
+    return Inst->getKind() == InsertElement;
+  }
+
+private:
+  InstInsertElement(Cfg *Func, Variable *Dest, Operand *Source1,
+                    Operand *Source2, Operand *Source3);
+  InstInsertElement(const InstInsertElement &) LLVM_DELETED_FUNCTION;
+  InstInsertElement &operator=(const InstInsertElement &) LLVM_DELETED_FUNCTION;
+  virtual ~InstInsertElement() {}
+};
+
 // Call to an intrinsic function.  The call target is captured as getSrc(0),
 // and arg I is captured as getSrc(I+1).
 class InstIntrinsicCall : public InstCall {
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index bb99440..baa145f 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -42,7 +42,7 @@
   const char *PackString;  // b, w, d, or <blank>
   const char *WidthString; // {byte,word,dword,qword} ptr
 } TypeX8632Attributes[] = {
-#define X(tag, cvt, sdss, pack, width)                                         \
+#define X(tag, elementty, cvt, sdss, pack, width)                              \
   { cvt, "" sdss, pack, width }                                                \
   ,
     ICETYPEX8632_TABLE
@@ -312,21 +312,6 @@
   return false;
 }
 
-InstX8632Pshufd::InstX8632Pshufd(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2)
-    : InstX8632(Func, InstX8632::Pshufd, 2, Dest) {
-  addSource(Source1);
-  addSource(Source2);
-}
-
-InstX8632Shufps::InstX8632Shufps(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2)
-    : InstX8632(Func, InstX8632::Shufps, 3, Dest) {
-  addSource(Dest);
-  addSource(Source1);
-  addSource(Source2);
-}
-
 InstX8632Ret::InstX8632Ret(Cfg *Func, Variable *Source)
     : InstX8632(Func, InstX8632::Ret, Source ? 1 : 0, NULL) {
   if (Source)
@@ -454,9 +439,15 @@
   Str << "\n";
 }
 
+
+// Unary ops
 template <> const char *InstX8632Bsf::Opcode = "bsf";
 template <> const char *InstX8632Bsr::Opcode = "bsr";
+template <> const char *InstX8632Lea::Opcode = "lea";
+template <> const char *InstX8632Movd::Opcode = "movd";
+template <> const char *InstX8632Movss::Opcode = "movss";
 template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
+// Binary ops
 template <> const char *InstX8632Add::Opcode = "add";
 template <> const char *InstX8632Addps::Opcode = "addps";
 template <> const char *InstX8632Adc::Opcode = "adc";
@@ -489,6 +480,12 @@
 template <> const char *InstX8632Psra::Opcode = "psra";
 template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
+// Ternary ops
+template <> const char *InstX8632Shufps::Opcode = "shufps";
+template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";
+// Three address ops
+template <> const char *InstX8632Pextrw::Opcode = "pextrw";
+template <> const char *InstX8632Pshufd::Opcode = "pshufd";
 
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
@@ -556,6 +553,22 @@
   emitTwoAddress(buf, this, Func);
 }
 
+template <> void InstX8632Div::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  Str << "\t" << Opcode << "\t";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+template <> void InstX8632Idiv::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  Str << "\t" << Opcode << "\t";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
 template <> void InstX8632Imul::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 2);
@@ -868,6 +881,25 @@
   getSrc(0)->dump(Func);
 }
 
+template <> void InstX8632Lea::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  assert(getDest()->hasReg());
+  Str << "\tlea\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  Operand *Src0 = getSrc(0);
+  if (Variable *VSrc0 = llvm::dyn_cast<Variable>(Src0)) {
+    Type Ty = VSrc0->getType();
+    // lea on x86-32 doesn't accept mem128 operands, so cast VSrc0 to an
+    // acceptable type.
+    VSrc0->asType(isVectorType(Ty) ? IceType_i32 : Ty).emit(Func);
+  } else {
+    Src0->emit(Func);
+  }
+  Str << "\n";
+}
+
 void InstX8632Mov::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
@@ -893,6 +925,9 @@
   // safe, we instead widen the dest to match src.  This works even
   // for stack-allocated dest variables because typeWidthOnStack()
   // pads to a 4-byte boundary even if only a lower portion is used.
+  // TODO: This assert disallows usages such as copying a floating point
+  // value between a vector and a scalar (which movss is used for).
+  // Clean this up.
   assert(Func->getTarget()->typeWidthInBytesOnStack(getDest()->getType()) ==
          Func->getTarget()->typeWidthInBytesOnStack(Src->getType()));
   getDest()->asType(Src->getType()).emit(Func);
@@ -1066,6 +1101,39 @@
   emitTwoAddress(buf, this, Func);
 }
 
+template <> void InstX8632Pextrw::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Str << "\t" << Opcode << "\t";
+  Variable *Dest = getDest();
+  assert(Dest->hasReg() && Dest->getType() == IceType_i16);
+  // pextrw takes r32 dest.
+  Dest->asType(IceType_i32).emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+  Str << "\n";
+}
+
+template <> void InstX8632Pinsrw::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 3);
+  Str << "\t" << Opcode << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  Operand *Src1 = getSrc(1);
+  if (Variable *VSrc1 = llvm::dyn_cast<Variable>(Src1)) {
+    // If src1 is a register, it should be r32.
+    VSrc1->asType(VSrc1->hasReg() ? IceType_i32 : IceType_i16).emit(Func);
+  } else {
+    Src1->emit(Func);
+  }
+  Str << ", ";
+  getSrc(2)->emit(Func);
+  Str << "\n";
+}
+
 void InstX8632Pop::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 0);
@@ -1138,25 +1206,6 @@
   emitTwoAddress(buf, this, Func);
 }
 
-void InstX8632Pshufd::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 2);
-  Str << "\tpshufd\t";
-  getDest()->emit(Func);
-  Str << ", ";
-  getSrc(0)->emit(Func);
-  Str << ", ";
-  getSrc(1)->emit(Func);
-  Str << "\n";
-}
-
-void InstX8632Pshufd::dump(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = pshufd." << getDest()->getType() << " ";
-  dumpSources(Func);
-}
-
 void InstX8632Ret::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   Str << "\tret\n";
@@ -1169,25 +1218,6 @@
   dumpSources(Func);
 }
 
-void InstX8632Shufps::emit(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrEmit();
-  assert(getSrcSize() == 3);
-  Str << "\tshufps\t";
-  getDest()->emit(Func);
-  Str << ", ";
-  getSrc(1)->emit(Func);
-  Str << ", ";
-  getSrc(2)->emit(Func);
-  Str << "\n";
-}
-
-void InstX8632Shufps::dump(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = shufps." << getDest()->getType() << " ";
-  dumpSources(Func);
-}
-
 void InstX8632Xadd::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   if (Locked) {
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index 8930c29..be7aeb5 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -66,23 +66,23 @@
   X(Br_p,        "p",  "jp")   \
 //#define X(tag, dump, emit)
 
-#define ICETYPEX8632_TABLE                          \
-  /* tag,          cvt, sdss,  pack, width */       \
-  X(IceType_void,  "?",  ""  , "" ,  "???")         \
-  X(IceType_i1,    "si", ""  , "" ,  "byte ptr")    \
-  X(IceType_i8,    "si", ""  , "" ,  "byte ptr")    \
-  X(IceType_i16,   "si", ""  , "" ,  "word ptr")    \
-  X(IceType_i32,   "si", ""  , "" ,  "dword ptr")   \
-  X(IceType_i64,   "si", ""  , "" ,  "qword ptr")   \
-  X(IceType_f32,   "ss", "ss", "" ,  "dword ptr")   \
-  X(IceType_f64,   "sd", "sd", "" ,  "qword ptr")   \
-  X(IceType_v4i1,  "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v8i1,  "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v16i1, "?",  ""  , "" ,  "xmmword ptr") \
-  X(IceType_v16i8, "?",  ""  , "b",  "xmmword ptr") \
-  X(IceType_v8i16, "?",  ""  , "w",  "xmmword ptr") \
-  X(IceType_v4i32, "dq", ""  , "d",  "xmmword ptr") \
-  X(IceType_v4f32, "ps", ""  , "" ,  "xmmword ptr") \
-//#define X(tag, cvt, sdss, width)
+#define ICETYPEX8632_TABLE                                        \
+  /* tag,          element type, cvt, sdss,  pack, width */       \
+  X(IceType_void,  IceType_void, "?" , ""  , "" ,  "???")         \
+  X(IceType_i1,    IceType_void, "si", ""  , "" ,  "byte ptr")    \
+  X(IceType_i8,    IceType_void, "si", ""  , "" ,  "byte ptr")    \
+  X(IceType_i16,   IceType_void, "si", ""  , "" ,  "word ptr")    \
+  X(IceType_i32,   IceType_void, "si", ""  , "" ,  "dword ptr")   \
+  X(IceType_i64,   IceType_void, "si", ""  , "" ,  "qword ptr")   \
+  X(IceType_f32,   IceType_void, "ss", "ss", "" ,  "dword ptr")   \
+  X(IceType_f64,   IceType_void, "sd", "sd", "" ,  "qword ptr")   \
+  X(IceType_v4i1,  IceType_i32 , "?" , ""  , "" ,  "xmmword ptr") \
+  X(IceType_v8i1,  IceType_i16 , "?" , ""  , "" ,  "xmmword ptr") \
+  X(IceType_v16i1, IceType_i8  , "?" , ""  , "" ,  "xmmword ptr") \
+  X(IceType_v16i8, IceType_i8  , "?" , ""  , "b",  "xmmword ptr") \
+  X(IceType_v8i16, IceType_i16 , "?" , ""  , "w",  "xmmword ptr") \
+  X(IceType_v4i32, IceType_i32 , "dq", ""  , "d",  "xmmword ptr") \
+  X(IceType_v4f32, IceType_f32 , "ps", ""  , "" ,  "xmmword ptr") \
+//#define X(tag, elementty, cvt, sdss, pack, width)
 
 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 03605ca..db60d68 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -156,11 +156,14 @@
     Idiv,
     Imul,
     Label,
+    Lea,
     Load,
     Mfence,
     Mov,
+    Movd,
     Movp,
     Movq,
+    Movss,
     Movsx,
     Movzx,
     Mul,
@@ -172,6 +175,8 @@
     Pand,
     Pcmpeq,
     Pcmpgt,
+    Pextrw,
+    Pinsrw,
     Pmullw,
     Pmuludq,
     Pop,
@@ -430,7 +435,11 @@
     Ostream &Str = Func->getContext()->getStrEmit();
     assert(getSrcSize() == 3);
     Str << "\t" << Opcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
     getSrc(1)->emit(Func);
+    Str << ", ";
+    getSrc(2)->emit(Func);
     Str << "\n";
   }
   virtual void dump(const Cfg *Func) const {
@@ -454,8 +463,54 @@
   static const char *Opcode;
 };
 
+// Instructions of the form x := y op z
+template <InstX8632::InstKindX8632 K>
+class InstX8632ThreeAddressop : public InstX8632 {
+public:
+  static InstX8632ThreeAddressop *create(Cfg *Func, Variable *Dest,
+                                         Operand *Source0, Operand *Source1) {
+    return new (Func->allocate<InstX8632ThreeAddressop>())
+        InstX8632ThreeAddressop(Func, Dest, Source0, Source1);
+  }
+  virtual void emit(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(getSrcSize() == 2);
+    Str << "\t" << Opcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+    getSrc(0)->emit(Func);
+    Str << ", ";
+    getSrc(1)->emit(Func);
+    Str << "\n";
+  }
+  virtual void dump(const Cfg *Func) const {
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstX8632ThreeAddressop(Cfg *Func, Variable *Dest, Operand *Source0,
+                          Operand *Source1)
+      : InstX8632(Func, K, 2, Dest) {
+    addSource(Source0);
+    addSource(Source1);
+  }
+  InstX8632ThreeAddressop(const InstX8632ThreeAddressop &)
+      LLVM_DELETED_FUNCTION;
+  InstX8632ThreeAddressop &
+  operator=(const InstX8632ThreeAddressop &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632ThreeAddressop() {}
+  static const char *Opcode;
+};
+
 typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf;
 typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
+typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
+typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
+typedef InstX8632Unaryop<InstX8632::Movss> InstX8632Movss;
 typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
 typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
@@ -489,6 +544,10 @@
 typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
+typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;
+typedef InstX8632Ternop<InstX8632::Shufps> InstX8632Shufps;
+typedef InstX8632ThreeAddressop<InstX8632::Pextrw> InstX8632Pextrw;
+typedef InstX8632ThreeAddressop<InstX8632::Pshufd> InstX8632Pshufd;
 
 // Base class for a lockable x86-32 instruction (emits a locked prefix).
 class InstX8632Lockable : public InstX8632 {
@@ -994,27 +1053,6 @@
   virtual ~InstX8632Push() {}
 };
 
-// Pshufd - shuffle a vector of doublewords 
-class InstX8632Pshufd : public InstX8632 {
-public:
-  static InstX8632Pshufd *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2) {
-    return new (Func->allocate<InstX8632Pshufd>())
-        InstX8632Pshufd(Func, Dest, Source1, Source2);
-  }
-  virtual void emit(const Cfg *Func) const;
-  virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Pshufd); }
-
-private:
-  InstX8632Pshufd(Cfg *Func, Variable *Dest, Operand *Source1,
-                  Operand *Source2);
-  InstX8632Pshufd(const InstX8632Pshufd &) LLVM_DELETED_FUNCTION;
-  InstX8632Pshufd &operator=(const InstX8632Pshufd &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Pshufd() {}
-  static const char *Opcode;
-};
-
 // Ret instruction.  Currently only supports the "ret" version that
 // does not pop arguments.  This instruction takes a Source operand
 // (for non-void returning functions) for liveness analysis, though
@@ -1035,27 +1073,6 @@
   virtual ~InstX8632Ret() {}
 };
 
-// Shufps - select from two vectors of floating point values
-class InstX8632Shufps : public InstX8632 {
-public:
-  static InstX8632Shufps *create(Cfg *Func, Variable *Dest, Operand *Source1,
-                                 Operand *Source2) {
-    return new (Func->allocate<InstX8632Shufps>())
-        InstX8632Shufps(Func, Dest, Source1, Source2);
-  }
-  virtual void emit(const Cfg *Func) const;
-  virtual void dump(const Cfg *Func) const;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Shufps); }
-
-private:
-  InstX8632Shufps(Cfg *Func, Variable *Dest, Operand *Source1,
-                  Operand *Source2);
-  InstX8632Shufps(const InstX8632Shufps &) LLVM_DELETED_FUNCTION;
-  InstX8632Shufps &operator=(const InstX8632Shufps &) LLVM_DELETED_FUNCTION;
-  virtual ~InstX8632Shufps() {}
-  static const char *Opcode;
-};
-
 // Exchanging Add instruction.  Exchanges the first operand (destination
 // operand) with the second operand (source operand), then loads the sum
 // of the two values into the destination operand. The destination may be
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index a5dd39a..3f6098c 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -110,12 +110,18 @@
   case Inst::Cast:
     lowerCast(llvm::dyn_cast<InstCast>(Inst));
     break;
+  case Inst::ExtractElement:
+    lowerExtractElement(llvm::dyn_cast<InstExtractElement>(Inst));
+    break;
   case Inst::Fcmp:
     lowerFcmp(llvm::dyn_cast<InstFcmp>(Inst));
     break;
   case Inst::Icmp:
     lowerIcmp(llvm::dyn_cast<InstIcmp>(Inst));
     break;
+  case Inst::InsertElement:
+    lowerInsertElement(llvm::dyn_cast<InstInsertElement>(Inst));
+    break;
   case Inst::IntrinsicCall:
     lowerIntrinsicCall(llvm::dyn_cast<InstIntrinsicCall>(Inst));
     break;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index ed5389c..c798943 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -169,7 +169,9 @@
   virtual void lowerCall(const InstCall *Inst) = 0;
   virtual void lowerCast(const InstCast *Inst) = 0;
   virtual void lowerFcmp(const InstFcmp *Inst) = 0;
+  virtual void lowerExtractElement(const InstExtractElement *Inst) = 0;
   virtual void lowerIcmp(const InstIcmp *Inst) = 0;
+  virtual void lowerInsertElement(const InstInsertElement *Inst) = 0;
   virtual void lowerIntrinsicCall(const InstIntrinsicCall *Inst) = 0;
   virtual void lowerLoad(const InstLoad *Inst) = 0;
   virtual void lowerPhi(const InstPhi *Inst) = 0;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 2b14a65..af9ebc5 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -85,6 +85,27 @@
   return TableIcmp32[Index].Mapping;
 }
 
+const struct TableTypeX8632Attributes_ {
+  Type InVectorElementType;
+} TableTypeX8632Attributes[] = {
+#define X(tag, elementty, cvt, sdss, pack, width)                              \
+  { elementty }                                                                \
+  ,
+    ICETYPEX8632_TABLE
+#undef X
+  };
+const size_t TableTypeX8632AttributesSize =
+    llvm::array_lengthof(TableTypeX8632Attributes);
+
+// Return the type each element has in the X86 representation of vector
+// type Ty (e.g. the elements of i1 vectors are held as wider integers).
+Type getInVectorElementType(Type Ty) {
+  assert(isVectorType(Ty));
+  size_t Index = static_cast<size_t>(Ty);
+  assert(Index < TableTypeX8632AttributesSize);
+  return TableTypeX8632Attributes[Index].InVectorElementType;
+}
+
 // The maximum number of arguments to pass in XMM registers
 const unsigned X86_MAX_XMM_ARGS = 4;
 // The number of bits in a byte
@@ -173,7 +194,7 @@
     // Define a temporary set of enum values based on low-level
     // table entries.
     enum _tmp_enum {
-#define X(tag, cvt, sdss, pack, width) _tmp_##tag,
+#define X(tag, elementty, cvt, sdss, pack, width) _tmp_##tag,
       ICETYPEX8632_TABLE
 #undef X
           _num
@@ -185,7 +206,7 @@
 #undef X
 // Define a set of constants based on low-level table entries,
 // and ensure the table entry keys are consistent.
-#define X(tag, cvt, sdss, pack, width)                                         \
+#define X(tag, elementty, cvt, sdss, pack, width)                              \
   static const int _table2_##tag = _tmp_##tag;                                 \
   STATIC_ASSERT(_table1_##tag == _table2_##tag);
     ICETYPEX8632_TABLE;
@@ -2107,6 +2128,85 @@
   }
 }
 
+void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
+  Operand *SourceVectOperand = Inst->getSrc(0);
+  ConstantInteger *ElementIndex =
+      llvm::dyn_cast<ConstantInteger>(Inst->getSrc(1));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+
+  unsigned Index = ElementIndex->getValue();
+  Type Ty = SourceVectOperand->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = getInVectorElementType(Ty);
+  Variable *ExtractedElement = makeReg(InVectorElementTy);
+
+  // TODO(wala): Determine the best lowering sequences for each type.
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Lower extractelement operations where the element is 32 bits
+    // wide with pshufd.
+    // TODO(wala): SSE4.1 has extractps and pextrd
+    //
+    // ALIGNHACK: Force vector operands to registers in instructions that
+    // require aligned memory operands until support for stack alignment
+    // is implemented.
+#define ALIGN_HACK(Vect) legalizeToVar((Vect))
+    Operand *T = NULL;
+    if (Index) {
+      // The shuffle only needs to occur if the element to be extracted
+      // is not at the lowest index.
+      Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
+      T = makeReg(Ty);
+      _pshufd(llvm::cast<Variable>(T), ALIGN_HACK(SourceVectOperand), Mask);
+    } else {
+      // TODO(wala): If SourceVectOperand is in memory, express it as
+      // mem32 so that the call to legalizeToVar() is made unnecessary.
+      // _movd and _movss only take mem32 memory operands.
+      T = legalizeToVar(SourceVectOperand);
+    }
+
+    if (InVectorElementTy == IceType_i32) {
+      _movd(ExtractedElement, T);
+    } else { // InVectorElementTy == IceType_f32
+      // TODO: _mov should be able to be used here.
+      _movss(ExtractedElement, T);
+    }
+#undef ALIGN_HACK
+  } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
+    Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
+    _pextrw(ExtractedElement, legalizeToVar(SourceVectOperand), Mask);
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and do the extraction in memory.
+    // TODO(wala): SSE4.1 has pextrb.
+    //
+    // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
+    // support for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty, Context.getNode());
+    Slot->setWeight(RegWeight::Zero);
+    _movp(Slot, legalizeToVar(SourceVectOperand));
+
+    // Compute the location of the element in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    OperandX8632Mem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _mov(ExtractedElement, Loc);
+  }
+
+  if (ElementTy == IceType_i1) {
+    // Truncate extracted integers to i1s if necessary.
+    Variable *T = makeReg(IceType_i1);
+    InstCast *Cast =
+        InstCast::create(Func, InstCast::Trunc, T, ExtractedElement);
+    lowerCast(Cast);
+    ExtractedElement = T;
+  }
+
+  // Copy the element to the destination.
+  Variable *Dest = Inst->getDest();
+  _mov(Dest, ExtractedElement);
+}
+
 void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
   Operand *Src0 = Inst->getSrc(0);
   Operand *Src1 = Inst->getSrc(1);
@@ -2238,6 +2338,123 @@
   Context.insert(Label);
 }
 
+void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
+  Operand *SourceVectOperand = Inst->getSrc(0);
+  Operand *ElementToInsert = Inst->getSrc(1);
+  ConstantInteger *ElementIndex =
+      llvm::dyn_cast<ConstantInteger>(Inst->getSrc(2));
+  // Only constant indices are allowed in PNaCl IR.
+  assert(ElementIndex);
+  unsigned Index = ElementIndex->getValue();
+
+  Type Ty = SourceVectOperand->getType();
+  Type ElementTy = typeElementType(Ty);
+  Type InVectorElementTy = getInVectorElementType(Ty);
+
+  if (ElementTy == IceType_i1) {
+    // Expand the element to the appropriate size for it to be inserted
+    // in the vector.
+    Variable *Expanded =
+        Func->makeVariable(InVectorElementTy, Context.getNode());
+    InstCast *Cast =
+        InstCast::create(Func, InstCast::Zext, Expanded, ElementToInsert);
+    lowerCast(Cast);
+    ElementToInsert = Expanded;
+  }
+
+  if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
+    // Lower insertelement with 32-bit wide elements using shufps.
+    // TODO(wala): SSE4.1 has pinsrd and insertps.
+    Variable *Element = NULL;
+    if (InVectorElementTy == IceType_f32) {
+      // Element will be in an XMM register since it is floating point.
+      Element = legalizeToVar(ElementToInsert);
+    } else {
+      // Copy an integer to an XMM register.
+      Operand *T = legalize(ElementToInsert, Legal_Reg | Legal_Mem);
+      Element = makeReg(Ty);
+      _movd(Element, T);
+    }
+
+    // shufps treats the source and destination operands as vectors of
+    // four doublewords.  The destination's two high doublewords are
+    // selected from the source operand and the two low doublewords are
+    // selected from the (original value of) the destination operand.
+    // An insertelement operation can be effected with a sequence of two
+    // shufps operations with appropriate masks.  In all cases below,
+    // Element[0] is being inserted into SourceVectOperand.  Indices are
+    // ordered from left to right.
+    //
+    // insertelement into index 0 (result is stored in Element):
+    //   Element := Element[0, 0] SourceVectOperand[0, 1]
+    //   Element := Element[0, 3] SourceVectOperand[2, 3]
+    //
+    // insertelement into index 1 (result is stored in Element):
+    //   Element := Element[0, 0] SourceVectOperand[0, 0]
+    //   Element := Element[3, 0] SourceVectOperand[2, 3]
+    //
+    // insertelement into index 2 (result is stored in T):
+    //   T := SourceVectOperand
+    //   Element := Element[0, 0] T[0, 3]
+    //   T := T[0, 1] Element[0, 3]
+    //
+    // insertelement into index 3 (result is stored in T):
+    //   T := SourceVectOperand
+    //   Element := Element[0, 0] T[0, 2]
+    //   T := T[0, 1] Element[3, 0]
+    const unsigned char Mask1[4] = {64, 0, 192, 128};
+    const unsigned char Mask2[4] = {236, 227, 196, 52};
+    assert(Index < llvm::array_lengthof(Mask1));
+    Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index]);
+    Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index]);
+
+    // ALIGNHACK: Force vector operands to registers in instructions that
+    // require aligned memory operands until support for stack alignment
+    // is implemented.
+#define ALIGN_HACK(Vect) legalizeToVar((Vect))
+    if (Index < 2) {
+      SourceVectOperand = ALIGN_HACK(SourceVectOperand);
+      _shufps(Element, SourceVectOperand, Mask1Constant);
+      _shufps(Element, SourceVectOperand, Mask2Constant);
+      _movp(Inst->getDest(), Element);
+    } else {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectOperand);
+      _shufps(Element, T, Mask1Constant);
+      _shufps(T, Element, Mask2Constant);
+      _movp(Inst->getDest(), T);
+    }
+#undef ALIGN_HACK
+  } else if (Ty == IceType_v8i16 || Ty == IceType_v8i1) {
+    Operand *Element = legalize(ElementToInsert, Legal_Mem | Legal_Reg);
+    Variable *T = makeReg(Ty);
+    _movp(T, SourceVectOperand);
+    _pinsrw(T, Element, Ctx->getConstantInt(IceType_i8, Index));
+    _movp(Inst->getDest(), T);
+  } else {
+    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
+    // Spill the value to a stack slot and perform the insertion in
+    // memory.
+    // TODO(wala): SSE4.1 has pinsrb.
+    //
+    // TODO(wala): use legalize(SourceVectOperand, Legal_Mem) when
+    // support for legalizing to mem is implemented.
+    Variable *Slot = Func->makeVariable(Ty, Context.getNode());
+    Slot->setWeight(RegWeight::Zero);
+    _movp(Slot, legalizeToVar(SourceVectOperand));
+
+    // Compute the location of the position to insert in memory.
+    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
+    OperandX8632Mem *Loc =
+        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
+    _store(legalizeToVar(ElementToInsert), Loc);
+
+    Variable *T = makeReg(Ty);
+    _movp(T, Slot);
+    _movp(Inst->getDest(), T);
+  }
+}
+
 void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
   switch (Instr->getIntrinsicInfo().ID) {
   case Intrinsics::AtomicCmpxchg: {
@@ -3169,6 +3386,23 @@
   return Dest;
 }
 
+OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
+                                                           Variable *Slot,
+                                                           uint32_t Offset) {
+  // Ensure that Slot is a stack slot (zero weight, no register assigned).
+  assert(Slot->getWeight() == RegWeight::Zero);
+  assert(Slot->getRegNum() == Variable::NoRegister);
+  // Compute the address of Slot into Loc.
+  // TODO(wala,stichnot): lea should not be required.  The address of
+  // the stack slot is known at compile time (although not until after
+  // addProlog()).
+  const Type PointerType = IceType_i32;
+  Variable *Loc = makeReg(PointerType);
+  _lea(Loc, Slot); // Loc = address of Slot
+  Constant *ConstantOffset = Ctx->getConstantInt(IceType_i32, Offset);
+  return OperandX8632Mem::create(Func, Ty, Loc, ConstantOffset); // Loc+Offset
+}
+
 // Helper for legalize() to emit the right code to lower an operand to a
 // register of the appropriate type.
 Variable *TargetX8632::copyToReg(Operand *Src, int32_t RegNum) {
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 58d8781..fefc7fd 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -82,9 +82,11 @@
   virtual void lowerBr(const InstBr *Inst);
   virtual void lowerCall(const InstCall *Inst);
   virtual void lowerCast(const InstCast *Inst);
+  virtual void lowerExtractElement(const InstExtractElement *Inst);
   virtual void lowerFcmp(const InstFcmp *Inst);
   virtual void lowerIcmp(const InstIcmp *Inst);
   virtual void lowerIntrinsicCall(const InstIntrinsicCall *Inst);
+  virtual void lowerInsertElement(const InstInsertElement *Inst);
   virtual void lowerLoad(const InstLoad *Inst);
   virtual void lowerPhi(const InstPhi *Inst);
   virtual void lowerRet(const InstRet *Inst);
@@ -152,6 +154,10 @@
   Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
   Variable *makeVectorOfOnes(Type Ty, int32_t RegNum = Variable::NoRegister);
 
+  // Return a Ty memory operand at Offset bytes into stack-allocated Slot.
+  OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
+                                                uint32_t Offset = 0);
+
   // The following are helpers that insert lowered x86 instructions
   // with minimal syntactic overhead, so that the lowering code can
   // look as close to assembly as practical.
@@ -237,6 +243,9 @@
   void _imul(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Imul::create(Func, Dest, Src0));
   }
+  void _lea(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Lea::create(Func, Dest, Src0));
+  }
   void _mfence() { Context.insert(InstX8632Mfence::create(Func)); }
   // If Dest=NULL is passed in, then a new variable is created, marked
   // as infinite register allocation weight, and returned through the
@@ -249,12 +258,18 @@
       Context.insert(InstX8632Mov::create(Func, Dest, Src0));
     }
   }
+  void _movd(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movd::create(Func, Dest, Src0));
+  }
   void _movp(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Movp::create(Func, Dest, Src0));
   }
   void _movq(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Movq::create(Func, Dest, Src0));
   }
+  void _movss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Movss::create(Func, Dest, Src0));
+  }
   void _movsx(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Movsx::create(Func, Dest, Src0));
   }
@@ -288,6 +303,12 @@
   void _pcmpgt(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pcmpgt::create(Func, Dest, Src0));
   }
+  void _pextrw(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pextrw::create(Func, Dest, Src0, Src1));
+  }
+  void _pinsrw(Variable *Dest, Operand *Src0, Operand *Src1) {
+    Context.insert(InstX8632Pinsrw::create(Func, Dest, Src0, Src1));
+  }
   void _pmullw(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Pmullw::create(Func, Dest, Src0));
   }
diff --git a/tests_lit/llvm2ice_tests/vector-ops.ll b/tests_lit/llvm2ice_tests/vector-ops.ll
new file mode 100644
index 0000000..c4d3e9d
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll
@@ -0,0 +1,147 @@
+; This checks support for insertelement and extractelement.
+
+; RUN: %llvm2ice --verbose inst %s | FileCheck %s
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+; insertelement operations
+
+define <4 x float> @insertelement_v4f32(<4 x float> %vec, float %elt) {
+entry:
+  %res = insertelement <4 x float> %vec, float %elt, i32 1
+  ret <4 x float> %res
+; CHECK-LABEL: insertelement_v4f32:
+; CHECK: shufps
+; CHECK: shufps
+}
+
+define <4 x i32> @insertelement_v4i32(<4 x i32> %vec, i32 %elt) {
+entry:
+  %res = insertelement <4 x i32> %vec, i32 %elt, i32 1
+  ret <4 x i32> %res
+; CHECK-LABEL: insertelement_v4i32:
+; CHECK: shufps
+; CHECK: shufps
+}
+
+define <8 x i16> @insertelement_v8i16(<8 x i16> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i16
+  %res = insertelement <8 x i16> %vec, i16 %elt, i32 1
+  ret <8 x i16> %res
+; CHECK-LABEL: insertelement_v8i16:
+; CHECK: pinsrw
+}
+
+define <16 x i8> @insertelement_v16i8(<16 x i8> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i8
+  %res = insertelement <16 x i8> %vec, i8 %elt, i32 1
+  ret <16 x i8> %res
+; CHECK-LABEL: insertelement_v16i8:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+define <4 x i1> @insertelement_v4i1(<4 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <4 x i1> %vec, i1 %elt, i32 1
+  ret <4 x i1> %res
+; CHECK-LABEL: insertelement_v4i1:
+; CHECK: shufps
+; CHECK: shufps
+}
+
+define <8 x i1> @insertelement_v8i1(<8 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <8 x i1> %vec, i1 %elt, i32 1
+  ret <8 x i1> %res
+; CHECK-LABEL: insertelement_v8i1:
+; CHECK: pinsrw
+}
+
+define <16 x i1> @insertelement_v16i1(<16 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <16 x i1> %vec, i1 %elt, i32 1
+  ret <16 x i1> %res
+; CHECK-LABEL: insertelement_v16i1:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+; extractelement operations
+
+define float @extractelement_v4f32(<4 x float> %vec) {
+entry:
+  %res = extractelement <4 x float> %vec, i32 1
+  ret float %res
+; CHECK-LABEL: extractelement_v4f32:
+; CHECK: pshufd
+}
+
+define i32 @extractelement_v4i32(<4 x i32> %vec) {
+entry:
+  %res = extractelement <4 x i32> %vec, i32 1
+  ret i32 %res
+; CHECK-LABEL: extractelement_v4i32:
+; CHECK: pshufd
+}
+
+define i32 @extractelement_v8i16(<8 x i16> %vec) {
+entry:
+  %res = extractelement <8 x i16> %vec, i32 1
+  %res.ext = zext i16 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v8i16:
+; CHECK: pextrw
+}
+
+define i32 @extractelement_v16i8(<16 x i8> %vec) {
+entry:
+  %res = extractelement <16 x i8> %vec, i32 1
+  %res.ext = zext i8 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v16i8:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+define i32 @extractelement_v4i1(<4 x i1> %vec) {
+entry:
+  %res = extractelement <4 x i1> %vec, i32 1
+  %res.ext = zext i1 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v4i1:
+; CHECK: pshufd
+}
+
+define i32 @extractelement_v8i1(<8 x i1> %vec) {
+entry:
+  %res = extractelement <8 x i1> %vec, i32 1
+  %res.ext = zext i1 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v8i1:
+; CHECK: pextrw
+}
+
+define i32 @extractelement_v16i1(<16 x i1> %vec) {
+entry:
+  %res = extractelement <16 x i1> %vec, i32 1
+  %res.ext = zext i1 %res to i32
+  ret i32 %res.ext
+; CHECK-LABEL: extractelement_v16i1:
+; CHECK: movups
+; CHECK: lea
+; CHECK: mov
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ