Subzero. X86. Lowers shufflevector using xmm instructions.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4136
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1909013002 .
diff --git a/crosstest/crosstest.cfg b/crosstest/crosstest.cfg
index afc75b4..a7d30b6 100644
--- a/crosstest/crosstest.cfg
+++ b/crosstest/crosstest.cfg
@@ -56,4 +56,4 @@
 
 [test_vector_ops]
 driver: test_vector_ops_main.cpp
-test: test_vector_ops.ll
+test: test_vector_ops.cpp test_vector_ops_ll.ll
diff --git a/crosstest/test_vector_ops.cpp b/crosstest/test_vector_ops.cpp
new file mode 100644
index 0000000..3d55bc2
--- /dev/null
+++ b/crosstest/test_vector_ops.cpp
@@ -0,0 +1,753 @@
+//===- subzero/crosstest/test_vector_ops.cpp - Vector tests -----*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the vector shuffle routines.
+//
+//===----------------------------------------------------------------------===//
+
+#include "test_vector_ops.h"
+
+#include <algorithm>
+#include <type_traits>
+
+namespace {
+// SHUFFLETESTS_TABLE lists the shufflevector test cases as X(...) entries. Each
+// entry has 16 indexes because 16 is the max number of elements in a vector
+// type in PNaCl bitcode. Vector types with fewer than 16 elements ignore the
+// additional indexes, which lets a single table definition serve every type.
+#define SHUFFLETESTS_TABLE                                                     \
+  /* Indexes... */                                                             \
+  /* Simple tests splatting elements. */                                       \
+  X(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)                            \
+  X(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)                            \
+  X(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)                            \
+  X(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)                            \
+  X(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)                            \
+  X(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)                            \
+  X(6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6)                            \
+  X(7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7)                            \
+  X(8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8)                            \
+  X(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9)                            \
+  X(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10)            \
+  X(11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11)            \
+  X(12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12)            \
+  X(13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13)            \
+  X(14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14)            \
+  X(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15)            \
+  X(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16)            \
+  X(17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17)            \
+  X(18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18)            \
+  X(19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19)            \
+  X(20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20)            \
+  X(21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21)            \
+  X(22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22)            \
+  X(23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23)            \
+  X(24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24)            \
+  X(25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25)            \
+  X(26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26)            \
+  X(27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27)            \
+  X(28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28)            \
+  X(29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29)            \
+  X(30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30)            \
+  X(31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31)            \
+  /* Rotating vectors. */                                                      \
+  X(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)                     \
+  X(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)                    \
+  X(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)                   \
+  X(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)                  \
+  X(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)                 \
+  X(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)                \
+  X(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)               \
+  X(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)              \
+  X(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)             \
+  X(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)            \
+  X(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)            \
+  X(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)            \
+  X(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)            \
+  X(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)            \
+  X(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)            \
+  X(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)            \
+  X(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0)             \
+  X(18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1)              \
+  X(19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2)               \
+  X(20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3)                \
+  X(21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4)                 \
+  X(22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5)                  \
+  X(23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6)                   \
+  X(24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7)                    \
+  X(25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8)                     \
+  X(26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)                      \
+  X(27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)                      \
+  X(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)                      \
+  X(29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)                      \
+  X(30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)                      \
+  X(31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)                      \
+  /* Swapping elements. */                                                     \
+  X(1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(4, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(6, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(11, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(13, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(18, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(19, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(22, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(23, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(24, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(25, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(26, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(27, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(28, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(29, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(30, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(31, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 4, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 5, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 6, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 7, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 10, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 12, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 13, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 14, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 15, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 17, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 18, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 19, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 20, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 21, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 22, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 23, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 24, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 25, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 26, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 27, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 28, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 29, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 30, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 31, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 4, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 5, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 6, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 7, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 8, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 9, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 10, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 11, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 12, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 13, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 14, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 15, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 16, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 17, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 18, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 19, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 20, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 21, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 22, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 23, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 24, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 25, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 26, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 27, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 28, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 29, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 30, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 31, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 6, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 9, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 10, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 11, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 13, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 14, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 16, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 17, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 18, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 20, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 21, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 22, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 24, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 25, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 26, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 27, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 28, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 29, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 30, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 31, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 6, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 7, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 8, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 9, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 11, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 12, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 13, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 14, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 15, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 16, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 17, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 18, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 19, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 21, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 22, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 22, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 23, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 24, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 25, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 26, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 27, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 28, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 29, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 31, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 8, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 9, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 11, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 12, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 13, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 14, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 15, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 17, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 18, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 19, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 20, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 21, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 22, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 23, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 24, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 25, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 26, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 27, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 28, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 29, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 30, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 31, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 1, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 8, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 9, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 11, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 12, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 13, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 14, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 15, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 17, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 18, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 19, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 20, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 21, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 22, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 23, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 24, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 25, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 26, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 27, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 28, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 29, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 30, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 31, 7, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 0, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 1, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 2, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 3, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 4, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 5, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 6, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 8, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 9, 8, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 10, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 11, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 12, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 13, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 14, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 15, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 16, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 17, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 18, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 19, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 20, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 21, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 22, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 23, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 24, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 25, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 26, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 27, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 28, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 29, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 30, 8, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 31, 0, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 1, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 2, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 3, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 4, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 5, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 6, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 7, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 9, 9, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 10, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 11, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 12, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 13, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 14, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 15, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 16, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 17, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 18, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 19, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 20, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 21, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 22, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 23, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 24, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 25, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 26, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 27, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 28, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 29, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 30, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 31, 9, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 2, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 3, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 4, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 5, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 6, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 7, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 10, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 15, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 17, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 18, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 19, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 20, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 21, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 22, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 23, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 24, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 25, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 26, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 27, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 28, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 29, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 30, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 31, 10, 11, 12, 13, 14, 15)                     \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 11, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 17, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 19, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 21, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 22, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 23, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 24, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 25, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 26, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 27, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 28, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 29, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 30, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 31, 11, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 4, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 5, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 6, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 7, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 8, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 12, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 19, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 21, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 22, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 23, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 24, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 26, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 27, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 28, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 29, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 30, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 31, 12, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 3, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 4, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 5, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 6, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 7, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 8, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 9, 13, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 19, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 21, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 22, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 23, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 24, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 25, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 26, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 27, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 29, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 30, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 31, 13, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 2, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 5, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 6, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 7, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 8, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 9, 14, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 10, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 17, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 19, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 20, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 21, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 22, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 23, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 24, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 25, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 26, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 27, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 28, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 29, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 30, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 31, 14, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 3, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 4, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 5, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 6, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 7, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 8, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 9, 15)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 10, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 11, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 17, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 20, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 21, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 22, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 23, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 24, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 25, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 26, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 27, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 28, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 29, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 31, 15)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 2)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 3)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 4)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 5)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 6)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 7)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 8)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9)                       \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 10)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 11)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 12)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 18)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 22)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 23)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 24)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 25)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 26)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 27)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 28)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 29)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 30)                      \
+  X(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31)                      \
+  /* Testing the optimized shufflevectors for x86. */                          \
+  /* (Src0, Src0, Src0, Src0) */                                               \
+  X(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0)                            \
+  X(2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3)                            \
+  /* (Src0, Src0, Src0, Src1) */                                               \
+  X(0, 1, 3, 7, 0, 1, 3, 7, 0, 1, 3, 7, 0, 1, 3, 7)                            \
+  X(2, 2, 2, 5, 2, 2, 2, 5, 2, 2, 2, 5, 2, 2, 2, 5)                            \
+  /* (Src0, Src0, Src1, Src0) */                                               \
+  X(1, 2, 6, 0, 1, 2, 6, 0, 1, 2, 6, 0, 1, 2, 6, 0)                            \
+  X(3, 2, 3, 5, 3, 2, 3, 5, 3, 2, 3, 5, 3, 2, 3, 5)                            \
+  /* (Src0, Src0, Src1, Src1) */                                               \
+  X(2, 3, 5, 7, 2, 3, 5, 7, 2, 3, 5, 7, 2, 3, 5, 7)                            \
+  X(3, 3, 7, 6, 3, 3, 7, 6, 3, 3, 7, 6, 3, 3, 7, 6)                            \
+  /* (Src0, Src1, Src0, Src0) */                                               \
+  X(3, 7, 0, 0, 3, 7, 0, 0, 3, 7, 0, 0, 3, 7, 0, 0)                            \
+  X(1, 7, 3, 3, 1, 7, 3, 3, 1, 7, 3, 3, 1, 7, 3, 3)                            \
+  /* (Src0, Src1, Src0, Src1) */                                               \
+  X(0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5)                            \
+  X(0, 7, 0, 7, 0, 7, 0, 7, 0, 7, 0, 7, 0, 7, 0, 7)                            \
+  X(3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4)                            \
+  X(1, 7, 3, 6, 1, 7, 3, 6, 1, 7, 3, 6, 1, 7, 3, 6)                            \
+  X(0, 6, 3, 7, 0, 6, 3, 7, 0, 6, 3, 7, 0, 6, 3, 7)                            \
+  /* (Src0, Src1, Src1, Src0) */                                               \
+  X(0, 7, 7, 0, 0, 7, 7, 0, 0, 7, 7, 0, 0, 7, 7, 0)                            \
+  X(3, 4, 4, 3, 3, 4, 4, 3, 3, 4, 4, 3, 3, 4, 4, 3)                            \
+  X(1, 5, 3, 6, 1, 5, 3, 6, 1, 5, 3, 6, 1, 5, 3, 6)                            \
+  X(0, 6, 2, 6, 0, 6, 2, 6, 0, 6, 2, 6, 0, 6, 2, 6)                            \
+  /* (Src0, Src1, Src1, Src1) */                                               \
+  X(0, 7, 7, 7, 0, 7, 7, 7, 0, 7, 7, 7, 0, 7, 7, 7)                            \
+  X(3, 6, 7, 4, 3, 6, 7, 4, 3, 6, 7, 4, 3, 6, 7, 4)                            \
+  /* (Src1, Src0, Src0, Src0) */                                               \
+  X(4, 3, 3, 0, 4, 3, 3, 0, 4, 3, 3, 0, 4, 3, 3, 0)                            \
+  X(6, 0, 0, 3, 6, 0, 0, 3, 6, 0, 0, 3, 6, 0, 0, 3)                            \
+  /* (Src1, Src0, Src0, Src1) */                                               \
+  X(4, 3, 2, 6, 4, 3, 2, 6, 4, 3, 2, 6, 4, 3, 2, 6)                            \
+  X(5, 2, 1, 4, 5, 2, 1, 4, 5, 2, 1, 4, 5, 2, 1, 4)                            \
+  X(6, 0, 0, 4, 6, 0, 0, 4, 6, 0, 0, 4, 6, 0, 0, 4)                            \
+  X(5, 2, 2, 6, 5, 2, 2, 6, 5, 2, 2, 6, 5, 2, 2, 6)                            \
+  /* (Src1, Src0, Src1, Src0) */                                               \
+  X(4, 0, 5, 1, 4, 0, 5, 1, 4, 0, 5, 1, 4, 0, 5, 1)                            \
+  X(7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2, 7, 2)                            \
+  X(4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3)                            \
+  X(7, 1, 5, 0, 7, 1, 5, 0, 7, 1, 5, 0, 7, 1, 5, 0)                            \
+  X(4, 3, 6, 2, 4, 3, 6, 2, 4, 3, 6, 2, 4, 3, 6, 2)                            \
+  /* (Src1, Src0, Src1, Src1) */                                               \
+  X(6, 0, 3, 2, 6, 0, 3, 2, 6, 0, 3, 2, 6, 0, 3, 2)                            \
+  X(4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7)                            \
+  /* (Src1, Src1, Src0, Src0) */                                               \
+  X(6, 5, 2, 3, 6, 5, 2, 3, 6, 5, 2, 3, 6, 5, 2, 3)                            \
+  X(7, 7, 0, 1, 7, 7, 0, 1, 7, 7, 0, 1, 7, 7, 0, 1)                            \
+  /* (Src1, Src1, Src0, Src1) */                                               \
+  X(7, 6, 0, 5, 7, 6, 0, 5, 7, 6, 0, 5, 7, 6, 0, 5)                            \
+  X(4, 5, 3, 7, 4, 5, 3, 7, 4, 5, 3, 7, 4, 5, 3, 7)                            \
+  /* (Src1, Src1, Src1, Src0) */                                               \
+  X(6, 6, 4, 0, 6, 6, 4, 0, 6, 6, 4, 0, 6, 6, 4, 0)                            \
+  X(7, 4, 6, 1, 7, 4, 6, 1, 7, 4, 6, 1, 7, 4, 6, 1)                            \
+  /* (Src1, Src1, Src1, Src1) */                                               \
+  X(7, 6, 4, 4, 7, 6, 4, 4, 7, 6, 4, 4, 7, 6, 4, 4)                            \
+  X(5, 7, 7, 6, 5, 7, 7, 6, 5, 7, 7, 6, 5, 7, 7, 6)
+/* End of x86-optimized shufflevectors. */
+//#define X(...)
+
+// ShuffleVectorTest<VecTy> provides one shufflevector() overload per supported
+// element count (4, 8 or 16); std::enable_if keeps only the overload whose
+// count equals Vectors<Ty>::NumElements. Tests/NumTests are only declared here.
+template <typename VecTy> class ShuffleVectorTest {
+  //----------------------------------------------------------------------------
+  // V4??? Shuffles. Only Idx0..Idx3 are used; the twelve unnamed parameters
+  // absorb the rest of a 16-index table row. Each index is reduced modulo
+  // 2 * NumElements before it is passed to __builtin_shufflevector.
+  //----------------------------------------------------------------------------
+  template <typename Ty, uint8_t Idx0, uint8_t Idx1, uint8_t Idx2, uint8_t Idx3,
+            uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t,
+            uint8_t, uint8_t, uint8_t, uint8_t, uint8_t>
+  static typename std::enable_if<Vectors<Ty>::NumElements == 4, Ty>::type
+  shufflevector(Ty V1, Ty V2) {
+    const uint8_t NumElements = 4;
+    return __builtin_shufflevector(
+        V1, V2, Idx0 % (NumElements * 2), Idx1 % (NumElements * 2),
+        Idx2 % (NumElements * 2), Idx3 % (NumElements * 2));
+  }
+
+  //----------------------------------------------------------------------------
+  // V8??? Shuffles. Only Idx0..Idx7 are used; the eight unnamed parameters
+  // absorb the rest of a 16-index table row. Each index is reduced modulo
+  // 2 * NumElements before it is passed to __builtin_shufflevector.
+  //----------------------------------------------------------------------------
+  template <typename Ty, uint8_t Idx0, uint8_t Idx1, uint8_t Idx2, uint8_t Idx3,
+            uint8_t Idx4, uint8_t Idx5, uint8_t Idx6, uint8_t Idx7, uint8_t,
+            uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t>
+  static typename std::enable_if<Vectors<Ty>::NumElements == 8, Ty>::type
+  shufflevector(Ty V1, Ty V2) {
+    const uint8_t NumElements = 8;
+    return __builtin_shufflevector(
+        V1, V2, Idx0 % (NumElements * 2), Idx1 % (NumElements * 2),
+        Idx2 % (NumElements * 2), Idx3 % (NumElements * 2),
+        Idx4 % (NumElements * 2), Idx5 % (NumElements * 2),
+        Idx6 % (NumElements * 2), Idx7 % (NumElements * 2));
+  }
+
+  //----------------------------------------------------------------------------
+  // V16??? Shuffles. All sixteen indexes are used. Every index, Idx0 included,
+  // is reduced modulo 2 * NumElements so this overload matches the V4 and V8
+  // ones (the reduction is a no-op for indexes below 32).
+  //----------------------------------------------------------------------------
+  template <typename Ty, uint8_t Idx0, uint8_t Idx1, uint8_t Idx2, uint8_t Idx3,
+            uint8_t Idx4, uint8_t Idx5, uint8_t Idx6, uint8_t Idx7,
+            uint8_t Idx8, uint8_t Idx9, uint8_t Idx10, uint8_t Idx11,
+            uint8_t Idx12, uint8_t Idx13, uint8_t Idx14, uint8_t Idx15>
+  static typename std::enable_if<Vectors<Ty>::NumElements == 16, Ty>::type
+  shufflevector(Ty V1, Ty V2) {
+    const uint8_t NumElements = 16;
+    return __builtin_shufflevector(
+        V1, V2, Idx0 % (NumElements * 2), Idx1 % (NumElements * 2),
+        Idx2 % (NumElements * 2), Idx3 % (NumElements * 2),
+        Idx4 % (NumElements * 2), Idx5 % (NumElements * 2),
+        Idx6 % (NumElements * 2), Idx7 % (NumElements * 2),
+        Idx8 % (NumElements * 2), Idx9 % (NumElements * 2),
+        Idx10 % (NumElements * 2), Idx11 % (NumElements * 2),
+        Idx12 % (NumElements * 2), Idx13 % (NumElements * 2),
+        Idx14 % (NumElements * 2), Idx15 % (NumElements * 2));
+  }
+
+public:
+  typedef VecTy (*TestFn)(VecTy V1, VecTy V2);
+  static TestFn Tests[];
+  static const uint32_t NumTests;
+};
+
+template <typename Ty> // Tests[I] applies the I-th SHUFFLETESTS_TABLE row.
+typename ShuffleVectorTest<Ty>::TestFn ShuffleVectorTest<Ty>::Tests[] = {
+#define X(...) &ShuffleVectorTest<Ty>::shufflevector<Ty, __VA_ARGS__>,
+    SHUFFLETESTS_TABLE // Each row becomes one shufflevector<> address.
+#undef X
+};
+
+template <typename Ty> // NumTests is the number of SHUFFLETESTS_TABLE rows.
+const uint32_t ShuffleVectorTest<Ty>::NumTests = 0
+#define X(...) +1
+    SHUFFLETESTS_TABLE // Expands to one "+1" per row, summed onto the 0 above.
+#undef X
+    ;
+} // end of anonymous namespace
+
+extern "C" { // C-linkage wrappers; Which is used unchecked to index Tests[].
+#define X(Ty, ElmtTy, CastTy)                                                  \
+  TY(Ty) shufflevector_##Ty(TY(Ty) V1, TY(Ty) V2, uint32_t Which) {            \
+    return (*ShuffleVectorTest<TY(Ty)>::Tests[Which])(V1, V2);                 \
+  }                                                                            \
+  uint32_t shufflevector_count_##Ty() {                                        \
+    return ShuffleVectorTest<TY(Ty)>::NumTests;                                \
+  }
+VECTOR_TYPE_TABLE // Defined outside this file; presumably one row per type.
+#undef X
+
+#define X(I1Ty, Ty, numelements)                                               \
+  TY(I1Ty) shufflevector_##I1Ty(TY(I1Ty) V1, TY(I1Ty) V2, uint32_t Which) {    \
+    return (*ShuffleVectorTest<TY(I1Ty)>::Tests[Which])(V1, V2);               \
+  }                                                                            \
+  uint32_t shufflevector_count_##I1Ty() {                                      \
+    return ShuffleVectorTest<TY(I1Ty)>::NumTests;                              \
+  }
+I1_VECTOR_TYPE_TABLE // Same pair of wrappers for each row of the i1 table.
+#undef X
+} // end of extern "C"
diff --git a/crosstest/test_vector_ops.h b/crosstest/test_vector_ops.h
index 32903a9..e9e8737 100644
--- a/crosstest/test_vector_ops.h
+++ b/crosstest/test_vector_ops.h
@@ -18,8 +18,8 @@
 #include "vectors.h"
 
 // The VectorOps<> class acts like Vectors<> but also has insertelement,
-// Subzero_insertelement, extractelement, and Subzero_extractelement
-// fields.
+// Subzero_insertelement, extractelement, Subzero_extractelement,
+// shufflevector, Subzero_shufflevector, and shufflevector_count fields.
 
 template <typename T> struct VectorOps;
 #define FIELD(TYNAME, FIELDNAME) VectorOps<TYNAME>::FIELDNAME
@@ -28,15 +28,21 @@
 #define DECLARE_VECTOR_OPS(NAME)                                               \
   template <> struct VectorOps<NAME> : public Vectors<NAME> {                  \
     static Ty (*insertelement)(Ty, CastTy, int32_t);                           \
+    static Ty (*shufflevector)(Ty, Ty, uint32_t);                              \
     static CastTy (*extractelement)(Ty, int32_t);                              \
     static Ty (*Subzero_insertelement)(Ty, CastTy, int32_t);                   \
+    static Ty (*Subzero_shufflevector)(Ty, Ty, uint32_t);                      \
     static CastTy (*Subzero_extractelement)(Ty, int32_t);                      \
+    static uint32_t (*shufflevector_count)();                                  \
   };                                                                           \
   extern "C" {                                                                 \
   TY(NAME) insertelement_##NAME(TY(NAME), CASTTY(NAME), int32_t);              \
   TY(NAME) Subzero_insertelement_##NAME(TY(NAME), CASTTY(NAME), int32_t);      \
   CASTTY(NAME) extractelement_##NAME(TY(NAME), int32_t);                       \
   CASTTY(NAME) Subzero_extractelement_##NAME(TY(NAME), int32_t);               \
+  TY(NAME) shufflevector_##NAME(TY(NAME), TY(NAME), uint32_t);                 \
+  TY(NAME) Subzero_shufflevector_##NAME(TY(NAME), TY(NAME), uint32_t);         \
+  uint32_t shufflevector_count_##NAME();                                       \
   }                                                                            \
   TY(NAME) (*FIELD(NAME, insertelement))(TY(NAME), CASTTY(NAME), int32_t) =    \
       &insertelement_##NAME;                                                   \
@@ -45,7 +51,12 @@
   CASTTY(NAME) (*FIELD(NAME, extractelement))(TY(NAME), int32_t) =             \
       &extractelement_##NAME;                                                  \
   CASTTY(NAME) (*FIELD(NAME, Subzero_extractelement))(TY(NAME), int32_t) =     \
-      &Subzero_extractelement_##NAME;
+      &Subzero_extractelement_##NAME;                                          \
+  TY(NAME) (*FIELD(NAME, shufflevector))(TY(NAME), TY(NAME), uint32_t) =       \
+      &shufflevector_##NAME;                                                   \
+  TY(NAME) (*FIELD(NAME, Subzero_shufflevector))(                              \
+      TY(NAME), TY(NAME), uint32_t) = &Subzero_shufflevector_##NAME;           \
+  uint32_t (*FIELD(NAME, shufflevector_count))() = &shufflevector_count_##NAME;
 
 #define X(ty, eltty, castty) DECLARE_VECTOR_OPS(ty)
 VECTOR_TYPE_TABLE
diff --git a/crosstest/test_vector_ops.ll b/crosstest/test_vector_ops_ll.ll
similarity index 100%
rename from crosstest/test_vector_ops.ll
rename to crosstest/test_vector_ops_ll.ll
diff --git a/crosstest/test_vector_ops_main.cpp b/crosstest/test_vector_ops_main.cpp
index 1232799..9f27417 100644
--- a/crosstest/test_vector_ops_main.cpp
+++ b/crosstest/test_vector_ops_main.cpp
@@ -130,6 +130,41 @@
   free(TestVectors);
 }
 
+// Runs every shufflevector test case over all ordered pairs of test vectors
+// for type T and counts bitwise matches between the two implementations.
+template <typename T>
+void testShuffleVector(size_t &TotalTests, size_t &Passes, size_t &Failures) {
+  typedef typename VectorOps<T>::Ty Ty;
+
+  size_t NumTestVectors;
+  Ty *TestVectors = getTestVectors<T>(NumTestVectors);
+  const uint32_t NumShuffles = VectorOps<T>::shufflevector_count();
+
+  for (size_t VI = 0; VI < NumTestVectors; ++VI) {
+    Ty Vect0 = TestVectors[VI];
+    for (size_t VJ = 0; VJ < NumTestVectors; ++VJ) {
+      Ty Vect1 = TestVectors[VJ];
+      for (uint32_t Which = 0; Which < NumShuffles; ++Which) {
+        Ty ResultLlc = VectorOps<T>::shufflevector(Vect0, Vect1, Which);
+        Ty ResultSz = VectorOps<T>::Subzero_shufflevector(Vect0, Vect1, Which);
+        ++TotalTests;
+        if (!memcmp(&ResultLlc, &ResultSz, sizeof(ResultLlc))) {
+          ++Passes;
+        } else {
+          ++Failures;
+          std::cout << "shufflevector<" << VectorOps<T>::TypeName << ">(Vect0="
+                    << vectAsString<T>(Vect0) << ", Vect1="
+                    << vectAsString<T>(Vect1) << ", Which=" << Which << ")\n";
+          std::cout << "llc=" << vectAsString<T>(ResultLlc) << "\n";
+          std::cout << "sz =" << vectAsString<T>(ResultSz) << "\n";
+        }
+      }
+    }
+  }
+
+  free(TestVectors);
+}
+
 int main(int argc, char *argv[]) {
   size_t TotalTests = 0;
   size_t Passes = 0;
@@ -157,6 +192,17 @@
   testExtractElement<v4ui32>(TotalTests, Passes, Failures);
   testExtractElement<v4f32>(TotalTests, Passes, Failures);
 
+  testShuffleVector<v4i1>(TotalTests, Passes, Failures);
+  testShuffleVector<v8i1>(TotalTests, Passes, Failures);
+  testShuffleVector<v16i1>(TotalTests, Passes, Failures);
+  testShuffleVector<v16si8>(TotalTests, Passes, Failures);
+  testShuffleVector<v16ui8>(TotalTests, Passes, Failures);
+  testShuffleVector<v8si16>(TotalTests, Passes, Failures);
+  testShuffleVector<v8ui16>(TotalTests, Passes, Failures);
+  testShuffleVector<v4si32>(TotalTests, Passes, Failures);
+  testShuffleVector<v4ui32>(TotalTests, Passes, Failures);
+  testShuffleVector<v4f32>(TotalTests, Passes, Failures);
+
   std::cout << "TotalTests=" << TotalTests << " Passes=" << Passes
             << " Failures=" << Failures << "\n";
 
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 265085d..bd56dbc 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -432,6 +432,8 @@
   void pshufd(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
   void pshufd(Type Ty, XmmRegister dst, const Address &src,
               const Immediate &mask);
+  void punpckldq(Type, XmmRegister Dst, XmmRegister Src);
+  void punpckldq(Type, XmmRegister Dst, const Address &Src);
   void shufps(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
   void shufps(Type Ty, XmmRegister dst, const Address &src,
               const Immediate &mask);
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 1bf1550..a986515 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -1565,6 +1565,29 @@
 }
 
 template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::punpckldq(Type /* Ty */, XmmRegister Dst,
+                                             XmmRegister Src) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // Prefix byte; 66 0F 62 /r is PUNPCKLDQ xmm1, xmm2/m128.
+  emitRexRB(RexTypeIrrelevant, Dst, Src);
+  emitUint8(0x0F); // Opcode, first byte.
+  emitUint8(0x62); // Opcode, second byte.
+  emitXmmRegisterOperand(Dst, Src); // Both operands are xmm registers.
+}
+
+template <typename TraitsType>
+void AssemblerX86Base<TraitsType>::punpckldq(Type /* Ty */, XmmRegister Dst,
+                                             const Address &Src) {
+  AssemblerBuffer::EnsureCapacity ensured(&Buffer);
+  emitUint8(0x66); // Prefix byte; 66 0F 62 /r is PUNPCKLDQ xmm1, xmm2/m128.
+  emitAddrSizeOverridePrefix();
+  emitRex(RexTypeIrrelevant, Src, Dst);
+  emitUint8(0x0F); // Opcode, first byte.
+  emitUint8(0x62); // Opcode, second byte.
+  emitOperand(gprEncoding(Dst), Src); // Src is the memory operand.
+}
+
+template <typename TraitsType>
 void AssemblerX86Base<TraitsType>::shufps(Type /* Ty */, XmmRegister dst,
                                           XmmRegister src,
                                           const Immediate &imm) {
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 006d781..a0ff546 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -50,8 +50,8 @@
 
 const TargetX8632Traits::TypeAttributesType
     TargetX8632Traits::TypeAttributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld)             \
-  { cvt, sdss, pdps, spsd, pack, width, fld }                                  \
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+  { cvt, sdss, pdps, spsd, pack, unpack, width, fld }                          \
   ,
         ICETYPEX8632_TABLE
 #undef X
diff --git a/src/IceInstX8632.def b/src/IceInstX8632.def
index 0ed4b80..173164f 100644
--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -212,22 +212,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8632_TABLE                                                     \
-  /* tag,  element type, cvt , sdss, pdps, spsd, pack, width, fld */           \
-  X(void,  void,         "?",  "",   "",   "",   "",   "",    "")              \
-  X(i1,    void,         "si", "",   "",   "",   "",   "b",   "")              \
-  X(i8,    void,         "si", "",   "",   "",   "",   "b",   "")              \
-  X(i16,   void,         "si", "",   "",   "",   "",   "w",   "")              \
-  X(i32,   void,         "si", "",   "",   "",   "",   "l",   "")              \
-  X(i64,   void,         "si", "",   "",   "",   "",   "q",   "")              \
-  X(f32,   void,         "ss", "ss", "ps", "ss", "d",  "",    "s")             \
-  X(f64,   void,         "sd", "sd", "pd", "sd", "q",  "",    "l")             \
-  X(v4i1,  i32,          "?",  "",   "",   "",   "d",  "",    "")              \
-  X(v8i1,  i16,          "?",  "",   "",   "",   "w",  "",    "")              \
-  X(v16i1, i8,           "?",  "",   "",   "",   "b",  "",    "")              \
-  X(v16i8, i8,           "?",  "",   "",   "",   "b",  "",    "")              \
-  X(v8i16, i16,          "?",  "",   "",   "",   "w",  "",    "")              \
-  X(v4i32, i32,          "dq", "",   "",   "",   "d",  "",    "")              \
-  X(v4f32, f32,          "ps", "",   "ps", "ps", "d",  "",    "")
-//#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld)
+  /* tag,  element type, cvt , sdss, pdps, spsd, pack, unpack, width, fld */   \
+  X(void,  void,         "?",  "",   "",   "",   "",   "",     "",    "")      \
+  X(i1,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
+  X(i8,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
+  X(i16,   void,         "si", "",   "",   "",   "",   "",     "w",   "")      \
+  X(i32,   void,         "si", "",   "",   "",   "",   "",     "l",   "")      \
+  X(i64,   void,         "si", "",   "",   "",   "",   "",     "q",   "")      \
+  X(f32,   void,         "ss", "ss", "ps", "ss", "d",  "",     "",    "s")     \
+  X(f64,   void,         "sd", "sd", "pd", "sd", "q",  "",     "",    "l")     \
+  X(v4i1,  i32,          "?",  "",   "",   "",   "d",  "dq",   "",    "")      \
+  X(v8i1,  i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
+  X(v16i1, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
+  X(v16i8, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
+  X(v8i16, i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
+  X(v4i32, i32,          "dq", "",   "",   "",   "d",  "dq",   "",    "")      \
+  X(v4f32, f32,          "ps", "",   "ps", "ps", "d",  "dq",   "",    "")
+//#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
diff --git a/src/IceInstX8664.cpp b/src/IceInstX8664.cpp
index 0915bc8..afb4580 100644
--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -51,8 +51,8 @@
 
 const TargetX8664Traits::TypeAttributesType
     TargetX8664Traits::TypeAttributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld)             \
-  { cvt, sdss, pdps, spsd, pack, width, fld }                                  \
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+  { cvt, sdss, pdps, spsd, pack, unpack, width, fld }                          \
   ,
         ICETYPEX8664_TABLE
 #undef X
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index 8ed221d..ad686c7 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -293,22 +293,22 @@
 //#define X(val, emit)
 
 #define ICETYPEX8664_TABLE                                                     \
-  /* tag,  element type, cvt , sdss, pdps, spsd, pack, width, fld */           \
-  X(void,  void,         "?",  "",   "",   "",   "",   "",    "")              \
-  X(i1,    void,         "si", "",   "",   "",   "",   "b",   "")              \
-  X(i8,    void,         "si", "",   "",   "",   "",   "b",   "")              \
-  X(i16,   void,         "si", "",   "",   "",   "",   "w",   "")              \
-  X(i32,   void,         "si", "",   "",   "",   "",   "l",   "")              \
-  X(i64,   void,         "si", "",   "",   "",   "",   "q",   "")              \
-  X(f32,   void,         "ss", "ss", "ps", "ss", "d",  "",    "s")             \
-  X(f64,   void,         "sd", "sd", "pd", "sd", "q",  "",    "l")             \
-  X(v4i1,  i32,          "?",  "",   "",   "",   "d",  "",    "")              \
-  X(v8i1,  i16,          "?",  "",   "",   "",   "w",  "",    "")              \
-  X(v16i1, i8,           "?",  "",   "",   "",   "b",  "",    "")              \
-  X(v16i8, i8,           "?",  "",   "",   "",   "b",  "",    "")              \
-  X(v8i16, i16,          "?",  "",   "",   "",   "w",  "",    "")              \
-  X(v4i32, i32,          "dq", "",   "",   "",   "d",  "",    "")              \
-  X(v4f32, f32,          "ps", "",   "ps", "ps", "d",  "",    "")
-//#define X(tag, elementty, cvt, sdss, pdps, pack, width, fld)
+  /* tag,  element type, cvt , sdss, pdps, spsd, pack, unpack, width, fld */   \
+  X(void,  void,         "?",  "",   "",   "",   "",   "",     "",    "")      \
+  X(i1,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
+  X(i8,    void,         "si", "",   "",   "",   "",   "",     "b",   "")      \
+  X(i16,   void,         "si", "",   "",   "",   "",   "",     "w",   "")      \
+  X(i32,   void,         "si", "",   "",   "",   "",   "",     "l",   "")      \
+  X(i64,   void,         "si", "",   "",   "",   "",   "",     "q",   "")      \
+  X(f32,   void,         "ss", "ss", "ps", "ss", "d",  "",     "",    "s")     \
+  X(f64,   void,         "sd", "sd", "pd", "sd", "q",  "",     "",    "l")     \
+  X(v4i1,  i32,          "?",  "",   "",   "",   "d",  "dq",   "",    "")      \
+  X(v8i1,  i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
+  X(v16i1, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
+  X(v16i8, i8,           "?",  "",   "",   "",   "b",  "bw",   "",    "")      \
+  X(v8i16, i16,          "?",  "",   "",   "",   "w",  "wd",   "",    "")      \
+  X(v4i32, i32,          "dq", "",   "",   "",   "d",  "dq",   "",    "")      \
+  X(v4f32, f32,          "ps", "",   "ps", "ps", "d",  "dq",   "",    "")
+//#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)
 
 #endif // SUBZERO_SRC_ICEINSTX8664_DEF
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 1c2e8e6..c29538a 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -143,6 +143,7 @@
       Pop,
       Por,
       Pshufd,
+      Punpckl,
       Psll,
       Psra,
       Psrl,
@@ -183,7 +184,7 @@
       IacaEnd
     };
 
-    enum SseSuffix { None, Packed, Scalar, Integral };
+    enum SseSuffix { None, Packed, Unpack, Scalar, Integral };
 
     static const char *getWidthString(Type Ty);
     static const char *getFldString(Type Ty);
@@ -841,6 +842,9 @@
       case InstX86Base::SseSuffix::Packed:
         SuffixString = Traits::TypeAttributes[DestTy].PdPsString;
         break;
+      case InstX86Base::SseSuffix::Unpack:
+        SuffixString = Traits::TypeAttributes[DestTy].UnpackString;
+        break;
       case InstX86Base::SseSuffix::Scalar:
         SuffixString = Traits::TypeAttributes[DestTy].SdSsString;
         break;
@@ -2839,6 +2843,23 @@
   private:
     InstX86IacaEnd(Cfg *Func);
   };
+
+  class InstX86Punpckl // Opcode "punpckl"; suffix kind is SseSuffix::Unpack.
+      : public InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
+                                   InstX86Base::SseSuffix::Unpack> {
+  public:
+    static InstX86Punpckl *create(Cfg *Func, Variable *Dest, Operand *Source) {
+      return new (Func->allocate<InstX86Punpckl>())
+          InstX86Punpckl(Func, Dest, Source);
+    }
+    // The constructor is private: instances are built through create().
+  private:
+    InstX86Punpckl(Cfg *Func, Variable *Dest, Operand *Source)
+        : InstX86BaseBinopXmm<InstX86Base::Punpckl, false,
+                              InstX86Base::SseSuffix::Unpack>(Func, Dest,
+                                                              Source) {}
+  };
+
 }; // struct InstImpl
 
 /// struct Insts is a template that can be used to instantiate all the X86
@@ -2960,6 +2981,8 @@
 
   using IacaStart = typename InstImpl<TraitsType>::InstX86IacaStart;
   using IacaEnd = typename InstImpl<TraitsType>::InstX86IacaEnd;
+
+  using Punpckl = typename InstImpl<TraitsType>::InstX86Punpckl;
 };
 
 /// X86 Instructions have static data (particularly, opcodes and instruction
@@ -3189,6 +3212,9 @@
   template <>                                                                  \
   template <>                                                                  \
   const char *InstImpl<TraitsType>::InstX86Pshufd::Base::Opcode = "pshufd";    \
+  template <>                                                                  \
+  template <>                                                                  \
+  const char *InstImpl<TraitsType>::InstX86Punpckl::Base::Opcode = "punpckl";  \
   /* Inplace GPR ops */                                                        \
   template <>                                                                  \
   template <>                                                                  \
@@ -3550,6 +3576,12 @@
           &InstImpl<TraitsType>::Assembler::psrl,                              \
           &InstImpl<TraitsType>::Assembler::psrl,                              \
           &InstImpl<TraitsType>::Assembler::psrl};                             \
+  template <>                                                                  \
+  template <>                                                                  \
+  const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp                       \
+      InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = {                  \
+          &InstImpl<TraitsType>::Assembler::punpckldq,                         \
+          &InstImpl<TraitsType>::Assembler::punpckldq};                        \
   }                                                                            \
   }
 
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index f69d19a..a612268 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -101,7 +101,7 @@
 
 const TargetX8632Traits::TableTypeX8632AttributesType
     TargetX8632Traits::TableTypeX8632Attributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld)             \
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
   { IceType_##elementty }                                                      \
   ,
         ICETYPEX8632_TABLE
@@ -459,7 +459,8 @@
 namespace dummy3 {
 // Define a temporary set of enum values based on low-level table entries.
 enum _tmp_enum {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld) _tmp_##tag,
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+  _tmp_##tag,
   ICETYPEX8632_TABLE
 #undef X
       _num
@@ -471,7 +472,7 @@
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
 // table entry keys are consistent.
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld)             \
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
   static const int _table2_##tag = _tmp_##tag;                                 \
   static_assert(_table1_##tag == _table2_##tag,                                \
                 "Inconsistency between ICETYPEX8632_TABLE and ICETYPE_TABLE");
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index 554b916..2716a34 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -976,13 +976,14 @@
   } InstCmppsAttributes[];
 
   static const struct TypeAttributesType {
-    const char *CvtString;   // i (integer), s (single FP), d (double FP)
-    const char *SdSsString;  // ss, sd, or <blank>
-    const char *PdPsString;  // ps, pd, or <blank>
-    const char *SpsdString;  // ss, sd, ps, pd, or <blank>
-    const char *PackString;  // b, w, d, or <blank>
-    const char *WidthString; // b, w, l, q, or <blank>
-    const char *FldString;   // s, l, or <blank>
+    const char *CvtString;    // i (integer), s (single FP), d (double FP)
+    const char *SdSsString;   // ss, sd, or <blank>
+    const char *PdPsString;   // ps, pd, or <blank>
+    const char *SpsdString;   // ss, sd, ps, pd, or <blank>
+    const char *PackString;   // b, w, d, or <blank>
+    const char *UnpackString; // bw, wd, dq, or <blank>
+    const char *WidthString;  // b, w, l, q, or <blank>
+    const char *FldString;    // s, l, or <blank>
   } TypeAttributes[];
 
   static const char *InstSegmentRegNames[];
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 33606d4..73ad386 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -94,7 +94,7 @@
 
 const TargetX8664Traits::TableTypeX8664AttributesType
     TargetX8664Traits::TableTypeX8664Attributes[] = {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld)             \
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
   { IceType_##elementty }                                                      \
   ,
         ICETYPEX8664_TABLE
@@ -787,7 +787,8 @@
 namespace dummy3 {
 // Define a temporary set of enum values based on low-level table entries.
 enum _tmp_enum {
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld) _tmp_##tag,
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
+  _tmp_##tag,
   ICETYPEX8664_TABLE
 #undef X
       _num
@@ -799,7 +800,7 @@
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
 // table entry keys are consistent.
-#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, width, fld)             \
+#define X(tag, elementty, cvt, sdss, pdps, spsd, pack, unpack, width, fld)     \
   static const int _table2_##tag = _tmp_##tag;                                 \
   static_assert(_table1_##tag == _table2_##tag,                                \
                 "Inconsistency between ICETYPEX8664_TABLE and ICETYPE_TABLE");
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index 0fc4800..4e9173a 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -1021,13 +1021,14 @@
   } InstCmppsAttributes[];
 
   static const struct TypeAttributesType {
-    const char *CvtString;   // i (integer), s (single FP), d (double FP)
-    const char *SdSsString;  // ss, sd, or <blank>
-    const char *PdPsString;  // ps, pd, or <blank>
-    const char *SpSdString;  // ss, sd, ps, pd, or <blank>
-    const char *PackString;  // b, w, d, or <blank>
-    const char *WidthString; // b, w, l, q, or <blank>
-    const char *FldString;   // s, l, or <blank>
+    const char *CvtString;    // i (integer), s (single FP), d (double FP)
+    const char *SdSsString;   // ss, sd, or <blank>
+    const char *PdPsString;   // ps, pd, or <blank>
+    const char *SpSdString;   // ss, sd, ps, pd, or <blank>
+    const char *PackString;   // b, w, d, or <blank>
+    const char *UnpackString; // bw, wd, dq, or <blank>
+    const char *WidthString;  // b, w, l, q, or <blank>
+    const char *FldString;    // s, l, or <blank>
   } TypeAttributes[];
 };
 
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index f84c6df..71b824f 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -801,6 +801,10 @@
     AutoMemorySandboxer<> _(this, &Dest, &Src0);
     Context.insert<typename Traits::Insts::Por>(Dest, Src0);
   }
+  void _punpckl(Variable *Dest, Operand *Src0) {
+    AutoMemorySandboxer<> _(this, &Dest, &Src0);
+    Context.insert<typename Traits::Insts::Punpckl>(Dest, Src0);
+  }
   void _pshufd(Variable *Dest, Operand *Src0, Operand *Src1) {
     AutoMemorySandboxer<> _(this, &Dest, &Src0, &Src1);
     Context.insert<typename Traits::Insts::Pshufd>(Dest, Src0, Src1);
@@ -1082,6 +1086,23 @@
 
   BoolFolding<Traits> FoldingInfo;
 
+  /// Helpers for lowering ShuffleVector
+  /// @{
+  Variable *lowerShuffleVector_AllFromSameSrc(Variable *Src, SizeT Index0,
+                                              SizeT Index1, SizeT Index2,
+                                              SizeT Index3);
+  static constexpr SizeT IGNORE_INDEX = 0x80000000u;
+  Variable *lowerShuffleVector_TwoFromSameSrc(Variable *Src0, SizeT Index0,
+                                              SizeT Index1, Variable *Src1,
+                                              SizeT Index2, SizeT Index3);
+  static constexpr SizeT UNIFIED_INDEX_0 = 0;
+  static constexpr SizeT UNIFIED_INDEX_1 = 2;
+  Variable *lowerShuffleVector_UnifyFromDifferentSrcs(Variable *Src0,
+                                                      SizeT Index0,
+                                                      Variable *Src1,
+                                                      SizeT Index1);
+  /// @}
+
   static FixupKind PcRelFixup;
   static FixupKind AbsFixup;
 };
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index d96c0ca..35d7ea0 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -5610,25 +5610,295 @@
   keepEspLiveAtExit();
 }
 
+inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
+                               SizeT Index3) {
+  const SizeT Lo = (Index0 & 0x3) | ((Index1 & 0x3) << 2); // lanes 0 and 1
+  const SizeT Hi = ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6); // lanes 2, 3
+  assert((Lo | Hi) < 256);
+  return Lo | Hi;
+}
+
+template <typename TraitsType>
+Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
+    Variable *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2; // Must match across all four indexes.
+  assert((Index0 & SrcBit) == (Index1 & SrcBit));
+  assert((Index0 & SrcBit) == (Index2 & SrcBit));
+  assert((Index0 & SrcBit) == (Index3 & SrcBit));
+  (void)SrcBit; // Only read by the asserts above.
+  // Emit a single pshufd of Src into a new register, which is returned.
+  const Type SrcTy = Src->getType();
+  auto *T = makeReg(SrcTy);
+  auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
+  auto *Mask =
+      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
+  _pshufd(T, SrcRM, Mask);
+  return T;
+}
+
+template <typename TraitsType>
+Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
+    Variable *Src0, SizeT Index0, SizeT Index1, Variable *Src1, SizeT Index2,
+    SizeT Index3) {
+  constexpr SizeT SrcBit = 1 << 2; // IGNORE_INDEX lanes are exempt below.
+  assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
+  assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
+  (void)SrcBit; // Only read by the asserts above.
+  // T = Src0; shufps T, Src1: lanes 0-1 come from Src0, lanes 2-3 from Src1.
+  const Type SrcTy = Src0->getType();
+  assert(Src1->getType() == SrcTy);
+  auto *T = makeReg(SrcTy);
+  auto *Src0R = legalizeToReg(Src0);
+  auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+  auto *Mask =
+      Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
+  _movp(T, Src0R);
+  _shufps(T, Src1RM, Mask);
+  return T;
+}
+
+template <typename TraitsType> // Result lanes 0 and 2 hold the two picks.
+Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
+    Variable *Src0, SizeT Index0, Variable *Src1, SizeT Index1) {
+  return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
+                                           Index1, IGNORE_INDEX);
+}
+
+inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
+                               SizeT Index3) {
+  // Result bit N is set exactly when IndexN has bit 2 (value 4) set.
+  const SizeT Lane0 = (Index0 >> 2) & 1;
+  const SizeT Lane1 = (Index1 >> 2) & 1;
+  const SizeT Lane2 = (Index2 >> 2) & 1;
+  const SizeT Lane3 = (Index3 >> 2) & 1;
+  return Lane0 | (Lane1 << 1) | (Lane2 << 2) | (Lane3 << 3);
+}
+
 template <typename TraitsType>
 void TargetX86Base<TraitsType>::lowerShuffleVector(
     const InstShuffleVector *Instr) {
   auto *Dest = Instr->getDest();
   const Type DestTy = Dest->getType();
+  auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
+  auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
+  const SizeT NumElements = typeNumElements(DestTy);
 
   auto *T = makeReg(DestTy);
 
   switch (DestTy) {
   default:
     break;
-    // TODO(jpp): figure out how to properly lower this without scalarization.
+  // TODO(jpp): figure out how to properly lower the remaining cases without
+  // scalarization.
+  case IceType_v4i1:
+  case IceType_v4i32:
+  case IceType_v4f32: {
+    static constexpr SizeT ExpectedNumElements = 4;
+    assert(ExpectedNumElements == Instr->getNumIndexes());
+    const SizeT Index0 = Instr->getIndex(0)->getValue();
+    const SizeT Index1 = Instr->getIndex(1)->getValue();
+    const SizeT Index2 = Instr->getIndex(2)->getValue();
+    const SizeT Index3 = Instr->getIndex(3)->getValue();
+    Variable *T = nullptr;
+    switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
+#define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
+  case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
+      CASE_SRCS_IN(0, 0, 0, 0) : {
+        T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
+                                              Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 0, 1) : {
+        assert(false && "Following code is untested but likely correct; test "
+                        "and remove assert.");
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
+                                                                  Src1, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 1, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
+                                                                  Src0, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(0, 0, 1, 1) : {
+        assert(false && "Following code is untested but likely correct; test "
+                        "and remove assert.");
+        T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
+                                              Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 0, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
+                                                                  Src1, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 0, 1) : {
+        if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
+            (Index3 - ExpectedNumElements) == 1) {
+          assert(false && "Following code is untested but likely correct; test "
+                          "and remove assert.");
+          auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
+          auto *Src0R = legalizeToReg(Src0);
+          T = makeReg(DestTy);
+          _movp(T, Src0R);
+          _punpckl(T, Src1RM);
+        } else if (Index0 == Index2 && Index1 == Index3) {
+          assert(false && "Following code is untested but likely correct; test "
+                          "and remove assert.");
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
+              UNIFIED_INDEX_1);
+        } else {
+          assert(false && "Following code is untested but likely correct; test "
+                          "and remove assert.");
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index2, Src1, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 1, 0) : {
+        if (Index0 == Index3 && Index1 == Index2) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
+              UNIFIED_INDEX_0);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index0, Src1, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index2, Src0, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(0, 1, 1, 1) : {
+        assert(false && "Following code is untested but likely correct; test "
+                        "and remove assert.");
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
+                                                                  Src1, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 0, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
+                                                                  Src0, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 0, 1) : {
+        if (Index0 == Index3 && Index1 == Index2) {
+          assert(false && "Following code is untested but likely correct; test "
+                          "and remove assert.");
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
+              UNIFIED_INDEX_0);
+        } else {
+          assert(false && "Following code is untested but likely correct; test "
+                          "and remove assert.");
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src0, Index2, Src1, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 1, 0) : {
+        if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
+            (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
+          auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
+          auto *Src0R = legalizeToReg(Src1);
+          T = makeReg(DestTy);
+          _movp(T, Src0R);
+          _punpckl(T, Src1RM);
+        } else if (Index0 == Index2 && Index1 == Index3) {
+          auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          T = lowerShuffleVector_AllFromSameSrc(
+              Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
+              UNIFIED_INDEX_1);
+        } else {
+          auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index0, Src0, Index1);
+          auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
+              Src1, Index2, Src0, Index3);
+          T = lowerShuffleVector_TwoFromSameSrc(
+              Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
+              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+        }
+      }
+      break;
+      CASE_SRCS_IN(1, 0, 1, 1) : {
+        assert(false && "Following code is untested but likely correct; test "
+                        "and remove assert.");
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
+                                                                  Src0, Index1);
+        T = lowerShuffleVector_TwoFromSameSrc(
+            Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 0, 0) : {
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
+                                              Index2, Index3);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 0, 1) : {
+        assert(false && "Following code is untested but likely correct; test "
+                        "and remove assert.");
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
+                                                                  Src1, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 1, 0) : {
+        auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
+                                                                  Src0, Index3);
+        T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
+                                              UNIFIED_INDEX_0, UNIFIED_INDEX_1);
+      }
+      break;
+      CASE_SRCS_IN(1, 1, 1, 1) : {
+        assert(false && "Following code is untested but likely correct; test "
+                        "and remove assert.");
+        T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
+                                              Index3);
+      }
+      break;
+#undef CASE_SRCS_IN
+    }
+
+    assert(T != nullptr);
+    assert(T->getType() == DestTy);
+    _movp(Dest, T);
+    return;
+  } break;
   }
 
   // Unoptimized shuffle. Perform a series of inserts and extracts.
   Context.insert<InstFakeDef>(T);
-  auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
-  auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
-  const SizeT NumElements = typeNumElements(DestTy);
   const Type ElementType = typeElementType(DestTy);
   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
     auto *Index = Instr->getIndex(I);
diff --git a/tests_lit/llvm2ice_tests/vector-shuffle.ll b/tests_lit/llvm2ice_tests/vector-shuffle.ll
new file mode 100644
index 0000000..62fbc9d
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-shuffle.ll
@@ -0,0 +1,53 @@
+; Some shufflevector optimized lowering. This list is by no means exhaustive. It
+; is only a **basic** smoke test. The vector_ops crosstest has a broader range
+; of test cases.
+
+; RUN: %p2i -i %s --target=x8632 --filetype=obj --disassemble -a -O2 \
+; RUN:     --allow-externally-defined-symbols | FileCheck %s --check-prefix=X86
+
+declare void @useV4I32(<4 x i32> %t);
+
+define internal void @shuffleV4I32(<4 x i32> %a, <4 x i32> %b) {
+; X86-LABEL: shuffleV4I32
+  %a_0 = extractelement <4 x i32> %a, i32 0
+  %a_1 = extractelement <4 x i32> %a, i32 1
+  %a_2 = extractelement <4 x i32> %a, i32 2
+  %a_3 = extractelement <4 x i32> %a, i32 3
+
+  %b_0 = extractelement <4 x i32> %b, i32 0
+  %b_1 = extractelement <4 x i32> %b, i32 1
+  %b_2 = extractelement <4 x i32> %b, i32 2
+  %b_3 = extractelement <4 x i32> %b, i32 3
+
+  %t0_0 = insertelement <4 x i32> undef, i32 %a_0, i32 0
+  %t0_1 = insertelement <4 x i32> %t0_0, i32 %b_0, i32 1
+  %t0_2 = insertelement <4 x i32> %t0_1, i32 %a_1, i32 2
+  %t0   = insertelement <4 x i32> %t0_2, i32 %b_1, i32 3
+; X86: punpckldq {{.*}}
+
+  call void @useV4I32(<4 x i32> %t0)
+; X86: call
+
+  %t1_0 = insertelement <4 x i32> undef, i32 %a_0, i32 0
+  %t1_1 = insertelement <4 x i32> %t1_0, i32 %b_1, i32 1
+  %t1_2 = insertelement <4 x i32> %t1_1, i32 %b_1, i32 2
+  %t1   = insertelement <4 x i32> %t1_2, i32 %a_0, i32 3
+; X86: shufps [[T:xmm[0-9]+]],{{.*}},0x10
+; X86: pshufd {{.*}},[[T]],0x28
+
+  call void @useV4I32(<4 x i32> %t1)
+; X86: call
+
+  %t2_0 = insertelement <4 x i32> undef, i32 %a_0, i32 0
+  %t2_1 = insertelement <4 x i32> %t2_0, i32 %b_3, i32 1
+  %t2_2 = insertelement <4 x i32> %t2_1, i32 %a_2, i32 2
+  %t2   = insertelement <4 x i32> %t2_2, i32 %b_2, i32 3
+; X86: shufps {{.*}},0x30
+; X86: shufps {{.*}},0x22
+; X86: shufps {{.*}},0x88
+
+  call void @useV4I32(<4 x i32> %t2)
+; X86: call
+
+  ret void
+}
diff --git a/unittest/AssemblerX8632/XmmArith.cpp b/unittest/AssemblerX8632/XmmArith.cpp
index cf05b79..d19fcb8 100644
--- a/unittest/AssemblerX8632/XmmArith.cpp
+++ b/unittest/AssemblerX8632/XmmArith.cpp
@@ -995,27 +995,6 @@
     reset();                                                                   \
   } while (0)
 
-#define TestImplSingleXmmXmmUntyped(Dst, Src, Inst)                            \
-  do {                                                                         \
-    static constexpr char TestString[] =                                       \
-        "(" #Dst ", " #Src ", " #Inst ", Untyped)";                            \
-    const uint32_t T0 = allocateDqword();                                      \
-    const uint32_t T1 = allocateDqword();                                      \
-                                                                               \
-    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
-    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
-    __ Inst(XmmRegister::Encoded_Reg_##Dst, XmmRegister::Encoded_Reg_##Src,    \
-            Immediate(Inst##Imm));                                             \
-                                                                               \
-    AssembledTest test = assemble();                                           \
-    test.setDqwordTo(T0, V0);                                                  \
-    test.setDqwordTo(T1, V1);                                                  \
-    test.run();                                                                \
-                                                                               \
-    ASSERT_EQ(Inst##UntypedExpected, test.Dst<Dqword>()) << TestString;        \
-    reset();                                                                   \
-  } while (0)
-
 #define TestImpl(Dst, Src)                                                     \
   do {                                                                         \
     TestImplSingleXmmXmm(Dst, Src, pshufd);                                    \
@@ -1034,11 +1013,77 @@
   TestImpl(xmm7, xmm0);
 
 #undef TestImpl
-#undef TestImplSingleXmmXmmUntyped
 #undef TestImplSingleXmmAddr
 #undef TestImplSingleXmmXmm
 }
 
+TEST_F(AssemblerX8632Test, Punpckldq) {
+  const Dqword V0(uint64_t(0x1111111122222222ull),
+                  uint64_t(0x5555555577777777ull));
+  const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
+                  uint64_t(0xCCCCCCCCDDDDDDDDull));
+
+  const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
+                        uint64_t(0xAAAAAAAA11111111ull));
+
+#define TestImplXmmXmm(Dst, Src, Inst)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplXmmAddr(Dst, Inst)                                             \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, punpckldq);                                       \
+    TestImplXmmAddr(Dst, punpckldq);                                           \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
 TEST_F(AssemblerX8632Test, Cvt) {
   const Dqword dq2ps32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
   const Dqword dq2ps32SrcValue(-5, 3, 100, 200);
diff --git a/unittest/AssemblerX8664/XmmArith.cpp b/unittest/AssemblerX8664/XmmArith.cpp
index 6a0d9f5..c037520 100644
--- a/unittest/AssemblerX8664/XmmArith.cpp
+++ b/unittest/AssemblerX8664/XmmArith.cpp
@@ -1083,6 +1083,81 @@
 #undef TestImplSingleXmmXmm
 }
 
+TEST_F(AssemblerX8664Test, Punpckldq) {
+  const Dqword V0(uint64_t(0x1111111122222222ull),
+                  uint64_t(0x5555555577777777ull));
+  const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
+                  uint64_t(0xCCCCCCCCDDDDDDDDull));
+
+  const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
+                        uint64_t(0xAAAAAAAA11111111ull));
+
+#define TestImplXmmXmm(Dst, Src, Inst)                                         \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")";    \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst,                      \
+            XmmRegister::Encoded_Reg_##Src);                                   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImplXmmAddr(Dst, Inst)                                             \
+  do {                                                                         \
+    static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")";        \
+    const uint32_t T0 = allocateDqword();                                      \
+    const uint32_t T1 = allocateDqword();                                      \
+                                                                               \
+    __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0));               \
+    __ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1));   \
+                                                                               \
+    AssembledTest test = assemble();                                           \
+    test.setDqwordTo(T0, V0);                                                  \
+    test.setDqwordTo(T1, V1);                                                  \
+    test.run();                                                                \
+                                                                               \
+    ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString;                     \
+    reset();                                                                   \
+  } while (0)
+
+#define TestImpl(Dst, Src)                                                     \
+  do {                                                                         \
+    TestImplXmmXmm(Dst, Src, punpckldq);                                       \
+    TestImplXmmAddr(Dst, punpckldq);                                           \
+  } while (0)
+
+  TestImpl(xmm0, xmm1);
+  TestImpl(xmm1, xmm2);
+  TestImpl(xmm2, xmm3);
+  TestImpl(xmm3, xmm4);
+  TestImpl(xmm4, xmm5);
+  TestImpl(xmm5, xmm6);
+  TestImpl(xmm6, xmm7);
+  TestImpl(xmm7, xmm8);
+  TestImpl(xmm8, xmm9);
+  TestImpl(xmm9, xmm10);
+  TestImpl(xmm10, xmm11);
+  TestImpl(xmm11, xmm12);
+  TestImpl(xmm12, xmm13);
+  TestImpl(xmm13, xmm14);
+  TestImpl(xmm14, xmm15);
+  TestImpl(xmm15, xmm0);
+
+#undef TestImpl
+#undef TestImplXmmAddr
+#undef TestImplXmmXmm
+}
+
 TEST_F(AssemblerX8664Test, Cvt) {
   const Dqword dq2ps32DstValue(-1.0f, -1.0f, -1.0f, -1.0f);
   const Dqword dq2ps32SrcValue(-5, 3, 100, 200);