Add utility function to transpose BGRA data

The current transpose4x4() function assumes that the input and
output channels are in the same order. When using BGRA, however,
we want to reorder the channels so that the output is:
B0G0R0A0, B1G1R1A1, B2G2R2A2, B3G3R3A3
This change adds a new transpose4x4zyxw() utility function for
that purpose.

Bug: b/204322086
Change-Id: Ic73118c8fb3ba307620041063863146e5053e263
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/69769
Tested-by: Alexis Hétu <sugoi@google.com>
Commit-Queue: Alexis Hétu <sugoi@google.com>
Reviewed-by: Jonah Ryan-Davis <jonahr@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 28c1e4b..ac9f91f 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -777,6 +777,19 @@
 	row3 = Float4(tmp2.zw, tmp3.zw);
 }
 
+void transpose4x4zyxw(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
+{
+	Float4 lo01 = UnpackLow(row0, row1);
+	Float4 lo23 = UnpackLow(row2, row3);
+	Float4 hi01 = UnpackHigh(row0, row1);
+	Float4 hi23 = UnpackHigh(row2, row3);
+	// zyxw order: the low-unpack .xy pairs go to row2 and the high-unpack .xy pairs to row0.
+	row2 = Float4(lo01.xy, lo23.xy);
+	row1 = Float4(lo01.zw, lo23.zw);
+	row0 = Float4(hi01.xy, hi23.xy);
+	row3 = Float4(hi01.zw, hi23.zw);
+}
+
 void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3)
 {
 	Float4 tmp0 = UnpackLow(row0, row1);
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index c49b01f..894291a 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -173,6 +173,7 @@
 void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
 void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
 void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
+void transpose4x4zyxw(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);  // In-place 4x4 transpose for BGRA data: reorders channels so each output row is B, G, R, A.
 void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
 void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);