Replace sw::SIMD aliases of 4-wide vectors with rr::SIMD types
sw::SIMD::Float was an alias of rr::Float4, but is now rr::SIMD::Float.
Likewise for the Int and UInt counterparts. rr::Pointer4 has become
rr::SIMD::Pointer.
SIMD::Width is set to 4 for both backends, so functionally nothing
changes and none of the newly added asserts are triggered.
The new SIMD::Float4 type is equivalent to Vector4f except each
component is a SIMD vector. The latter is still used in places where
4-wide vectors are assumed, but should be eliminated in future changes.
Bug: b/214583550
Bug: b/236162233
Change-Id: Ib15ae2f9883b989b30de58fda16d7e24fdca4a1a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/66752
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 3405458..6eb60b0 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -302,13 +302,14 @@
{
Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
- Vector4f colorf = alphaBlend(index, buffer, c[index], x);
+ SIMD::Float4 colorf = alphaBlend(index, buffer, c[index], x);
+ ASSERT(SIMD::Width == 4);
Vector4s color;
- color.x = convertFixed16(colorf.x, true);
- color.y = convertFixed16(colorf.y, true);
- color.z = convertFixed16(colorf.z, true);
- color.w = convertFixed16(colorf.w, true);
+ color.x = convertFixed16(Extract128(colorf.x, 0), true);
+ color.y = convertFixed16(Extract128(colorf.y, 0), true);
+ color.z = convertFixed16(Extract128(colorf.z, 0), true);
+ color.w = convertFixed16(Extract128(colorf.w, 0), true);
writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
}
break;
@@ -348,7 +349,13 @@
{
Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
- Vector4f color = alphaBlend(index, buffer, c[index], x);
+ SIMD::Float4 C = alphaBlend(index, buffer, c[index], x);
+ ASSERT(SIMD::Width == 4);
+ Vector4f color;
+ color.x = Extract128(C.x, 0);
+ color.y = Extract128(C.y, 0);
+ color.z = Extract128(C.z, 0);
+ color.w = Extract128(C.w, 0);
writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
}
break;
@@ -358,7 +365,7 @@
}
}
-void PixelProgram::clampColor(Vector4f color[MAX_COLOR_BUFFERS])
+void PixelProgram::clampColor(SIMD::Float4 color[MAX_COLOR_BUFFERS])
{
// "If the color attachment is fixed-point, the components of the source and destination values and blend factors
// are each clamped to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment
diff --git a/src/Pipeline/PixelProgram.hpp b/src/Pipeline/PixelProgram.hpp
index f367fee..35904a4 100644
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -38,10 +38,10 @@
private:
// Color outputs
- Vector4f c[MAX_COLOR_BUFFERS];
+ SIMD::Float4 c[MAX_COLOR_BUFFERS];
// Raster operations
- void clampColor(Vector4f color[MAX_COLOR_BUFFERS]);
+ void clampColor(SIMD::Float4 color[MAX_COLOR_BUFFERS]);
static SIMD::Int maskAny(Int cMask[4], const SampleSet &samples);
static SIMD::Int maskAny(Int cMask[4], Int sMask[4], Int zMask[4], const SampleSet &samples);
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 05cc345..ca86a18 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -140,6 +140,7 @@
occlusionSampleCount(zMask, sMask, samples);
}
+ ASSERT(SIMD::Width == 4);
SIMD::Float yyyy = SIMD::Float(Float(y)) + SIMD::Float(*Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16));
// Centroid locations
@@ -152,6 +153,7 @@
for(unsigned int q : samples)
{
+ ASSERT(SIMD::Width == 4);
XXXX += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]));
YYYY += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]));
WWWW += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]));
@@ -421,6 +423,7 @@
{
SIMD::Float Z = z;
+ ASSERT(SIMD::Width == 4);
Pointer<Byte> buffer = zBuffer + 4 * x;
Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
@@ -433,6 +436,7 @@
if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
{
+ ASSERT(SIMD::Width == 4);
zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
}
@@ -491,7 +495,8 @@
Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
{
- Short4 Z = convertFixed16(z, true);
+ ASSERT(SIMD::Width == 4);
+ Short4 Z = convertFixed16(Extract128(z, 0), true);
Pointer<Byte> buffer = zBuffer + 2 * x;
Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
@@ -744,14 +749,15 @@
for(unsigned int q : samples)
{
+ ASSERT(SIMD::Width == 4);
switch(state.depthFormat)
{
case VK_FORMAT_D16_UNORM:
- writeDepth16(zBuffer, q, x, z[q], zMask[q]);
+ writeDepth16(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
break;
case VK_FORMAT_D32_SFLOAT:
case VK_FORMAT_D32_SFLOAT_S8_UINT:
- writeDepth32F(zBuffer, q, x, z[q], zMask[q]);
+ writeDepth32F(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
break;
default:
UNSUPPORTED("Depth format: %d", int(state.depthFormat));
@@ -1814,7 +1820,7 @@
}
}
-void PixelRoutine::blendFactorRGB(Vector4f &blendFactor, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
+void PixelRoutine::blendFactorRGB(SIMD::Float4 &blendFactor, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
{
switch(colorBlendFactor)
{
@@ -2028,30 +2034,30 @@
(largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
}
-SIMD::Float PixelRoutine::maxRGB(Vector4f &c)
+SIMD::Float PixelRoutine::maxRGB(SIMD::Float4 &c)
{
return Max(Max(c.x, c.y), c.z);
}
-SIMD::Float PixelRoutine::minRGB(Vector4f &c)
+SIMD::Float PixelRoutine::minRGB(SIMD::Float4 &c)
{
return Min(Min(c.x, c.y), c.z);
}
-void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
+void PixelRoutine::setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
{
SIMD::Float minbase = minRGB(cbase);
SIMD::Float sbase = maxRGB(cbase) - minbase;
SIMD::Float ssat = maxRGB(csat) - minRGB(csat);
SIMD::Int isNonZero = CmpGT(sbase, 0.0f);
- Vector4f color;
+ SIMD::Float4 color;
color.x = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.x - minbase) * ssat / sbase));
color.y = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.y - minbase) * ssat / sbase));
color.z = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.z - minbase) * ssat / sbase));
setLum(color, clum, x, y, z);
}
-SIMD::Float PixelRoutine::lumRGB(Vector4f &c)
+SIMD::Float PixelRoutine::lumRGB(SIMD::Float4 &c)
{
return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
}
@@ -2064,13 +2070,13 @@
(~aboveOne & As<SIMD::Int>(color)))));
}
-void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
+void PixelRoutine::setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
{
SIMD::Float lbase = lumRGB(cbase);
SIMD::Float llum = lumRGB(clum);
SIMD::Float ldiff = llum - lbase;
- Vector4f color;
+ SIMD::Float4 color;
color.x = cbase.x + ldiff;
color.y = cbase.y + ldiff;
color.z = cbase.z + ldiff;
@@ -2087,7 +2093,7 @@
z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
}
-void PixelRoutine::premultiply(Vector4f &c)
+void PixelRoutine::premultiply(SIMD::Float4 &c)
{
SIMD::Int nonZeroAlpha = CmpNEQ(c.w, 0.0f);
c.x = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.x / c.w));
@@ -2095,15 +2101,15 @@
c.z = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.z / c.w));
}
-Vector4f PixelRoutine::computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor)
+SIMD::Float4 PixelRoutine::computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor)
{
- Vector4f srcColor = src;
+ SIMD::Float4 srcColor = src;
srcColor.x *= srcFactor.x;
srcColor.y *= srcFactor.y;
srcColor.z *= srcFactor.z;
srcColor.w *= srcFactor.w;
- Vector4f dstColor = dst;
+ SIMD::Float4 dstColor = dst;
dstColor.x *= dstFactor.x;
dstColor.y *= dstFactor.y;
dstColor.z *= dstFactor.z;
@@ -2112,7 +2118,7 @@
premultiply(srcColor);
premultiply(dstColor);
- Vector4f blendedColor;
+ SIMD::Float4 blendedColor;
switch(state.blendState[index].blendOperation)
{
@@ -2242,7 +2248,7 @@
}
}
-Vector4f PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x)
+SIMD::Float4 PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x)
{
if(!state.blendState[index].alphaBlendEnable)
{
@@ -2255,11 +2261,11 @@
Pointer<Byte> buffer = cBuffer;
Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
- // destColor holds four texel color values.
+ // texelColor holds four texel color values.
// Note: Despite the type being Vector4f, the colors may be stored as
// integers. Half-floats are stored as full 32-bit floats.
// Non-float and non-fixed point formats are not alpha blended.
- Vector4f destColor;
+ Vector4f texelColor;
switch(format)
{
@@ -2268,161 +2274,168 @@
case VK_FORMAT_R32_SFLOAT:
// FIXME: movlps
buffer += 4 * x;
- destColor.x.x = *Pointer<Float>(buffer + 0);
- destColor.x.y = *Pointer<Float>(buffer + 4);
+ texelColor.x.x = *Pointer<Float>(buffer + 0);
+ texelColor.x.y = *Pointer<Float>(buffer + 4);
buffer += pitchB;
// FIXME: movhps
- destColor.x.z = *Pointer<Float>(buffer + 0);
- destColor.x.w = *Pointer<Float>(buffer + 4);
- destColor.y = destColor.z = destColor.w = 1.0f;
+ texelColor.x.z = *Pointer<Float>(buffer + 0);
+ texelColor.x.w = *Pointer<Float>(buffer + 4);
+ texelColor.y = texelColor.z = texelColor.w = 1.0f;
break;
case VK_FORMAT_R32G32_SINT:
case VK_FORMAT_R32G32_UINT:
case VK_FORMAT_R32G32_SFLOAT:
buffer += 8 * x;
- destColor.x = *Pointer<Float4>(buffer, 16);
+ texelColor.x = *Pointer<Float4>(buffer, 16);
buffer += pitchB;
- destColor.y = *Pointer<Float4>(buffer, 16);
- destColor.z = destColor.x;
- destColor.x = ShuffleLowHigh(destColor.x, destColor.y, 0x0202);
- destColor.z = ShuffleLowHigh(destColor.z, destColor.y, 0x1313);
- destColor.y = destColor.z;
- destColor.z = destColor.w = 1.0f;
+ texelColor.y = *Pointer<Float4>(buffer, 16);
+ texelColor.z = texelColor.x;
+ texelColor.x = ShuffleLowHigh(texelColor.x, texelColor.y, 0x0202);
+ texelColor.z = ShuffleLowHigh(texelColor.z, texelColor.y, 0x1313);
+ texelColor.y = texelColor.z;
+ texelColor.z = texelColor.w = 1.0f;
break;
case VK_FORMAT_R32G32B32A32_SFLOAT:
case VK_FORMAT_R32G32B32A32_SINT:
case VK_FORMAT_R32G32B32A32_UINT:
buffer += 16 * x;
- destColor.x = *Pointer<Float4>(buffer + 0, 16);
- destColor.y = *Pointer<Float4>(buffer + 16, 16);
+ texelColor.x = *Pointer<Float4>(buffer + 0, 16);
+ texelColor.y = *Pointer<Float4>(buffer + 16, 16);
buffer += pitchB;
- destColor.z = *Pointer<Float4>(buffer + 0, 16);
- destColor.w = *Pointer<Float4>(buffer + 16, 16);
- transpose4x4(destColor.x, destColor.y, destColor.z, destColor.w);
+ texelColor.z = *Pointer<Float4>(buffer + 0, 16);
+ texelColor.w = *Pointer<Float4>(buffer + 16, 16);
+ transpose4x4(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
break;
case VK_FORMAT_R16_UNORM:
buffer += 2 * x;
- destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
- destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
+ texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
+ texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
buffer += pitchB;
- destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
- destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
- destColor.x *= (1.0f / 0xFFFF);
- destColor.y = destColor.z = destColor.w = 1.0f;
+ texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
+ texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
+ texelColor.x *= (1.0f / 0xFFFF);
+ texelColor.y = texelColor.z = texelColor.w = 1.0f;
break;
case VK_FORMAT_R16_SFLOAT:
buffer += 2 * x;
- destColor.x.x = Float(*Pointer<Half>(buffer + 0));
- destColor.x.y = Float(*Pointer<Half>(buffer + 2));
+ texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
+ texelColor.x.y = Float(*Pointer<Half>(buffer + 2));
buffer += pitchB;
- destColor.x.z = Float(*Pointer<Half>(buffer + 0));
- destColor.x.w = Float(*Pointer<Half>(buffer + 2));
- destColor.y = destColor.z = destColor.w = 1.0f;
+ texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
+ texelColor.x.w = Float(*Pointer<Half>(buffer + 2));
+ texelColor.y = texelColor.z = texelColor.w = 1.0f;
break;
case VK_FORMAT_R16G16_UNORM:
buffer += 4 * x;
- destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
- destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
- destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
- destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
+ texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
+ texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
+ texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
+ texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
buffer += pitchB;
- destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
- destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
- destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
- destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
- destColor.x *= (1.0f / 0xFFFF);
- destColor.y *= (1.0f / 0xFFFF);
- destColor.z = destColor.w = 1.0f;
+ texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
+ texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
+ texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
+ texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
+ texelColor.x *= (1.0f / 0xFFFF);
+ texelColor.y *= (1.0f / 0xFFFF);
+ texelColor.z = texelColor.w = 1.0f;
break;
case VK_FORMAT_R16G16_SFLOAT:
buffer += 4 * x;
- destColor.x.x = Float(*Pointer<Half>(buffer + 0));
- destColor.y.x = Float(*Pointer<Half>(buffer + 2));
- destColor.x.y = Float(*Pointer<Half>(buffer + 4));
- destColor.y.y = Float(*Pointer<Half>(buffer + 6));
+ texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
+ texelColor.y.x = Float(*Pointer<Half>(buffer + 2));
+ texelColor.x.y = Float(*Pointer<Half>(buffer + 4));
+ texelColor.y.y = Float(*Pointer<Half>(buffer + 6));
buffer += pitchB;
- destColor.x.z = Float(*Pointer<Half>(buffer + 0));
- destColor.y.z = Float(*Pointer<Half>(buffer + 2));
- destColor.x.w = Float(*Pointer<Half>(buffer + 4));
- destColor.y.w = Float(*Pointer<Half>(buffer + 6));
- destColor.z = destColor.w = 1.0f;
+ texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
+ texelColor.y.z = Float(*Pointer<Half>(buffer + 2));
+ texelColor.x.w = Float(*Pointer<Half>(buffer + 4));
+ texelColor.y.w = Float(*Pointer<Half>(buffer + 6));
+ texelColor.z = texelColor.w = 1.0f;
break;
case VK_FORMAT_R16G16B16A16_UNORM:
buffer += 8 * x;
- destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
- destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
- destColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
- destColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
- destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
- destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
- destColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
- destColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
+ texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
+ texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
+ texelColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
+ texelColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
+ texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
+ texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
+ texelColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
+ texelColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
buffer += pitchB;
- destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
- destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
- destColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
- destColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
- destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
- destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
- destColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
- destColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
- destColor.x *= (1.0f / 0xFFFF);
- destColor.y *= (1.0f / 0xFFFF);
- destColor.z *= (1.0f / 0xFFFF);
- destColor.w *= (1.0f / 0xFFFF);
+ texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
+ texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
+ texelColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
+ texelColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
+ texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
+ texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
+ texelColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
+ texelColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
+ texelColor.x *= (1.0f / 0xFFFF);
+ texelColor.y *= (1.0f / 0xFFFF);
+ texelColor.z *= (1.0f / 0xFFFF);
+ texelColor.w *= (1.0f / 0xFFFF);
break;
case VK_FORMAT_R16G16B16A16_SFLOAT:
buffer += 8 * x;
- destColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
- destColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
- destColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
- destColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
- destColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
- destColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
- destColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
- destColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
+ texelColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
+ texelColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
+ texelColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
+ texelColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
+ texelColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
+ texelColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
+ texelColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
+ texelColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
buffer += pitchB;
- destColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
- destColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
- destColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
- destColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
- destColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
- destColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
- destColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
- destColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
+ texelColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
+ texelColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
+ texelColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
+ texelColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
+ texelColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
+ texelColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
+ texelColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
+ texelColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
break;
case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
buffer += 4 * x;
- destColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
- destColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
+ texelColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
+ texelColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
buffer += pitchB;
- destColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
- destColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
- transpose4x3(destColor.x, destColor.y, destColor.z, destColor.w);
- destColor.w = 1.0f;
+ texelColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
+ texelColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
+ transpose4x3(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
+ texelColor.w = 1.0f;
break;
default:
{
// Attempt to read an integer based format and convert it to float
Vector4s color;
readPixel(index, cBuffer, x, color);
- destColor.x = convertFloat32(As<UShort4>(color.x));
- destColor.y = convertFloat32(As<UShort4>(color.y));
- destColor.z = convertFloat32(As<UShort4>(color.z));
- destColor.w = convertFloat32(As<UShort4>(color.w));
+ texelColor.x = convertFloat32(As<UShort4>(color.x));
+ texelColor.y = convertFloat32(As<UShort4>(color.y));
+ texelColor.z = convertFloat32(As<UShort4>(color.z));
+ texelColor.w = convertFloat32(As<UShort4>(color.w));
}
break;
}
- Vector4f sourceFactor;
- Vector4f destFactor;
+ ASSERT(SIMD::Width == 4);
+ SIMD::Float4 destColor;
+ destColor.x = texelColor.x;
+ destColor.y = texelColor.y;
+ destColor.z = texelColor.z;
+ destColor.w = texelColor.w;
+
+ SIMD::Float4 sourceFactor;
+ SIMD::Float4 destFactor;
blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
- Vector4f blendedColor;
+ SIMD::Float4 blendedColor;
switch(state.blendState[index].blendOperation)
{
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index 229455d..c00768e 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -55,7 +55,7 @@
void alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples);
void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask);
- Vector4f alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x);
+ SIMD::Float4 alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x);
void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s ¤t, const Int &sMask, const Int &zMask, const Int &cMask);
bool isSRGB(int index) const;
@@ -77,23 +77,23 @@
void readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel);
enum BlendFactorModifier { None, OneMinus };
Float blendConstant(vk::Format format, int component, BlendFactorModifier modifier = None);
- void blendFactorRGB(Vector4f &blendFactorRGB, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format);
+ void blendFactorRGB(SIMD::Float4 &blendFactorRGB, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format);
void blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format);
bool blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format);
- Vector4f computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor);
+ SIMD::Float4 computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor);
SIMD::Float blendOpOverlay(SIMD::Float &src, SIMD::Float &dst);
SIMD::Float blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst);
SIMD::Float blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst);
SIMD::Float blendOpHardlight(SIMD::Float &src, SIMD::Float &dst);
SIMD::Float blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst);
- void setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
- void setLum(Vector4f &cbase, Vector4f &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
+ void setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
+ void setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z);
SIMD::Float computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne);
- SIMD::Float maxRGB(Vector4f &c);
- SIMD::Float minRGB(Vector4f &c);
- SIMD::Float lumRGB(Vector4f &c);
- void premultiply(Vector4f &c);
+ SIMD::Float maxRGB(SIMD::Float4 &c);
+ SIMD::Float minRGB(SIMD::Float4 &c);
+ SIMD::Float lumRGB(SIMD::Float4 &c);
+ void premultiply(SIMD::Float4 &c);
void writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples);
void writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples);
diff --git a/src/Pipeline/SamplerCore.cpp b/src/Pipeline/SamplerCore.cpp
index 3162e29..d9845ad 100644
--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -28,7 +28,35 @@
{
}
-Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], SIMD::Float &dRef, Float &&lodOrBias, SIMD::Float &dsx, SIMD::Float &dsy, Vector4i offset, SIMD::Int &sample)
+SIMD::Float4 SamplerCore::sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], const SIMD::Float &dRef, const Float &lodOrBias, const SIMD::Float &dsx, const SIMD::Float &dsy, SIMD::Int offset[4], const SIMD::Int &sample)
+{
+ SIMD::Float4 c;
+
+ for(int i = 0; i < SIMD::Width / 4; i++)
+ {
+ Float4 uvwa128[4];
+ uvwa128[0] = Extract128(uvwa[0], i);
+ uvwa128[1] = Extract128(uvwa[1], i);
+ uvwa128[2] = Extract128(uvwa[2], i);
+ uvwa128[3] = Extract128(uvwa[3], i);
+
+ Vector4i offset128;
+ offset128[0] = Extract128(offset[0], i);
+ offset128[1] = Extract128(offset[1], i);
+ offset128[2] = Extract128(offset[2], i);
+ offset128[3] = Extract128(offset[3], i);
+
+ Vector4f c128 = sampleTexture128(texture, uvwa128, Extract128(dRef, i), lodOrBias, Extract128(dsx, i), Extract128(dsy, i), offset128, Extract128(sample, i));
+ c.x = Insert128(c.x, c128.x, i);
+ c.y = Insert128(c.y, c128.y, i);
+ c.z = Insert128(c.z, c128.z, i);
+ c.w = Insert128(c.w, c128.w, i);
+ }
+
+ return c;
+}
+
+Vector4f SamplerCore::sampleTexture128(Pointer<Byte> &texture, Float4 uvwa[4], const Float4 &dRef, const Float &lodOrBias, const Float4 &dsx, const Float4 &dsy, Vector4i &offset, const Int4 &sample)
{
Vector4f c;
@@ -797,7 +825,7 @@
return c_;
}
-Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
+Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta)
{
Vector4f c = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, false);
@@ -821,7 +849,7 @@
return c;
}
-Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
+Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD)
{
Vector4f c;
@@ -879,7 +907,7 @@
return c;
}
-Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
+Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
{
@@ -891,7 +919,7 @@
}
}
-Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
+Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
Vector4f c;
@@ -984,7 +1012,7 @@
return c;
}
-Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
+Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD)
{
Vector4f c;
@@ -1084,7 +1112,7 @@
return lod;
}
-void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &dsx, Float4 &dsy)
+void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, const Float4 &dsx, const Float4 &dsy)
{
Float4 dudxy;
@@ -1108,7 +1136,7 @@
lod = log2sqrt(lod);
}
-void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float4 &dsx, Float4 &dsy)
+void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, const Float4 &dsx, const Float4 &dsy)
{
Float4 duvdxy;
@@ -1156,7 +1184,7 @@
lod = log2sqrt(lod); // log2(sqrt(lod))
}
-void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M)
+void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy, Float4 &M)
{
Float4 dudxy, dvdxy, dsdxy;
@@ -1197,7 +1225,7 @@
lod = log2(lod);
}
-void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float4 &dsx, Float4 &dsy)
+void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, const Float4 &dsx, const Float4 &dsy)
{
Float4 dudxy, dvdxy, dsdxy;
@@ -1236,9 +1264,9 @@
// TODO: Comply with Vulkan recommendation:
// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."
- Int4 xn = CmpLT(x, Float4(0.0f)); // x < 0
- Int4 yn = CmpLT(y, Float4(0.0f)); // y < 0
- Int4 zn = CmpLT(z, Float4(0.0f)); // z < 0
+ Int4 xn = CmpLT(x, 0.0f); // x < 0
+ Int4 yn = CmpLT(y, 0.0f); // y < 0
+ Int4 zn = CmpLT(z, 0.0f); // z < 0
Float4 absX = Abs(x);
Float4 absY = Abs(y);
@@ -1282,9 +1310,9 @@
// V = !yMajor ? -y : (n ^ z)
V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));
- M = reciprocal(M) * Float4(0.5f);
- U = U * M + Float4(0.5f);
- V = V * M + Float4(0.5f);
+ M = reciprocal(M) * 0.5f;
+ U = U * M + 0.5f;
+ V = V * M + 0.5f;
return face;
}
@@ -1941,7 +1969,7 @@
return c;
}
-Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
+Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, const Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer)
{
Int4 valid;
diff --git a/src/Pipeline/SamplerCore.hpp b/src/Pipeline/SamplerCore.hpp
index 632f894..e85ca05 100644
--- a/src/Pipeline/SamplerCore.hpp
+++ b/src/Pipeline/SamplerCore.hpp
@@ -61,9 +61,11 @@
public:
SamplerCore(Pointer<Byte> &constants, const Sampler &state, SamplerFunction function);
- Vector4f sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], SIMD::Float &dRef, Float &&lodOrBias, SIMD::Float &dsx, SIMD::Float &dsy, Vector4i offset, SIMD::Int &sample);
+ SIMD::Float4 sampleTexture(Pointer<Byte> &texture, SIMD::Float uvwa[4], const SIMD::Float &dRef, const Float &lodOrBias, const SIMD::Float &dsx, const SIMD::Float &dsy, SIMD::Int offset[4], const SIMD::Int &sample);
private:
+ Vector4f sampleTexture128(Pointer<Byte> &texture, Float4 uvwa[4], const Float4 &dRef, const Float &lodOrBias, const Float4 &dsx, const Float4 &dsy, Vector4i &offset, const Int4 &sample);
+
Float4 applySwizzle(const Vector4f &c, VkComponentSwizzle swizzle, bool integer);
Short4 offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod);
Vector4s sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta);
@@ -71,22 +73,22 @@
Vector4s sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
Vector4s sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
Vector4s sample3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
- Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta);
- Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD);
- Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
- Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
- Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
- void computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &dsx, Float4 &dsy);
- void computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, Float4 &dsx, Float4 &dsy);
- void computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M);
- void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy);
+ Vector4f sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta);
+ Vector4f sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD);
+ Vector4f sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
+ Vector4f sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
+ Vector4f sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD);
+ void computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &u, const Float4 &dsx, const Float4 &dsy);
+ void computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &u, Float4 &v, const Float4 &dsx, const Float4 &dsy);
+ void computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy, Float4 &M);
+ void computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, const Float4 &dsx, const Float4 &dsy);
Int4 cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M);
Short4 applyOffset(Short4 &uvw, Int4 &offset, const Int4 &whd, AddressingMode mode);
void computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Short4 &cubeArrayLayer, Vector4i &offset, const Int4 &sample, const Pointer<Byte> &mipmap);
void computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, const Int4 &sample, Int4 valid, const Pointer<Byte> &mipmap);
Vector4s sampleTexel(Short4 &u, Short4 &v, Short4 &w, const Short4 &cubeArrayLayer, Vector4i &offset, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer);
Vector4s sampleTexel(UInt index[4], Pointer<Byte> buffer);
- Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &w, Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer);
+ Vector4f sampleTexel(Int4 &u, Int4 &v, Int4 &w, const Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer);
Vector4f replaceBorderTexel(const Vector4f &c, Int4 valid);
Pointer<Byte> selectMipmap(const Pointer<Byte> &texture, const Float &lod, bool secondLOD);
Short4 address(const Float4 &uvw, AddressingMode addressingMode, Pointer<Byte> &mipmap);
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index ced3ed6..c1a21c1 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -513,11 +513,79 @@
return Sqrt(x); // TODO(b/222218659): Optimize for relaxed precision.
}
+UInt4 halfToFloatBits(RValue<UInt4> halfBits)
+{
+ auto magic = UInt4(126 << 23);
+
+ auto sign16 = halfBits & UInt4(0x8000);
+ auto man16 = halfBits & UInt4(0x03FF);
+ auto exp16 = halfBits & UInt4(0x7C00);
+
+ auto isDnormOrZero = CmpEQ(exp16, UInt4(0));
+ auto isInfOrNaN = CmpEQ(exp16, UInt4(0x7C00));
+
+ auto sign32 = sign16 << 16;
+ auto man32 = man16 << 13;
+ auto exp32 = (exp16 + UInt4(0x1C000)) << 13;
+ auto norm32 = (man32 | exp32) | (isInfOrNaN & UInt4(0x7F800000));
+
+ auto denorm32 = As<UInt4>(As<Float4>(magic + man16) - As<Float4>(magic));
+
+ return sign32 | (norm32 & ~isDnormOrZero) | (denorm32 & isDnormOrZero);
+}
+
+UInt4 floatToHalfBits(RValue<UInt4> floatBits, bool storeInUpperBits)
+{
+ UInt4 sign = floatBits & UInt4(0x80000000);
+ UInt4 abs = floatBits & UInt4(0x7FFFFFFF);
+
+ UInt4 normal = CmpNLE(abs, UInt4(0x38800000));
+
+ UInt4 mantissa = (abs & UInt4(0x007FFFFF)) | UInt4(0x00800000);
+ UInt4 e = UInt4(113) - (abs >> 23);
+ UInt4 denormal = CmpLT(e, UInt4(24)) & (mantissa >> e);
+
+ UInt4 base = (normal & abs) | (~normal & denormal); // TODO: IfThenElse()
+
+ // float exponent bias is 127, half bias is 15, so adjust by -112
+ UInt4 bias = normal & UInt4(0xC8000000);
+
+ UInt4 rounded = base + bias + UInt4(0x00000FFF) + ((base >> 13) & UInt4(1));
+ UInt4 fp16u = rounded >> 13;
+
+ // Infinity
+ fp16u |= CmpNLE(abs, UInt4(0x47FFEFFF)) & UInt4(0x7FFF);
+
+ return storeInUpperBits ? (sign | (fp16u << 16)) : ((sign >> 16) | fp16u);
+}
+
+SIMD::Float linearToSRGB(const SIMD::Float &c)
+{
+ SIMD::Float lc = Min(c, 0.0031308f) * 12.92f;
+ SIMD::Float ec = MulAdd(1.055f, Pow<Mediump>(c, (1.0f / 2.4f)), -0.055f); // TODO(b/149574741): Use a custom approximation.
+
+ return Max(lc, ec);
+}
+
+SIMD::Float sRGBtoLinear(const SIMD::Float &c)
+{
+ SIMD::Float lc = c * (1.0f / 12.92f);
+ SIMD::Float ec = Pow<Mediump>(MulAdd(c, 1.0f / 1.055f, 0.055f / 1.055f), 2.4f); // TODO(b/149574741): Use a custom approximation.
+
+ SIMD::Int linear = CmpLT(c, 0.04045f);
+ return As<SIMD::Float>((linear & As<SIMD::Int>(lc)) | (~linear & As<SIMD::Int>(ec))); // TODO: IfThenElse()
+}
+
RValue<Float4> reciprocal(RValue<Float4> x, bool pp, bool exactAtPow2)
{
return Rcp(x, pp, exactAtPow2);
}
+RValue<SIMD::Float> reciprocal(RValue<SIMD::Float> x, bool pp, bool exactAtPow2)
+{
+ return Rcp(x, pp, exactAtPow2);
+}
+
RValue<Float4> reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
{
Float4 abs = x;
@@ -541,6 +609,24 @@
return MulAdd(x, y, z);
}
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision)
+{
+ // TODO(b/214588983): Eliminate by using only the wide SIMD variant (or specialize or templatize the implementation).
+ SIMD::Float xx;
+ SIMD::Float yy;
+ xx = Insert128(xx, x, 0);
+ yy = Insert128(yy, y, 0);
+ return Extract128(Pow(xx, yy, relaxedPrecision), 0);
+}
+
+RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision)
+{
+ // TODO(b/214588983): Eliminate by using only the wide SIMD variant (or specialize or templatize the implementation).
+ SIMD::Float xx;
+ xx = Insert128(xx, x, 0);
+ return Extract128(Sqrt(xx, relaxedPrecision), 0);
+}
+
void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3)
{
Int2 tmp0 = UnpackHigh(row0, row1);
diff --git a/src/Pipeline/ShaderCore.hpp b/src/Pipeline/ShaderCore.hpp
index b02f767..f7937ff 100644
--- a/src/Pipeline/ShaderCore.hpp
+++ b/src/Pipeline/ShaderCore.hpp
@@ -76,19 +76,25 @@
Int4 w;
};
-// SIMD contains types that represent multiple scalars packed into a single
-// vector data type. Types in the SIMD namespace provide a semantic hint
-// that the data should be treated as a per-execution-lane scalar instead of
-// a typical euclidean-style vector type.
namespace SIMD {
-// Width is the number of per-lane scalars packed into each SIMD vector.
-static constexpr int Width = 4;
+using namespace rr::SIMD;
-using Float = rr::Float4;
-using Int = rr::Int4;
-using UInt = rr::UInt4;
-using Pointer = rr::Pointer4;
+struct Float4
+{
+ SIMD::Float x;
+ SIMD::Float y;
+ SIMD::Float z;
+ SIMD::Float w;
+};
+
+struct Int4
+{
+ SIMD::Int x;
+ SIMD::Int y;
+ SIMD::Int z;
+ SIMD::Int w;
+};
} // namespace SIMD
@@ -123,20 +129,39 @@
};
// clang-format off
-template<Precision precision> RValue<Float4> Sqrt(RValue<Float4> x);
-template<> inline RValue<Float4> Sqrt<Highp>(RValue<Float4> x) { return Sqrt(x, false); }
-template<> inline RValue<Float4> Sqrt<Mediump>(RValue<Float4> x) { return Sqrt(x, true); }
+template<Precision precision> RValue<SIMD::Float> Pow(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
+template<> inline RValue<SIMD::Float> Pow<Highp>(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return Pow(x, y, false); }
+template<> inline RValue<SIMD::Float> Pow<Mediump>(RValue<SIMD::Float> x, RValue<SIMD::Float> y) { return Pow(x, y, true); }
-template<Precision precision> RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
-template<> inline RValue<Float4> Pow<Highp>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, false); }
-template<> inline RValue<Float4> Pow<Mediump>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, true); }
+template<Precision precision> RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x);
+template<> inline RValue<SIMD::Float> Sqrt<Highp>(RValue<SIMD::Float> x) { return Sqrt(x, false); }
+template<> inline RValue<SIMD::Float> Sqrt<Mediump>(RValue<SIMD::Float> x) { return Sqrt(x, true); }
// clang-format on
+SIMD::UInt halfToFloatBits(SIMD::UInt halfBits);
+SIMD::UInt floatToHalfBits(SIMD::UInt floatBits, bool storeInUpperBits);
+SIMD::Float linearToSRGB(const SIMD::Float &c);
+SIMD::Float sRGBtoLinear(const SIMD::Float &c);
+
RValue<Float4> reciprocal(RValue<Float4> x, bool pp = false, bool exactAtPow2 = false);
+RValue<SIMD::Float> reciprocal(RValue<SIMD::Float> x, bool pp = false, bool exactAtPow2 = false);
RValue<Float4> reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
RValue<SIMD::Float> mulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z); // TODO(chromium:1299047)
+RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y, bool relaxedPrecision);
+RValue<Float4> Sqrt(RValue<Float4> x, bool relaxedPrecision);
+
+// clang-format off
+template<Precision precision> RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y);
+template<> inline RValue<Float4> Pow<Highp>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, false); }
+template<> inline RValue<Float4> Pow<Mediump>(RValue<Float4> x, RValue<Float4> y) { return Pow(x, y, true); }
+
+template<Precision precision> RValue<Float4> Sqrt(RValue<Float4> x);
+template<> inline RValue<Float4> Sqrt<Highp>(RValue<Float4> x) { return Sqrt(x, false); }
+template<> inline RValue<Float4> Sqrt<Mediump>(RValue<Float4> x) { return Sqrt(x, true); }
+// clang-format on
+
void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
@@ -146,8 +171,8 @@
void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);
-sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
-sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
+UInt4 halfToFloatBits(RValue<UInt4> halfBits);
+UInt4 floatToHalfBits(RValue<UInt4> floatBits, bool storeInUpperBits);
Float4 r11g11b10Unpack(UInt r11g11b10bits);
UInt r11g11b10Pack(const Float4 &value);
Float4 linearToSRGB(const Float4 &c);
diff --git a/src/Pipeline/SpirvShaderSampling.cpp b/src/Pipeline/SpirvShaderSampling.cpp
index ceaa0d8..ee4cca3 100644
--- a/src/Pipeline/SpirvShaderSampling.cpp
+++ b/src/Pipeline/SpirvShaderSampling.cpp
@@ -154,9 +154,9 @@
SIMD::Float uvwa[4];
SIMD::Float dRef;
SIMD::Float lodOrBias; // Explicit level-of-detail, or bias added to the implicit level-of-detail (depending on samplerMethod).
- Vector4f dsx;
- Vector4f dsy;
- Vector4i offset;
+ SIMD::Float dsx[4];
+ SIMD::Float dsy[4];
+ SIMD::Int offset[4];
SIMD::Int sampleId;
SamplerFunction samplerFunction = instruction.getSamplerFunction();
@@ -216,15 +216,15 @@
{
SIMD::Float dPdx;
SIMD::Float dPdy;
- dPdx.x = Pointer<Float>(&dsx.x)[i];
- dPdx.y = Pointer<Float>(&dsx.y)[i];
- dPdx.z = Pointer<Float>(&dsx.z)[i];
+ dPdx.x = Pointer<Float>(&dsx[0])[i];
+ dPdx.y = Pointer<Float>(&dsx[1])[i];
+ dPdx.z = Pointer<Float>(&dsx[2])[i];
- dPdy.x = Pointer<Float>(&dsy.x)[i];
- dPdy.y = Pointer<Float>(&dsy.y)[i];
- dPdy.z = Pointer<Float>(&dsy.z)[i];
+ dPdy.x = Pointer<Float>(&dsy[0])[i];
+ dPdy.y = Pointer<Float>(&dsy[1])[i];
+ dPdy.z = Pointer<Float>(&dsy[2])[i];
- Vector4f sample = s.sampleTexture(texture, uvwa, dRef, lod[i], dPdx, dPdy, offset, sampleId);
+ SIMD::Float4 sample = s.sampleTexture(texture, uvwa, dRef, lod[i], dPdx, dPdy, offset, sampleId);
If(perLaneSampling)
{
@@ -249,7 +249,8 @@
}
else
{
- Vector4f sample = s.sampleTexture(texture, uvwa, dRef, lodOrBias.x, (dsx.x), (dsy.x), offset, sampleId);
+ Float lod = Float(lodOrBias.x);
+ SIMD::Float4 sample = s.sampleTexture(texture, uvwa, dRef, lod, (dsx[0]), (dsy[0]), offset, sampleId);
Pointer<SIMD::Float> rgba = out;
rgba[0] = sample.x;
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index 535dc97..4f6fb92 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -567,6 +567,8 @@
void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
+ ASSERT(SIMD::Width == 4);
+
UInt index0 = batch[0];
UInt index1 = batch[1];
UInt index2 = batch[2];
@@ -590,7 +592,7 @@
assert(it->second.SizeInComponents == 4);
auto &position = routine.getVariable(it->second.Id);
- Vector4f pos;
+ SIMD::Float4 pos;
pos.x = position[it->second.FirstComponent + 0];
pos.y = position[it->second.FirstComponent + 1];
pos.z = position[it->second.FirstComponent + 2];
@@ -600,30 +602,38 @@
SIMD::Float w = As<SIMD::Float>(As<SIMD::Int>(pos.w) | (As<SIMD::Int>(CmpEQ(pos.w, 0.0f)) & As<SIMD::Int>(SIMD::Float(1.0f))));
SIMD::Float rhw = 1.0f / w;
- Vector4f proj;
+ SIMD::Float4 proj;
proj.x = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
proj.y = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
proj.z = pos.z * rhw;
proj.w = rhw;
- transpose4x4(pos.x, pos.y, pos.z, pos.w);
+ Float4 pos_x = Extract128(pos.x, 0);
+ Float4 pos_y = Extract128(pos.y, 0);
+ Float4 pos_z = Extract128(pos.z, 0);
+ Float4 pos_w = Extract128(pos.w, 0);
+ transpose4x4(pos_x, pos_y, pos_z, pos_w);
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos.w;
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos.z;
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos.y;
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos.x;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos_w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos_z;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos_y;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos_x;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 8) & 0x0000000FF;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = (clipFlags >> 0) & 0x0000000FF;
- transpose4x4(proj.x, proj.y, proj.z, proj.w);
+ Float4 proj_x = Extract128(proj.x, 0);
+ Float4 proj_y = Extract128(proj.y, 0);
+ Float4 proj_z = Extract128(proj.z, 0);
+ Float4 proj_w = Extract128(proj.w, 0);
+ transpose4x4(proj_x, proj_y, proj_z, proj_w);
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj.w;
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj.z;
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj.y;
- *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj.x;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj_w;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj_z;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj_y;
+ *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj_x;
}
it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
@@ -679,10 +689,10 @@
spirvShader->outputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
{
Vector4f v;
- v.x = routine.outputs[i + 0];
- v.y = routine.outputs[i + 1];
- v.z = routine.outputs[i + 2];
- v.w = routine.outputs[i + 3];
+ v.x = Extract128(routine.outputs[i + 0], 0);
+ v.y = Extract128(routine.outputs[i + 1], 0);
+ v.z = Extract128(routine.outputs[i + 2], 0);
+ v.w = Extract128(routine.outputs[i + 3], 0);
transpose4x4(v.x, v.y, v.z, v.w);
diff --git a/src/Reactor/LLVMReactor.cpp b/src/Reactor/LLVMReactor.cpp
index cfe7004..6512b0d 100644
--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -360,7 +360,7 @@
namespace rr {
-const int SIMD::Width = 8;
+const int SIMD::Width = 4;
std::string Caps::backendName()
{
@@ -1153,14 +1153,14 @@
}
}
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
{
- return As<Float4>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
+ return As<SIMD::Float>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
}
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
{
- return As<Int4>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
+ return As<SIMD::Int>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
}
static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
@@ -1216,12 +1216,12 @@
}
}
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
{
return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
}
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
{
return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
}
diff --git a/src/Reactor/Print.hpp b/src/Reactor/Print.hpp
index d52d3ab..21839ee 100644
--- a/src/Reactor/Print.hpp
+++ b/src/Reactor/Print.hpp
@@ -326,14 +326,25 @@
static std::vector<Value *> val(const RValue<Pointer<T>> &v) { return { v.value() }; }
};
template<>
-struct PrintValue::Ty<Pointer4>
+struct PrintValue::Ty<SIMD::Pointer>
{
- static std::string fmt(const Pointer4 &v)
+ static std::string fmt(const SIMD::Pointer &v)
{
- return v.isBasePlusOffset ? "{%p + [%d, %d, %d, %d]}" : "{%p, %p, %p, %p}";
+ if(v.isBasePlusOffset)
+ {
+ std::string format;
+ for(int i = 1; i < SIMD::Width; i++) { format += ", %d"; }
+ return "{%p + [%d" + format + "]}";
+ }
+ else
+ {
+ std::string format;
+ for(int i = 1; i < SIMD::Width; i++) { format += ", %p"; }
+ return "{%p" + format + "}";
+ }
}
- static std::vector<Value *> val(const Pointer4 &v)
+ static std::vector<Value *> val(const SIMD::Pointer &v)
{
return v.getPrintValues();
}
diff --git a/src/Reactor/SIMD.cpp b/src/Reactor/SIMD.cpp
index 11f636b..ddf2d56 100644
--- a/src/Reactor/SIMD.cpp
+++ b/src/Reactor/SIMD.cpp
@@ -42,6 +42,21 @@
storeValue(Nucleus::createConstantVector(constantVector, type()));
}
+SIMD::Int::Int(int x, int y, int z, int w)
+ : XYZW(this)
+{
+ std::vector<int64_t> constantVector = { x, y, z, w };
+ storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
+SIMD::Int::Int(std::vector<int> v)
+ : XYZW(this)
+{
+ std::vector<int64_t> constantVector;
+ for(int i : v) { constantVector.push_back(i); }
+ storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
SIMD::Int::Int(RValue<SIMD::Int> rhs)
: XYZW(this)
{
@@ -247,6 +262,21 @@
storeValue(Nucleus::createConstantVector(constantVector, type()));
}
+SIMD::UInt::UInt(int x, int y, int z, int w)
+ : XYZW(this)
+{
+ std::vector<int64_t> constantVector = { x, y, z, w };
+ storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
+SIMD::UInt::UInt(std::vector<int> v)
+ : XYZW(this)
+{
+ std::vector<int64_t> constantVector;
+ for(int i : v) { constantVector.push_back(i); }
+ storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
SIMD::UInt::UInt(RValue<SIMD::UInt> rhs)
: XYZW(this)
{
@@ -467,6 +497,21 @@
storeValue(Nucleus::createConstantVector(constantVector, type()));
}
+SIMD::Float::Float(float x, float y, float z, float w)
+ : XYZW(this)
+{
+ std::vector<double> constantVector = { x, y, z, w };
+ storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
+SIMD::Float::Float(std::vector<float> v)
+ : XYZW(this)
+{
+ std::vector<double> constantVector;
+ for(float f : v) { constantVector.push_back(f); }
+ storeValue(Nucleus::createConstantVector(constantVector, type()));
+}
+
SIMD::Float SIMD::Float::infinity()
{
SIMD::Float result;
@@ -508,6 +553,18 @@
*this = RValue<scalar::Float>(rhs.loadValue());
}
+SIMD::Float::Float(RValue<packed::Float4> rhs)
+ : XYZW(this)
+{
+ ASSERT(SIMD::Width == 4);
+ *this = Insert128(*this, rhs, 0);
+}
+
+RValue<SIMD::Float> SIMD::Float::operator=(RValue<packed::Float4> rhs)
+{
+ return *this = SIMD::Float(rhs);
+}
+
RValue<SIMD::Float> SIMD::Float::operator=(float x)
{
return *this = SIMD::Float(x);
@@ -598,6 +655,18 @@
return RValue<SIMD::Float>(Nucleus::createFNeg(val.value()));
}
+RValue<SIMD::Float> Rcp(RValue<SIMD::Float> x, bool relaxedPrecision, bool exactAtPow2)
+{
+ ASSERT(SIMD::Width == 4);
+ return SIMD::Float(Rcp(Extract128(x, 0), relaxedPrecision, exactAtPow2));
+}
+
+RValue<SIMD::Float> RcpSqrt(RValue<SIMD::Float> x, bool relaxedPrecision)
+{
+ ASSERT(SIMD::Width == 4);
+ return SIMD::Float(RcpSqrt(Extract128(x, 0), relaxedPrecision));
+}
+
RValue<SIMD::Float> Insert(RValue<SIMD::Float> x, RValue<scalar::Float> element, int i)
{
return RValue<SIMD::Float>(Nucleus::createInsertElement(x.value(), element.value(), i));
@@ -802,81 +871,81 @@
return Insert128(result, Shuffle(Extract128(x, 0), Extract128(y, 0), select), 0);
}
-Pointer4::Pointer4(Pointer<Byte> base, rr::Int limit)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, rr::Int limit)
: base(base)
, dynamicLimit(limit)
, staticLimit(0)
, dynamicOffsets(0)
- , staticOffsets(4)
+ , staticOffsets(SIMD::Width)
, hasDynamicLimit(true)
, hasDynamicOffsets(false)
, isBasePlusOffset(true)
{}
-Pointer4::Pointer4(Pointer<Byte> base, unsigned int limit)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, unsigned int limit)
: base(base)
, dynamicLimit(0)
, staticLimit(limit)
, dynamicOffsets(0)
- , staticOffsets(4)
+ , staticOffsets(SIMD::Width)
, hasDynamicLimit(false)
, hasDynamicOffsets(false)
, isBasePlusOffset(true)
{}
-Pointer4::Pointer4(Pointer<Byte> base, rr::Int limit, Int4 offset)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, rr::Int limit, SIMD::Int offset)
: base(base)
, dynamicLimit(limit)
, staticLimit(0)
, dynamicOffsets(offset)
- , staticOffsets(4)
+ , staticOffsets(SIMD::Width)
, hasDynamicLimit(true)
, hasDynamicOffsets(true)
, isBasePlusOffset(true)
{}
-Pointer4::Pointer4(Pointer<Byte> base, unsigned int limit, Int4 offset)
+SIMD::Pointer::Pointer(scalar::Pointer<Byte> base, unsigned int limit, SIMD::Int offset)
: base(base)
, dynamicLimit(0)
, staticLimit(limit)
, dynamicOffsets(offset)
- , staticOffsets(4)
+ , staticOffsets(SIMD::Width)
, hasDynamicLimit(false)
, hasDynamicOffsets(true)
, isBasePlusOffset(true)
{}
-Pointer4::Pointer4(std::vector<Pointer<Byte>> pointers)
+SIMD::Pointer::Pointer(std::vector<scalar::Pointer<Byte>> pointers)
: pointers(pointers)
, isBasePlusOffset(false)
{}
-Pointer4::Pointer4(UInt4 cast)
- : pointers(4)
+SIMD::Pointer::Pointer(SIMD::UInt cast)
+ : pointers(SIMD::Width)
, isBasePlusOffset(false)
{
assert(sizeof(void *) == 4);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
- pointers[i] = As<Pointer<Byte>>(Extract(cast, i));
+ pointers[i] = As<rr::Pointer<Byte>>(Extract(cast, i));
}
}
-Pointer4::Pointer4(UInt4 castLow, UInt4 castHigh)
- : pointers(4)
+SIMD::Pointer::Pointer(SIMD::UInt castLow, SIMD::UInt castHigh)
+ : pointers(SIMD::Width)
, isBasePlusOffset(false)
{
assert(sizeof(void *) == 8);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
UInt2 address;
address = Insert(address, Extract(castLow, i), 0);
address = Insert(address, Extract(castHigh, i), 1);
- pointers[i] = As<Pointer<Byte>>(address);
+ pointers[i] = As<rr::Pointer<Byte>>(address);
}
}
-Pointer4 &Pointer4::operator+=(Int4 i)
+SIMD::Pointer &SIMD::Pointer::operator+=(SIMD::Int i)
{
if(isBasePlusOffset)
{
@@ -885,67 +954,68 @@
}
else
{
- for(int el = 0; el < 4; el++) { pointers[el] += Extract(i, el); }
+ for(int el = 0; el < SIMD::Width; el++) { pointers[el] += Extract(i, el); }
}
return *this;
}
-Pointer4 Pointer4::operator+(Int4 i)
+SIMD::Pointer SIMD::Pointer::operator+(SIMD::Int i)
{
- Pointer4 p = *this;
+ SIMD::Pointer p = *this;
p += i;
return p;
}
-Pointer4 &Pointer4::operator+=(int i)
+SIMD::Pointer &SIMD::Pointer::operator+=(int i)
{
if(isBasePlusOffset)
{
- for(int el = 0; el < 4; el++) { staticOffsets[el] += i; }
+ for(int el = 0; el < SIMD::Width; el++) { staticOffsets[el] += i; }
}
else
{
- for(int el = 0; el < 4; el++) { pointers[el] += i; }
+ for(int el = 0; el < SIMD::Width; el++) { pointers[el] += i; }
}
return *this;
}
-Pointer4 Pointer4::operator+(int i)
+SIMD::Pointer SIMD::Pointer::operator+(int i)
{
- Pointer4 p = *this;
+ SIMD::Pointer p = *this;
p += i;
return p;
}
-Int4 Pointer4::offsets() const
+SIMD::Int SIMD::Pointer::offsets() const
{
ASSERT_MSG(isBasePlusOffset, "No offsets for this type of pointer");
- return dynamicOffsets + Int4(staticOffsets[0], staticOffsets[1], staticOffsets[2], staticOffsets[3]);
+ return dynamicOffsets + SIMD::Int(staticOffsets);
}
-Int4 Pointer4::isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+SIMD::Int SIMD::Pointer::isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
{
ASSERT(accessSize > 0);
if(isStaticallyInBounds(accessSize, robustness))
{
- return Int4(0xFFFFFFFF);
+ return SIMD::Int(0xFFFFFFFF);
}
if(!hasDynamicOffsets && !hasDynamicLimit)
{
+ ASSERT(SIMD::Width == 4);
// Common fast paths.
- return Int4(
+ return SIMD::Int(
(staticOffsets[0] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0,
(staticOffsets[1] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0,
(staticOffsets[2] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0,
(staticOffsets[3] + accessSize - 1 < staticLimit) ? 0xFFFFFFFF : 0);
}
- return CmpGE(offsets(), Int4(0)) & CmpLT(offsets() + Int4(accessSize - 1), Int4(limit()));
+ return CmpGE(offsets(), 0) & CmpLT(offsets() + SIMD::Int(accessSize - 1), limit());
}
-bool Pointer4::isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
+bool SIMD::Pointer::isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const
{
if(hasDynamicOffsets)
{
@@ -970,7 +1040,7 @@
}
}
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
if(staticOffsets[i] + accessSize - 1 >= staticLimit)
{
@@ -981,14 +1051,14 @@
return true;
}
-rr::Int Pointer4::limit() const
+SIMD::Int SIMD::Pointer::limit() const
{
return dynamicLimit + staticLimit;
}
// Returns true if all offsets are compile-time static and sequential
// (N+0*step, N+1*step, N+2*step, N+3*step)
-bool Pointer4::hasStaticSequentialOffsets(unsigned int step) const
+bool SIMD::Pointer::hasStaticSequentialOffsets(unsigned int step) const
{
ASSERT_MSG(isBasePlusOffset, "No offsets for this type of pointer");
if(hasDynamicOffsets)
@@ -996,7 +1066,7 @@
return false;
}
- for(int i = 1; i < 4; i++)
+ for(int i = 1; i < SIMD::Width; i++)
{
if(staticOffsets[i - 1] + int32_t(step) != staticOffsets[i])
{
@@ -1009,7 +1079,7 @@
// Returns true if all offsets are compile-time static and equal
// (N, N, N, N)
-bool Pointer4::hasStaticEqualOffsets() const
+bool SIMD::Pointer::hasStaticEqualOffsets() const
{
ASSERT_MSG(isBasePlusOffset, "No offsets for this type of pointer");
if(hasDynamicOffsets)
@@ -1017,7 +1087,7 @@
return false;
}
- for(int i = 1; i < 4; i++)
+ for(int i = 1; i < SIMD::Width; i++)
{
if(staticOffsets[0] != staticOffsets[i])
{
@@ -1028,22 +1098,22 @@
return true;
}
-Pointer<Byte> Pointer4::getUniformPointer() const
+scalar::Pointer<Byte> SIMD::Pointer::getUniformPointer() const
{
#ifndef NDEBUG
if(isBasePlusOffset)
{
- Int4 uniform = offsets();
- Int x = Extract(uniform, 0);
+ SIMD::Int uniform = offsets();
+ scalar::Int x = Extract(uniform, 0);
- for(int i = 1; i < 4; i++)
+ for(int i = 1; i < SIMD::Width; i++)
{
Assert(x == Extract(uniform, i));
}
}
else
{
- for(int i = 1; i < 4; i++)
+ for(int i = 1; i < SIMD::Width; i++)
{
Assert(pointers[0] == pointers[i]);
}
@@ -1053,7 +1123,7 @@
return getPointerForLane(0);
}
-Pointer<Byte> Pointer4::getPointerForLane(int lane) const
+scalar::Pointer<Byte> SIMD::Pointer::getPointerForLane(int lane) const
{
if(isBasePlusOffset)
{
@@ -1065,19 +1135,19 @@
}
}
-void Pointer4::castTo(UInt4 &bits) const
+void SIMD::Pointer::castTo(SIMD::UInt &bits) const
{
assert(sizeof(void *) == 4);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
- bits = Insert(bits, As<UInt>(pointers[i]), i);
+ bits = Insert(bits, As<scalar::UInt>(pointers[i]), i);
}
}
-void Pointer4::castTo(UInt4 &lowerBits, UInt4 &upperBits) const
+void SIMD::Pointer::castTo(SIMD::UInt &lowerBits, SIMD::UInt &upperBits) const
{
assert(sizeof(void *) == 8);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
UInt2 address = As<UInt2>(pointers[i]);
lowerBits = Insert(lowerBits, Extract(address, 0), i);
@@ -1085,10 +1155,10 @@
}
}
-Pointer4 Pointer4::IfThenElse(Int4 condition, const Pointer4 &lhs, const Pointer4 &rhs)
+SIMD::Pointer SIMD::Pointer::IfThenElse(SIMD::Int condition, const SIMD::Pointer &lhs, const SIMD::Pointer &rhs)
{
- std::vector<Pointer<Byte>> pointers(4);
- for(int i = 0; i < 4; i++)
+ std::vector<scalar::Pointer<Byte>> pointers(SIMD::Width);
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(condition, i) != 0)
{
@@ -1104,7 +1174,7 @@
}
#ifdef ENABLE_RR_PRINT
-std::vector<rr::Value *> Pointer4::getPrintValues() const
+std::vector<rr::Value *> SIMD::Pointer::getPrintValues() const
{
if(isBasePlusOffset)
{
@@ -1112,7 +1182,12 @@
}
else
{
- return PrintValue::vals(pointers[0], pointers[1], pointers[2], pointers[3]);
+ std::vector<Value *> vals;
+ for(int i = 0; i < SIMD::Width; i++)
+ {
+ vals.push_back(RValue<scalar::Pointer<Byte>>(pointers[i]).value());
+ }
+ return vals;
}
}
#endif
diff --git a/src/Reactor/SIMD.hpp b/src/Reactor/SIMD.hpp
index c1d2783..6508704 100644
--- a/src/Reactor/SIMD.hpp
+++ b/src/Reactor/SIMD.hpp
@@ -25,6 +25,8 @@
using Int = rr::Int;
using UInt = rr::UInt;
using Float = rr::Float;
+template<class T>
+using Pointer = rr::Pointer<T>;
} // namespace scalar
namespace packed {
@@ -40,6 +42,7 @@
class Int;
class UInt;
class Float;
+class Pointer;
class Int : public LValue<SIMD::Int>,
public XYZW<SIMD::Int> // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
@@ -49,6 +52,8 @@
Int();
Int(int broadcast);
+ Int(int x, int y, int z, int w);
+ Int(std::vector<int> v);
Int(RValue<SIMD::Int> rhs);
Int(const Int &rhs);
Int(const Reference<SIMD::Int> &rhs);
@@ -59,6 +64,9 @@
Int(const scalar::Int &rhs);
Int(const Reference<scalar::Int> &rhs);
+ template<int T>
+ Int(const SwizzleMask1<packed::Int4, T> &rhs);
+
RValue<SIMD::Int> operator=(int broadcast);
RValue<SIMD::Int> operator=(RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator=(const Int &rhs);
@@ -76,6 +84,8 @@
UInt();
UInt(int broadcast);
+ UInt(int x, int y, int z, int w);
+ UInt(std::vector<int> v);
UInt(RValue<SIMD::UInt> rhs);
UInt(const UInt &rhs);
UInt(const Reference<SIMD::UInt> &rhs);
@@ -103,6 +113,8 @@
Float();
Float(float broadcast);
+ Float(float x, float y, float z, float w);
+ Float(std::vector<float> v);
Float(RValue<SIMD::Float> rhs);
Float(const Float &rhs);
Float(const Reference<SIMD::Float> &rhs);
@@ -110,6 +122,11 @@
Float(const scalar::Float &rhs);
Float(const Reference<scalar::Float> &rhs);
+ Float(RValue<packed::Float4> rhs);
+ RValue<SIMD::Float> operator=(RValue<packed::Float4> rhs);
+ template<int T>
+ Float(const SwizzleMask1<packed::Float4, T> &rhs);
+
RValue<SIMD::Float> operator=(float broadcast);
RValue<SIMD::Float> operator=(RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator=(const Float &rhs);
@@ -124,27 +141,25 @@
static int element_count() { return SIMD::Width; }
};
-} // namespace SIMD
-
-class Pointer4
+class Pointer
{
public:
- Pointer4(Pointer<Byte> base, Int limit);
- Pointer4(Pointer<Byte> base, unsigned int limit);
- Pointer4(Pointer<Byte> base, Int limit, Int4 offset);
- Pointer4(Pointer<Byte> base, unsigned int limit, Int4 offset);
- Pointer4(std::vector<Pointer<Byte>> pointers);
- explicit Pointer4(UInt4 cast); // Cast from 32-bit integers to 32-bit pointers
- explicit Pointer4(UInt4 castLow, UInt4 castHight); // Cast from pairs of 32-bit integers to 64-bit pointers
+ Pointer(scalar::Pointer<Byte> base, scalar::Int limit);
+ Pointer(scalar::Pointer<Byte> base, unsigned int limit);
+ Pointer(scalar::Pointer<Byte> base, scalar::Int limit, SIMD::Int offset);
+ Pointer(scalar::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
+ Pointer(std::vector<scalar::Pointer<Byte>> pointers);
+ explicit Pointer(SIMD::UInt cast); // Cast from 32-bit integers to 32-bit pointers
+    explicit Pointer(SIMD::UInt castLow, SIMD::UInt castHigh);  // Cast from pairs of 32-bit integers to 64-bit pointers
- Pointer4 &operator+=(Int4 i);
- Pointer4 operator+(Int4 i);
- Pointer4 &operator+=(int i);
- Pointer4 operator+(int i);
+ Pointer &operator+=(SIMD::Int i);
+ Pointer operator+(SIMD::Int i);
+ Pointer &operator+=(int i);
+ Pointer operator+(int i);
- Int4 offsets() const;
+ SIMD::Int offsets() const;
- Int4 isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
+ SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;
@@ -159,20 +174,20 @@
bool hasStaticEqualOffsets() const;
template<typename T>
- inline T Load(OutOfBoundsBehavior robustness, Int4 mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
+ inline T Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));
template<typename T>
- inline void Store(T val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+ inline void Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
template<typename T>
- inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
+ inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);
- Pointer<Byte> getUniformPointer() const;
- Pointer<Byte> getPointerForLane(int lane) const;
- static Pointer4 IfThenElse(Int4 condition, const Pointer4 &lhs, const Pointer4 &rhs);
+ scalar::Pointer<Byte> getUniformPointer() const;
+ scalar::Pointer<Byte> getPointerForLane(int lane) const;
+ static Pointer IfThenElse(SIMD::Int condition, const Pointer &lhs, const Pointer &rhs);
- void castTo(UInt4 &bits) const; // Cast from 32-bit pointers to 32-bit integers
- void castTo(UInt4 &lowerBits, UInt4 &upperBits) const; // Cast from 64-bit pointers to pairs of 32-bit integers
+ void castTo(SIMD::UInt &bits) const; // Cast from 32-bit pointers to 32-bit integers
+ void castTo(SIMD::UInt &lowerBits, SIMD::UInt &upperBits) const; // Cast from 64-bit pointers to pairs of 32-bit integers
#ifdef ENABLE_RR_PRINT
std::vector<rr::Value *> getPrintValues() const;
@@ -180,24 +195,26 @@
private:
// Base address for the pointer, common across all lanes.
- Pointer<Byte> base;
+ scalar::Pointer<Byte> base;
// Per-lane address for dealing with non-uniform data
- std::vector<Pointer<Byte>> pointers;
+ std::vector<scalar::Pointer<Byte>> pointers;
public:
// Upper (non-inclusive) limit for offsets from base.
- Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
+ scalar::Int dynamicLimit; // If hasDynamicLimit is false, dynamicLimit is zero.
unsigned int staticLimit = 0;
// Per lane offsets from base.
- Int4 dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
+ SIMD::Int dynamicOffsets; // If hasDynamicOffsets is false, all dynamicOffsets are zero.
std::vector<int32_t> staticOffsets;
bool hasDynamicLimit = false; // True if dynamicLimit is non-zero.
bool hasDynamicOffsets = false; // True if any dynamicOffsets are non-zero.
- bool isBasePlusOffset = false; // True if this uses base+offsets. False if this is a collection of Pointers
+ bool isBasePlusOffset = false; // True if this uses base+offset. False if this is a collection of Pointers
};
+} // namespace SIMD
+
RValue<SIMD::Int> operator+(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator-(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator*(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
@@ -429,10 +446,10 @@
RValue<SIMD::UInt> Shuffle(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y, uint16_t select);
RValue<SIMD::Float> Shuffle(RValue<SIMD::Float> x, RValue<SIMD::Float> y, uint16_t select);
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes = false);
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment);
+RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
+void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);
+void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);
template<>
inline RValue<SIMD::Int>::RValue(int i)
@@ -455,38 +472,33 @@
RR_DEBUG_INFO_EMIT_VAR(val);
}
-template<typename T>
-struct Element
-{};
-template<>
-struct Element<Float4>
+template<int T>
+SIMD::Int::Int(const SwizzleMask1<packed::Int4, T> &rhs)
+ : XYZW(this)
{
- using type = Float;
-};
-template<>
-struct Element<Int4>
+ *this = rhs.operator RValue<scalar::Int>();
+}
+
+template<int T>
+SIMD::Float::Float(const SwizzleMask1<packed::Float4, T> &rhs)
+ : XYZW(this)
{
- using type = Int;
-};
-template<>
-struct Element<UInt4>
-{
- using type = UInt;
-};
+ *this = rhs.operator RValue<scalar::Float>();
+}
template<typename T>
-inline T Pointer4::Load(OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
- using EL = typename Element<T>::type;
+ using EL = typename Scalar<T>::Type;
if(!isBasePlusOffset)
{
T out = T(0);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
- auto el = rr::Load(Pointer<EL>(pointers[i]), alignment, atomic, order);
+ auto el = rr::Load(scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
out = Insert(out, el, i);
}
}
@@ -501,13 +513,13 @@
if(hasStaticSequentialOffsets(sizeof(float)))
{
// Offsets are sequential. Perform regular load.
- return rr::Load(Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
+ return rr::Load(scalar::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
}
if(hasStaticEqualOffsets())
{
// Load one, replicate.
- return T(*Pointer<EL>(base + staticOffsets[0], alignment));
+ return T(*scalar::Pointer<EL>(base + staticOffsets[0], alignment));
}
}
else
@@ -537,7 +549,7 @@
T out = T(0);
If(AnyTrue(mask))
{
- EL el = *Pointer<EL>(base + staticOffsets[0], alignment);
+ EL el = *scalar::Pointer<EL>(base + staticOffsets[0], alignment);
out = T(el);
}
return out;
@@ -558,7 +570,7 @@
// TODO(b/195446858): Optimize static sequential offsets case by using masked load.
- return Gather(Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
+ return Gather(scalar::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
}
else
{
@@ -568,24 +580,24 @@
{
// Load one, replicate.
auto offset = Extract(offs, 0);
- out = T(rr::Load(Pointer<EL>(&base[offset]), alignment, atomic, order));
+ out = T(rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order));
}
Else If(hasStaticSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
{
// Load all elements in a single SIMD instruction.
auto offset = Extract(offs, 0);
- out = rr::Load(Pointer<T>(&base[offset]), alignment, atomic, order);
+ out = rr::Load(scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
}
Else
{
// Divergent offsets or masked lanes.
out = T(0);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
auto offset = Extract(offs, i);
- auto el = rr::Load(Pointer<EL>(&base[offset]), alignment, atomic, order);
+ auto el = rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
out = Insert(out, el, i);
}
}
@@ -595,34 +607,34 @@
}
template<>
-inline Pointer4 Pointer4::Load(OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
+inline SIMD::Pointer SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
- std::vector<Pointer<Byte>> pointers(4);
+ std::vector<scalar::Pointer<Byte>> pointers(SIMD::Width);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
- pointers[i] = rr::Load(Pointer<Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
+ pointers[i] = rr::Load(scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
}
}
- return Pointer4(pointers);
+ return SIMD::Pointer(pointers);
}
template<typename T>
-inline void Pointer4::Store(T val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
- using EL = typename Element<T>::type;
+ using EL = typename Scalar<T>::Type;
constexpr size_t alignment = sizeof(float);
if(!isBasePlusOffset)
{
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
- rr::Store(Extract(val, i), Pointer<EL>(pointers[i]), alignment, atomic, order);
+ rr::Store(Extract(val, i), scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
}
}
return;
@@ -647,15 +659,17 @@
{
If(AnyTrue(mask))
{
+ assert(SIMD::Width == 4);
+
// All equal. One of these writes will win -- elect the winning lane.
- auto v0111 = Int4(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
+ auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
- auto maskedVal = As<Int4>(val) & elect;
+ auto maskedVal = As<SIMD::Int>(val) & elect;
auto scalarVal = Extract(maskedVal, 0) |
Extract(maskedVal, 1) |
Extract(maskedVal, 2) |
Extract(maskedVal, 3);
- *Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
+ *scalar::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
}
}
else if(hasStaticSequentialOffsets(sizeof(float)) &&
@@ -664,13 +678,13 @@
// TODO(b/195446858): Optimize using masked store.
// Pointer has no elements OOB, and the store is not atomic.
// Perform a read-modify-write.
- auto p = Pointer<Int4>(base + staticOffsets[0], alignment);
+ auto p = scalar::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
auto prev = *p;
- *p = (prev & ~mask) | (As<Int4>(val) & mask);
+ *p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
}
else
{
- Scatter(Pointer<EL>(base), val, offs, mask, alignment);
+ Scatter(scalar::Pointer<EL>(base), val, offs, mask, alignment);
}
}
else
@@ -680,17 +694,17 @@
{
// Store all elements in a single SIMD instruction.
auto offset = Extract(offs, 0);
- rr::Store(val, Pointer<T>(&base[offset]), alignment, atomic, order);
+ rr::Store(val, scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
}
Else
{
// Divergent offsets or masked lanes.
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
auto offset = Extract(offs, i);
- rr::Store(Extract(val, i), Pointer<EL>(&base[offset]), alignment, atomic, order);
+ rr::Store(Extract(val, i), scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
}
}
}
@@ -698,21 +712,21 @@
}
template<>
-inline void Pointer4::Store(Pointer4 val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+inline void SIMD::Pointer::Store(SIMD::Pointer val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
constexpr size_t alignment = sizeof(void *);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
- rr::Store(val.getPointerForLane(i), Pointer<Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
+ rr::Store(val.getPointerForLane(i), scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
}
}
}
template<typename T>
-inline void Pointer4::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int4 mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
+inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
Store(T(val), robustness, mask, atomic, order);
}
diff --git a/src/Reactor/SubzeroReactor.cpp b/src/Reactor/SubzeroReactor.cpp
index 206e578..8f8bd70 100644
--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -4103,7 +4103,7 @@
using UnderlyingTypeT = typename UnderlyingType<T>::Type;
template<typename T, typename EL = UnderlyingTypeT<T>>
-static void gather(T &out, RValue<Pointer<EL>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes)
+static void gather(T &out, RValue<Pointer<EL>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes)
{
constexpr bool atomic = false;
constexpr std::memory_order order = std::memory_order_relaxed;
@@ -4111,7 +4111,7 @@
Pointer<Byte> baseBytePtr = base;
out = T(0);
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
@@ -4127,14 +4127,14 @@
}
template<typename T, typename EL = UnderlyingTypeT<T>>
-static void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+static void scatter(RValue<Pointer<EL>> base, RValue<T> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
{
constexpr bool atomic = false;
constexpr std::memory_order order = std::memory_order_relaxed;
Pointer<Byte> baseBytePtr = base;
- for(int i = 0; i < 4; i++)
+ for(int i = 0; i < SIMD::Width; i++)
{
If(Extract(mask, i) != 0)
{
@@ -4144,32 +4144,32 @@
}
}
-RValue<Float4> Gather(RValue<Pointer<Float>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
{
RR_DEBUG_INFO_UPDATE_LOC();
- Float4 result{};
+ SIMD::Float result{};
gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
return result;
}
-RValue<Int4> Gather(RValue<Pointer<Int>> base, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
+RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
{
RR_DEBUG_INFO_UPDATE_LOC();
- Int4 result{};
+ SIMD::Int result{};
gather(result, base, offsets, mask, alignment, zeroMaskedLanes);
return result;
}
-void Scatter(RValue<Pointer<Float>> base, RValue<Float4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
{
RR_DEBUG_INFO_UPDATE_LOC();
scatter(base, val, offsets, mask, alignment);
}
-void Scatter(RValue<Pointer<Int>> base, RValue<Int4> val, RValue<Int4> offsets, RValue<Int4> mask, unsigned int alignment)
+void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
{
RR_DEBUG_INFO_UPDATE_LOC();
- scatter<Int4>(base, val, offsets, mask, alignment);
+ scatter<SIMD::Int>(base, val, offsets, mask, alignment);
}
RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
diff --git a/tests/ReactorUnitTests/ReactorSIMD.cpp b/tests/ReactorUnitTests/ReactorSIMD.cpp
index de2deda..a4ebe8e 100644
--- a/tests/ReactorUnitTests/ReactorSIMD.cpp
+++ b/tests/ReactorUnitTests/ReactorSIMD.cpp
@@ -136,79 +136,73 @@
TEST(ReactorSIMD, Intrinsics_Scatter)
{
- Function<Void(Pointer<Float> base, Pointer<Float4> val, Pointer<Int4> offsets)> function;
+ Function<Void(Pointer<Float> base, Pointer<SIMD::Float> val, Pointer<SIMD::Int> offsets)> function;
{
Pointer<Float> base = function.Arg<0>();
- Pointer<Float4> val = function.Arg<1>();
- Pointer<Int4> offsets = function.Arg<2>();
+ Pointer<SIMD::Float> val = function.Arg<1>();
+ Pointer<SIMD::Int> offsets = function.Arg<2>();
- auto mask = Int4(~0, ~0, ~0, ~0);
+ SIMD::Int mask = ~0;
unsigned int alignment = 1;
Scatter(base, *val, *offsets, mask, alignment);
}
- float buffer[16] = { 0 };
+ std::vector<float> buffer(10 + 10 * SIMD::Width);
+ std::vector<int> offsets(SIMD::Width);
+ std::vector<float> val(SIMD::Width);
- constexpr auto elemSize = sizeof(buffer[0]);
-
- int offsets[] = {
- 1 * elemSize,
- 6 * elemSize,
- 11 * elemSize,
- 13 * elemSize
- };
-
- float val[4] = { 10, 60, 110, 130 };
+ for(int i = 0; i < SIMD::Width; i++)
+ {
+ offsets[i] = (3 + 7 * i) * sizeof(float);
+ val[i] = 13.0f + 17.0f * i;
+ }
auto routine = function(testName().c_str());
auto entry = (void (*)(float *, float *, int *))routine->getEntry();
- entry(buffer, val, offsets);
+ entry(buffer.data(), val.data(), offsets.data());
- EXPECT_EQ(buffer[offsets[0] / sizeof(buffer[0])], 10);
- EXPECT_EQ(buffer[offsets[1] / sizeof(buffer[0])], 60);
- EXPECT_EQ(buffer[offsets[2] / sizeof(buffer[0])], 110);
- EXPECT_EQ(buffer[offsets[3] / sizeof(buffer[0])], 130);
+ for(int i = 0; i < SIMD::Width; i++)
+ {
+ EXPECT_EQ(buffer[offsets[i] / sizeof(float)], val[i]);
+ }
}
-TEST(ReactorUnitTests, Intrinsics_Gather)
+TEST(ReactorSIMD, Intrinsics_Gather)
{
- Function<Void(Pointer<Float> base, Pointer<Int4> offsets, Pointer<Float4> result)> function;
+ Function<Void(Pointer<Float> base, Pointer<SIMD::Int> offsets, Pointer<SIMD::Float> result)> function;
{
Pointer<Float> base = function.Arg<0>();
- Pointer<Int4> offsets = function.Arg<1>();
- Pointer<Float4> result = function.Arg<2>();
+ Pointer<SIMD::Int> offsets = function.Arg<1>();
+ Pointer<SIMD::Float> result = function.Arg<2>();
- auto mask = Int4(~0, ~0, ~0, ~0);
+ SIMD::Int mask = ~0;
unsigned int alignment = 1;
bool zeroMaskedLanes = true;
*result = Gather(base, *offsets, mask, alignment, zeroMaskedLanes);
}
- float buffer[] = {
- 0, 10, 20, 30,
- 40, 50, 60, 70,
- 80, 90, 100, 110,
- 120, 130, 140, 150
- };
+ std::vector<float> buffer(10 + 10 * SIMD::Width);
+ std::vector<int> offsets(SIMD::Width);
- constexpr auto elemSize = sizeof(buffer[0]);
+ std::vector<float> val(SIMD::Width);
- int offsets[] = {
- 1 * elemSize,
- 6 * elemSize,
- 11 * elemSize,
- 13 * elemSize
- };
+ for(int i = 0; i < SIMD::Width; i++)
+ {
+ offsets[i] = (3 + 7 * i) * sizeof(float);
+ val[i] = 13.0f + 17.0f * i;
+
+ buffer[offsets[i] / sizeof(float)] = val[i];
+ }
auto routine = function(testName().c_str());
auto entry = (void (*)(float *, int *, float *))routine->getEntry();
- float result[4] = {};
- entry(buffer, offsets, result);
+ std::vector<float> result(SIMD::Width);
+ entry(buffer.data(), offsets.data(), result.data());
- EXPECT_EQ(result[0], 10);
- EXPECT_EQ(result[1], 60);
- EXPECT_EQ(result[2], 110);
- EXPECT_EQ(result[3], 130);
+ for(int i = 0; i < SIMD::Width; i++)
+ {
+ EXPECT_EQ(result[i], val[i]);
+ }
}