Revert "Float only writeColor"

This reverts commit 978e2469e6b98de6784c27cbba69121e054f4351.

Reason for revert: Potentially causing some failures in https://angle-gold.skia.org/diff?grouping=name%3Dhearthstone%26source_type%3Dangle&left=f9178a683907f1c8d806c0970e2468c2&right=af6167f064bfd553d11eaed8d1abb29f&changelist_id=4098332&crs=gerrit

Change-Id: Ie7363edbf4e32a886d0732269c5584248f6a508b
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/70329
Reviewed-by: Alexis Hétu <sugoi@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Shahbaz Youssefi <syoussefi@google.com>
diff --git a/src/Device/BUILD.gn b/src/Device/BUILD.gn
index 4497e3b..c1ae1ff 100644
--- a/src/Device/BUILD.gn
+++ b/src/Device/BUILD.gn
@@ -70,6 +70,7 @@
 
   defines = [
      "SWIFTSHADER_ENABLE_ASTC",  # TODO(b/150130101)
+     "SWIFTSHADER_LEGACY_PRECISION=true",  # TODO(b/226657516)
   ]
 
   deps = [
diff --git a/src/Pipeline/BUILD.gn b/src/Pipeline/BUILD.gn
index d97f527..9816b0a 100644
--- a/src/Pipeline/BUILD.gn
+++ b/src/Pipeline/BUILD.gn
@@ -65,6 +65,10 @@
     "../../third_party/SPIRV-Tools/include",
   ]
 
+  defines = [
+     "SWIFTSHADER_LEGACY_PRECISION=true",  # TODO(chromium:1299047)
+  ]
+
   deps = [
     "../../third_party/SPIRV-Tools:spvtools_headers",
     "../../third_party/marl:Marl_headers",
diff --git a/src/Pipeline/Constants.cpp b/src/Pipeline/Constants.cpp
index 6535272..dda1f39 100644
--- a/src/Pipeline/Constants.cpp
+++ b/src/Pipeline/Constants.cpp
@@ -275,6 +275,12 @@
 		sRGBtoLinearFF_FF00[i] = (unsigned short)(sRGBtoLinear((float)i / 0xFF) * 0xFF00 + 0.5f);
 	}
 
+	for(int i = 0; i < 0x1000; i++)  // one table entry per 12-bit input code (0x1000 == 4096 entries)
+	{
+		linearToSRGB12_16[i] = (unsigned short)(clamp(linearToSRGB((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));  // index normalized to [0, 1]; result scaled to 16 bits, +0.5f rounds, clamp keeps it in unsigned short range
+		sRGBtoLinear12_16[i] = (unsigned short)(clamp(sRGBtoLinear((float)i / 0x0FFF) * 0xFFFF + 0.5f, 0.0f, (float)0xFFFF));  // same normalization and scaling, using sRGBtoLinear()
+	}
+
 	for(int q = 0; q < 4; q++)
 	{
 		for(int c = 0; c < 16; c++)
diff --git a/src/Pipeline/Constants.hpp b/src/Pipeline/Constants.hpp
index ab410f6..a857747 100644
--- a/src/Pipeline/Constants.hpp
+++ b/src/Pipeline/Constants.hpp
@@ -108,6 +108,9 @@
 
 	unsigned short sRGBtoLinearFF_FF00[256];
 
+	unsigned short linearToSRGB12_16[4096];  // indexed by a 12-bit value; holds linearToSRGB(index / 0xFFF) scaled to 0..0xFFFF
+	unsigned short sRGBtoLinear12_16[4096];  // indexed by a 12-bit value; holds sRGBtoLinear(index / 0xFFF) scaled to 0..0xFFFF
+
 	// Centroid parameters
 	float4 sampleX[4][16];
 	float4 sampleY[4][16];
diff --git a/src/Pipeline/PixelProgram.cpp b/src/Pipeline/PixelProgram.cpp
index 87a8dab..c73bb42 100644
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -266,18 +266,91 @@
 			continue;
 		}
 
-		for(unsigned int q : samples)
+		auto format = state.colorFormat[index];  // decides which writeColor() overload (Vector4s or Vector4f) handles this attachment
+		switch(format)
 		{
-			Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+		case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+		case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+		case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		case VK_FORMAT_B8G8R8A8_UNORM:
+		case VK_FORMAT_B8G8R8A8_SRGB:
+		case VK_FORMAT_R8G8B8A8_UNORM:
+		case VK_FORMAT_R8G8B8A8_SRGB:
+		case VK_FORMAT_R8G8_UNORM:
+		case VK_FORMAT_R8_UNORM:
+		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+			for(unsigned int q : samples)  // these formats: blend in float, convert to 16-bit unsigned, store through the Vector4s overload
+			{
+				Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
 
-			SIMD::Float4 C = alphaBlend(index, buffer, c[index], x);
-			ASSERT(SIMD::Width == 4);
-			Vector4f color;
-			color.x = Extract128(C.x, 0);
-			color.y = Extract128(C.y, 0);
-			color.z = Extract128(C.z, 0);
-			color.w = Extract128(C.w, 0);
-			writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+				SIMD::Float4 colorf = alphaBlend(index, buffer, c[index], x);
+
+				ASSERT(SIMD::Width == 4);
+				Vector4s color;
+				color.x = UShort4(Extract128(colorf.x, 0) * 0xFFFF, true);  // Saturating
+				color.y = UShort4(Extract128(colorf.y, 0) * 0xFFFF, true);  // Saturating
+				color.z = UShort4(Extract128(colorf.z, 0) * 0xFFFF, true);  // Saturating
+				color.w = UShort4(Extract128(colorf.w, 0) * 0xFFFF, true);  // Saturating
+				writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+			}
+			break;
+		case VK_FORMAT_R16_SFLOAT:
+		case VK_FORMAT_R16G16_SFLOAT:
+		case VK_FORMAT_R16G16B16A16_SFLOAT:
+		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+		case VK_FORMAT_R32_SFLOAT:
+		case VK_FORMAT_R32G32_SFLOAT:
+		case VK_FORMAT_R32G32B32A32_SFLOAT:
+		case VK_FORMAT_R32_SINT:
+		case VK_FORMAT_R32G32_SINT:
+		case VK_FORMAT_R32G32B32A32_SINT:
+		case VK_FORMAT_R32_UINT:
+		case VK_FORMAT_R32G32_UINT:
+		case VK_FORMAT_R32G32B32A32_UINT:
+		case VK_FORMAT_R16_UNORM:
+		case VK_FORMAT_R16G16_UNORM:
+		case VK_FORMAT_R16G16B16A16_UNORM:
+		case VK_FORMAT_R16_SINT:
+		case VK_FORMAT_R16G16_SINT:
+		case VK_FORMAT_R16G16B16A16_SINT:
+		case VK_FORMAT_R16_UINT:
+		case VK_FORMAT_R16G16_UINT:
+		case VK_FORMAT_R16G16B16A16_UINT:
+		case VK_FORMAT_R8_SINT:
+		case VK_FORMAT_R8G8_SINT:
+		case VK_FORMAT_R8G8B8A8_SINT:
+		case VK_FORMAT_R8_UINT:
+		case VK_FORMAT_R8G8_UINT:
+		case VK_FORMAT_R8G8B8A8_UINT:
+		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
+		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
+		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
+		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
+			for(unsigned int q : samples)  // float, integer and 16-bit UNORM formats: blended floats go unconverted to the Vector4f overload
+			{
+				Pointer<Byte> buffer = cBuffer[index] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[index]));
+
+				SIMD::Float4 C = alphaBlend(index, buffer, c[index], x);
+				ASSERT(SIMD::Width == 4);
+				Vector4f color;
+				color.x = Extract128(C.x, 0);
+				color.y = Extract128(C.y, 0);
+				color.z = Extract128(C.z, 0);
+				color.w = Extract128(C.w, 0);
+				writeColor(index, buffer, x, color, sMask[q], zMask[q], cMask[q]);
+			}
+			break;
+		default:
+			UNSUPPORTED("VkFormat: %d", int(format));  // format appears in neither case list above
 		}
 	}
 }
diff --git a/src/Pipeline/PixelRoutine.cpp b/src/Pipeline/PixelRoutine.cpp
index 409479c..1d46605 100644
--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -1172,6 +1172,576 @@
 	default:
 		UNSUPPORTED("VkFormat %d", int(format));
 	}
+
+	if(isSRGB(index))  // sRGB attachment: convert the x/y/z channels of `pixel` to linear; w is not modified by the callee
+	{
+		sRGBtoLinear16_12_16(pixel);
+	}
+}
+
+void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)  // 16-bit-per-channel overload: biases, packs and stores `current` (modified in place) into two rows of color buffer `index`, under the pixel and channel write masks
+{
+	if(isSRGB(index))  // sRGB attachment: encode x/y/z through the 12-bit LUT first; w (alpha) is not converted
+	{
+		linearToSRGB16_12_16(current);
+	}
+
+	vk::Format format = state.colorFormat[index];
+	switch(format)  // Step 1: per-format bias applied before the channels are truncated to the target bit depth
+	{
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);  // v - (v >> 8) + 0x80: pre-bias so the later >> 8 rounds instead of truncating
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);  // alpha keeps only 2 bits in these formats, hence the different shift and bias
+		break;
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 4) + Short4(0x0800);
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
+		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);  // alpha keeps only its top bit in these formats
+		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
+		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);  // y keeps 6 bits here; x and z keep 5
+		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
+		break;
+	default:
+		break;
+	}
+
+	int writeMask = state.colorWriteActive(index);  // per-channel write-enable bits; bit 0 and bit 2 are exchanged below for BGR formats
+	if(format.isBGRformat())
+	{
+		// For BGR formats, flip R and B channels in the channels mask
+		writeMask = (writeMask & 0x0000000A) | (writeMask & 0x00000001) << 2 | (writeMask & 0x00000004) >> 2;
+	}
+
+	switch(format)  // Step 2: truncate each channel and pack the channels into the destination bit layout
+	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+		{
+			current.x = As<UShort4>(current.x & Short4(0xF000));
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
+			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 8;
+			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;  // x in bits 15..12, y in 11..8, z in 7..4, w in 3..0
+		}
+		break;
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+		{
+			current.z = As<UShort4>(current.z & Short4(0xF000));
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
+			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 8;
+			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+		{
+			current.w = As<UShort4>(current.w & Short4(0xF000));
+			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 4;
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
+			current.z = As<UShort4>(current.z & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+		{
+			current.w = As<UShort4>(current.w & Short4(0xF000));
+			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 4;
+			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
+			current.x = As<UShort4>(current.x & Short4(0xF000u)) >> 12;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		{
+			current.x = As<UShort4>(current.x & Short4(0xF800));
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
+			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 10;
+			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
+
+			current.x = current.x | current.y | current.z | current.w;  // x in bits 15..11, y in 10..6, z in 5..1, w in bit 0
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		{
+			current.z = As<UShort4>(current.z & Short4(0xF800));
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
+			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 10;
+			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		{
+			current.w = current.w & Short4(0x8000u);
+			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
+			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
+			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
+
+			current.x = current.x | current.y | current.z | current.w;
+		}
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		{
+			current.x = current.x & Short4(0xF800u);
+			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+			current.z = As<UShort4>(current.z) >> 11;
+
+			current.x = current.x | current.y | current.z;  // x in bits 15..11, y in 10..5, z in 4..0
+		}
+		break;
+	case VK_FORMAT_B5G6R5_UNORM_PACK16:
+		{
+			current.z = current.z & Short4(0xF800u);
+			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
+			current.x = As<UShort4>(current.x) >> 11;
+
+			current.x = current.x | current.y | current.z;
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		if(writeMask == 0x7)  // write-mask bit 3 clear: current.w is neither shifted nor packed in this branch
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.z, current.x));  // operand order (z, x) is the only difference from the R8G8B8A8 case below, which packs (x, z)
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		else
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.z, current.x));
+			current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		break;
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		if(writeMask == 0x7)
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.x, current.z));
+			current.y = As<Short4>(PackUnsigned(current.y, current.y));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		else
+		{
+			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
+			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
+
+			current.z = As<Short4>(PackUnsigned(current.x, current.z));
+			current.y = As<Short4>(PackUnsigned(current.y, current.w));
+
+			current.x = current.z;
+			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
+			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
+			current.y = current.z;
+			current.z = As<Short4>(UnpackLow(current.z, current.x));
+			current.y = As<Short4>(UnpackHigh(current.y, current.x));
+		}
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		current.y = As<Short4>(As<UShort4>(current.y) >> 8);
+		current.x = As<Short4>(PackUnsigned(current.x, current.x));
+		current.y = As<Short4>(PackUnsigned(current.y, current.y));
+		current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
+		break;
+	case VK_FORMAT_R8_UNORM:
+		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
+		current.x = As<Short4>(PackUnsigned(current.x, current.x));
+		break;
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		{
+			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);  // keep the top 10 bits of the 16-bit channel
+			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
+			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
+			auto a = (Int4(current.w) >> 14) & Int4(0x3);  // keep the top 2 bits
+			Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
+			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
+			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
+			current.x = UnpackLow(c02, c13);
+			current.y = UnpackHigh(c02, c13);
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+		{
+			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
+			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
+			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
+			auto a = (Int4(current.w) >> 14) & Int4(0x3);
+			Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;  // r and b trade places relative to the A2B10G10R10 case above
+			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
+			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
+			current.x = UnpackLow(c02, c13);
+			current.y = UnpackHigh(c02, c13);
+		}
+		break;
+	default:
+		UNSUPPORTED("VkFormat: %d", int(format));
+	}
+
+	Short4 c01 = current.z;  // used only by the 8-bit-per-channel BGRA/RGBA stores below: c01 goes to the first row
+	Short4 c23 = current.y;  // ...and c23 to the row at +pitchB
+
+	Int xMask;  // Combination of all masks
+
+	if(state.depthTestActive)  // start from zMask when depth testing is active, otherwise from cMask
+	{
+		xMask = zMask;
+	}
+	else
+	{
+		xMask = cMask;
+	}
+
+	if(state.stencilActive)
+	{
+		xMask &= sMask;
+	}
+
+	Pointer<Byte> buffer = cBuffer;
+	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));  // byte distance between the two rows written below
+
+	switch(format)  // Step 3: masked read-modify-write of the packed pixels, first row then the row at +pitchB
+	{
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);  // current destination contents of the first row
+
+			Int channelMask;
+			switch(format)
+			{
+			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
+				break;
+			case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
+			case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
+				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
+				break;
+			default:
+				UNREACHABLE("Format: %s", vk::Stringify(format).c_str());
+			}
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)  // the channel mask is only needed when some channel is write-disabled
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);  // new bits where the mask is set, existing bits elsewhere
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[writeMask][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[writeMask][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[writeMask][0]));
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+		{
+			buffer += 2 * x;
+			Int value = *Pointer<Int>(buffer);
+
+			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));  // only the low three write-mask bits index this table
+
+			Int c01 = Extract(As<Int2>(current.x), 0);
+			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
+			if((writeMask & 0x00000007) != 0x00000007)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Int>(buffer);
+
+			Int c23 = Extract(As<Int2>(current.x), 1);
+			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
+			if((writeMask & 0x00000007) != 0x00000007)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_B8G8R8A8_UNORM:
+	case VK_FORMAT_B8G8R8A8_SRGB:
+		{
+			buffer += x * 4;
+			Short4 value = *Pointer<Short4>(buffer);
+			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[writeMask]));
+
+			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Short4>(buffer);
+
+			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R8G8B8A8_UNORM:
+	case VK_FORMAT_R8G8B8A8_SRGB:
+	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
+		{
+			buffer += x * 4;
+			Short4 value = *Pointer<Short4>(buffer);
+			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[writeMask]));
+
+			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask01 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
+
+			buffer += pitchB;
+			value = *Pointer<Short4>(buffer);
+
+			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(writeMask != 0x0000000F)
+			{
+				mask23 &= channelMask;
+			}
+			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
+		}
+		break;
+	case VK_FORMAT_R8G8_UNORM:
+		if((writeMask & 0x00000003) != 0x0)  // nothing is stored unless at least one of the two low write-mask bits is set
+		{
+			buffer += 2 * x;
+			Int2 value;
+			value = Insert(value, *Pointer<Int>(buffer), 0);
+			value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);
+
+			Int2 packedCol = As<Int2>(current.x);
+
+			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
+			if((writeMask & 0x3) != 0x3)
+			{
+				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (writeMask & 0x3)]));  // 5 * m == m | (m << 2) for m < 4: the 2-bit mask is repeated in bits 2..3 of the table index
+				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
+				mergedMask &= rgbaMask;
+			}
+
+			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
+
+			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
+			*Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
+		}
+		break;
+	case VK_FORMAT_R8_UNORM:
+		if(writeMask & 0x00000001)  // single channel: skip the store entirely when write-mask bit 0 is clear
+		{
+			buffer += 1 * x;
+			Short4 value;
+			value = Insert(value, *Pointer<Short>(buffer), 0);
+			value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);
+
+			current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
+			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
+			current.x |= value;
+
+			*Pointer<Short>(buffer) = Extract(current.x, 0);
+			*Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
+		}
+		break;
+	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
+	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
+		{
+			buffer += 4 * x;
+
+			Int2 value = *Pointer<Int2>(buffer, 16);
+			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
+			if(writeMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);  // first row comes from current.x
+
+			buffer += pitchB;
+
+			value = *Pointer<Int2>(buffer, 16);
+			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
+			if(writeMask != 0xF)
+			{
+				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask][0]));
+			}
+			*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);  // second row comes from current.y
+		}
+		break;
+	default:
+		UNSUPPORTED("VkFormat: %d", int(format));
+	}
 }
 
 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
@@ -1792,13 +2362,6 @@
 			texelColor.y = Float4(As<UShort4>(color.y)) * (1.0f / 0xFFFF);
 			texelColor.z = Float4(As<UShort4>(color.z)) * (1.0f / 0xFFFF);
 			texelColor.w = Float4(As<UShort4>(color.w)) * (1.0f / 0xFFFF);
-
-			if(isSRGB(index))
-			{
-				texelColor.x = sRGBtoLinear(texelColor.x);
-				texelColor.y = sRGBtoLinear(texelColor.y);
-				texelColor.z = sRGBtoLinear(texelColor.z);
-			}
 		}
 		break;
 	}
@@ -2909,4 +3472,65 @@
 	}
 }
 
+void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)  // In place: replaces each lane of c.x/c.y/c.z with its sRGBtoLinear12_16 table entry; c.w is left untouched
+{
+	Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);
+
+	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;  // saturating +7, then drop the low 4 bits: 16-bit value -> 12-bit table index
+	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
+	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
+
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);  // one scalar lookup per lane; 2 * index because the table is addressed in bytes and entries are loaded as Short
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+}
+
+void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)  // In place: reduces c.x/c.y/c.z from 16 to 12 bits, then maps them through linearToSRGB12_16(); c.w is left untouched
+{
+	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;  // saturating +7, then drop the low 4 bits: 16-bit value -> 12-bit table index
+	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
+	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
+
+	linearToSRGB12_16(c);  // table lookup turns the 12-bit indices back into 16-bit values
+}
+
+void PixelRoutine::linearToSRGB12_16(Vector4s &c)  // In place: treats each lane of c.x/c.y/c.z as an index into the linearToSRGB12_16 table and replaces it with the table entry; c.w is left untouched
+{
+	Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);
+
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);  // one scalar lookup per lane; 2 * index because the table is addressed in bytes and entries are loaded as Short
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
+	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
+
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
+	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
+
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
+	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
+}
+
+Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)  // Approximates x^2.2
+{
+	Float4 linear = x * x;
+	linear = linear * 0.73f + linear * x * 0.27f;  // 0.73 * x^2 + 0.27 * x^3
+
+	return Min(Max(linear, 0.0f), 1.0f);  // result clamped to [0, 1]
+}
+
 }  // namespace sw
diff --git a/src/Pipeline/PixelRoutine.hpp b/src/Pipeline/PixelRoutine.hpp
index e1c4eb7..3b5224a 100644
--- a/src/Pipeline/PixelRoutine.hpp
+++ b/src/Pipeline/PixelRoutine.hpp
@@ -56,8 +56,10 @@
 
 	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask);
 	SIMD::Float4 alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x);
+	void writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask);  // 16-bit-per-channel overload; `current` is modified in place while being packed
 
 	bool isSRGB(int index) const;
+	void linearToSRGB12_16(Vector4s &c);  // replaces each lane of c.x/c.y/c.z (used as a table index) with its linearToSRGB12_16 table entry
 
 private:
 	bool hasStencilReplaceRef() const;
@@ -99,6 +101,10 @@
 	void writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples);
 	void occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples);
 
+	void sRGBtoLinear16_12_16(Vector4s &c);  // in place, x/y/z only: 16-bit value -> 12-bit index -> sRGBtoLinear12_16 table entry
+	void linearToSRGB16_12_16(Vector4s &c);  // in place, x/y/z only: 16-bit value -> 12-bit index -> linearToSRGB12_16 table entry
+	Float4 sRGBtoLinear(const Float4 &x);  // polynomial approximation of x^2.2, clamped to [0, 1]
+
 	SIMD::Float readDepth32F(const Pointer<Byte> &zBuffer, int q, const Int &x) const;
 	SIMD::Float readDepth16(const Pointer<Byte> &zBuffer, int q, const Int &x) const;
 
diff --git a/src/Pipeline/ShaderCore.cpp b/src/Pipeline/ShaderCore.cpp
index 635c925..ac9f91f 100644
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -20,6 +20,11 @@
 
 #include <limits.h>
 
+// TODO(chromium:1299047)
+#ifndef SWIFTSHADER_LEGACY_PRECISION  // falls back to false when the build does not define the macro
+#	define SWIFTSHADER_LEGACY_PRECISION false
+#endif
+
 namespace sw {
 
 Vector4s::Vector4s()
@@ -311,6 +316,23 @@
 	return As<SIMD::Float>((precision_loss & As<SIMD::Int>(-atan2_theta)) | (~precision_loss & As<SIMD::Int>(theta)));  // FIXME: Vector select
 }
 
+// TODO(chromium:1299047)
+static RValue<SIMD::Float> Exp2_legacy(RValue<SIMD::Float> x0)  // returns ii * ff: a power-of-two scale times a polynomial in the remainder; the caller in Exp2() clamps x0 first
+{
+	SIMD::Int i = RoundInt(x0 - 0.5f);  // integer part, obtained by rounding x0 - 0.5
+	SIMD::Float ii = As<SIMD::Float>((i + SIMD::Int(127)) << 23);  // i + 127 shifted into bits 23 and up, reinterpreted as float (2^i, assuming IEEE-754 single precision)
+
+	SIMD::Float f = x0 - SIMD::Float(i);  // remainder left after removing the integer part
+	SIMD::Float ff = As<SIMD::Float>(SIMD::Int(0x3AF61905));  // polynomial coefficients are given as raw float bit patterns
+	ff = ff * f + As<SIMD::Float>(SIMD::Int(0x3C134806));  // Horner evaluation, highest-order coefficient first
+	ff = ff * f + As<SIMD::Float>(SIMD::Int(0x3D64AA23));
+	ff = ff * f + As<SIMD::Float>(SIMD::Int(0x3E75EAD4));
+	ff = ff * f + As<SIMD::Float>(SIMD::Int(0x3F31727B));
+	ff = ff * f + 1.0f;  // constant term of the degree-5 polynomial
+
+	return ii * ff;
+}
+
 RValue<SIMD::Float> Exp2(RValue<SIMD::Float> x, bool relaxedPrecision)
 {
 	// Clamp to prevent overflow past the representation of infinity.
@@ -318,6 +340,11 @@
 	x0 = Min(x0, 128.0f);
 	x0 = Max(x0, As<SIMD::Float>(SIMD::Int(0xC2FDFFFF)));  // -126.999992
 
+	if(SWIFTSHADER_LEGACY_PRECISION)  // TODO(chromium:1299047)
+	{
+		return Exp2_legacy(x0);  // x0 has already been clamped by the Min/Max above
+	}
+
 	SIMD::Float xi = Floor(x0);
 	SIMD::Float f = x0 - xi;
 
@@ -378,6 +405,11 @@
 
 RValue<SIMD::Float> Log2(RValue<SIMD::Float> x, bool relaxedPrecision)
 {
+	if(SWIFTSHADER_LEGACY_PRECISION)  // TODO(chromium:1299047)
+	{
+		return Log2_legacy(x);  // returns before the relaxedPrecision check below, so that argument is ignored here
+	}
+
 	if(!relaxedPrecision)  // highp
 	{
 		// Reinterpretation as an integer provides a piecewise linear
@@ -681,6 +713,11 @@
 // TODO(chromium:1299047): Eliminate when Chromium tests accept both fused and unfused multiply-add.
 RValue<SIMD::Float> mulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
 {
+	if(SWIFTSHADER_LEGACY_PRECISION)  // legacy precision: evaluate as a separate multiply and add instead of calling MulAdd()
+	{
+		return x * y + z;
+	}
+
 	return MulAdd(x, y, z);
 }
 
diff --git a/src/Vulkan/BUILD.gn b/src/Vulkan/BUILD.gn
index a7703f0..5556abd 100644
--- a/src/Vulkan/BUILD.gn
+++ b/src/Vulkan/BUILD.gn
@@ -63,6 +63,7 @@
 
   defines += [
      "SWIFTSHADER_ENABLE_ASTC",  # TODO(b/150130101)
+     "SWIFTSHADER_LEGACY_PRECISION=true",  # TODO(chromium:1299047)
      "SWIFTSHADER_ZERO_INITIALIZE_DEVICE_MEMORY",
   ]
 }
diff --git a/src/Vulkan/VkConfig.hpp b/src/Vulkan/VkConfig.hpp
index ca2f5ce..4e2c023 100644
--- a/src/Vulkan/VkConfig.hpp
+++ b/src/Vulkan/VkConfig.hpp
@@ -20,6 +20,10 @@
 #include "Vulkan/VulkanPlatform.hpp"
 #include "spirv-tools/libspirv.h"
 
+#ifndef SWIFTSHADER_LEGACY_PRECISION  // falls back to false when the build does not define the macro
+#	define SWIFTSHADER_LEGACY_PRECISION false
+#endif
+
 namespace vk {
 
 // Note: Constant array initialization requires a string literal.
@@ -90,7 +94,7 @@
 
 constexpr int MAX_SAMPLER_ALLOCATION_COUNT = 4000;
 
-constexpr int SUBPIXEL_PRECISION_BITS = 8;
+constexpr int SUBPIXEL_PRECISION_BITS = SWIFTSHADER_LEGACY_PRECISION ? 4 : 8;  // 4 bits when legacy precision is enabled, 8 otherwise
 constexpr float SUBPIXEL_PRECISION_FACTOR = static_cast<float>(1 << SUBPIXEL_PRECISION_BITS);
 constexpr int SUBPIXEL_PRECISION_MASK = 0xFFFFFFFF >> (32 - SUBPIXEL_PRECISION_BITS);