Implement support for regular scaled buffer formats

`SCALED` formats are essentially the same as `INT` formats, but
converted to floating-point. Thus we can trivially support all the ones
that already had a corresponding supported `INT` format. Concretely,
this change adds support for 1-, 2-, and 4-element formats with 8-bit
and 16-bit components.

Bug: b/201174375
Tests: dEQP-VK.*scaled*
Change-Id: I459fd4ba6fa159972f86d54c7474141c1e2a64eb
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51470
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Sean Risser <srisser@google.com>
diff --git a/src/Pipeline/VertexRoutine.cpp b/src/Pipeline/VertexRoutine.cpp
index aff1bc4..a1962ab 100644
--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -286,6 +286,28 @@
 		if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
 		if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleSByte)), Float4(-1.0f));
 		break;
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_R8G8_USCALED:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
+		v.x = Float4(*Pointer<Byte4>(source0));
+		v.y = Float4(*Pointer<Byte4>(source1));
+		v.z = Float4(*Pointer<Byte4>(source2));
+		v.w = Float4(*Pointer<Byte4>(source3));
+
+		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
+		break;
+	case VK_FORMAT_R8_SSCALED:
+	case VK_FORMAT_R8G8_SSCALED:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
+	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
+		v.x = Float4(*Pointer<SByte4>(source0));
+		v.y = Float4(*Pointer<SByte4>(source1));
+		v.z = Float4(*Pointer<SByte4>(source2));
+		v.w = Float4(*Pointer<SByte4>(source3));
+
+		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
+		break;
 	case VK_FORMAT_R8_SINT:
 	case VK_FORMAT_R8G8_SINT:
 	case VK_FORMAT_R8G8B8A8_SINT:
@@ -297,31 +319,6 @@
 
 		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
 		break;
-	case VK_FORMAT_R16_SNORM:
-	case VK_FORMAT_R16G16_SNORM:
-	case VK_FORMAT_R16G16B16A16_SNORM:
-		v.x = Float4(*Pointer<Short4>(source0));
-		v.y = Float4(*Pointer<Short4>(source1));
-		v.z = Float4(*Pointer<Short4>(source2));
-		v.w = Float4(*Pointer<Short4>(source3));
-
-		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
-
-		if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
-		if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
-		if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
-		if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
-		break;
-	case VK_FORMAT_R16_SINT:
-	case VK_FORMAT_R16G16_SINT:
-	case VK_FORMAT_R16G16B16A16_SINT:
-		v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
-		v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
-		v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
-		v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
-
-		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
-		break;
 	case VK_FORMAT_R16_UNORM:
 	case VK_FORMAT_R16G16_UNORM:
 	case VK_FORMAT_R16G16B16A16_UNORM:
@@ -337,6 +334,51 @@
 		if(componentCount >= 3) v.z *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
 		if(componentCount >= 4) v.w *= *Pointer<Float4>(constants + OFFSET(Constants, unscaleUShort));
 		break;
+	case VK_FORMAT_R16_SNORM:
+	case VK_FORMAT_R16G16_SNORM:
+	case VK_FORMAT_R16G16B16A16_SNORM:
+		v.x = Float4(*Pointer<Short4>(source0));
+		v.y = Float4(*Pointer<Short4>(source1));
+		v.z = Float4(*Pointer<Short4>(source2));
+		v.w = Float4(*Pointer<Short4>(source3));
+
+		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
+
+		if(componentCount >= 1) v.x = Max(v.x * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
+		if(componentCount >= 2) v.y = Max(v.y * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
+		if(componentCount >= 3) v.z = Max(v.z * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
+		if(componentCount >= 4) v.w = Max(v.w * *Pointer<Float4>(constants + OFFSET(Constants, unscaleShort)), Float4(-1.0f));
+		break;
+	case VK_FORMAT_R16_USCALED:
+	case VK_FORMAT_R16G16_USCALED:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+		v.x = Float4(*Pointer<UShort4>(source0));
+		v.y = Float4(*Pointer<UShort4>(source1));
+		v.z = Float4(*Pointer<UShort4>(source2));
+		v.w = Float4(*Pointer<UShort4>(source3));
+
+		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
+		break;
+	case VK_FORMAT_R16_SSCALED:
+	case VK_FORMAT_R16G16_SSCALED:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
+		v.x = Float4(*Pointer<Short4>(source0));
+		v.y = Float4(*Pointer<Short4>(source1));
+		v.z = Float4(*Pointer<Short4>(source2));
+		v.w = Float4(*Pointer<Short4>(source3));
+
+		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
+		break;
+	case VK_FORMAT_R16_SINT:
+	case VK_FORMAT_R16G16_SINT:
+	case VK_FORMAT_R16G16B16A16_SINT:
+		v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
+		v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
+		v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
+		v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));
+
+		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
+		break;
 	case VK_FORMAT_R16_UINT:
 	case VK_FORMAT_R16G16_UINT:
 	case VK_FORMAT_R16G16B16A16_UINT:
diff --git a/src/Vulkan/VkPhysicalDevice.cpp b/src/Vulkan/VkPhysicalDevice.cpp
index c553eec..9437356 100644
--- a/src/Vulkan/VkPhysicalDevice.cpp
+++ b/src/Vulkan/VkPhysicalDevice.cpp
@@ -1398,19 +1398,27 @@
 	{
 	case VK_FORMAT_R8_UNORM:
 	case VK_FORMAT_R8_SNORM:
+	case VK_FORMAT_R8_USCALED:
+	case VK_FORMAT_R8_SSCALED:
 	case VK_FORMAT_R8_UINT:
 	case VK_FORMAT_R8_SINT:
 	case VK_FORMAT_R8G8_UNORM:
 	case VK_FORMAT_R8G8_SNORM:
+	case VK_FORMAT_R8G8_USCALED:
+	case VK_FORMAT_R8G8_SSCALED:
 	case VK_FORMAT_R8G8_UINT:
 	case VK_FORMAT_R8G8_SINT:
 	case VK_FORMAT_R8G8B8A8_UNORM:
 	case VK_FORMAT_R8G8B8A8_SNORM:
+	case VK_FORMAT_R8G8B8A8_USCALED:
+	case VK_FORMAT_R8G8B8A8_SSCALED:
 	case VK_FORMAT_R8G8B8A8_UINT:
 	case VK_FORMAT_R8G8B8A8_SINT:
 	case VK_FORMAT_B8G8R8A8_UNORM:
 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
 	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
+	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
+	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
@@ -1423,16 +1431,22 @@
 	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
 	case VK_FORMAT_R16_UNORM:
 	case VK_FORMAT_R16_SNORM:
+	case VK_FORMAT_R16_USCALED:
+	case VK_FORMAT_R16_SSCALED:
 	case VK_FORMAT_R16_UINT:
 	case VK_FORMAT_R16_SINT:
 	case VK_FORMAT_R16_SFLOAT:
 	case VK_FORMAT_R16G16_UNORM:
 	case VK_FORMAT_R16G16_SNORM:
+	case VK_FORMAT_R16G16_USCALED:
+	case VK_FORMAT_R16G16_SSCALED:
 	case VK_FORMAT_R16G16_UINT:
 	case VK_FORMAT_R16G16_SINT:
 	case VK_FORMAT_R16G16_SFLOAT:
 	case VK_FORMAT_R16G16B16A16_UNORM:
 	case VK_FORMAT_R16G16B16A16_SNORM:
+	case VK_FORMAT_R16G16B16A16_USCALED:
+	case VK_FORMAT_R16G16B16A16_SSCALED:
 	case VK_FORMAT_R16G16B16A16_UINT:
 	case VK_FORMAT_R16G16B16A16_SINT:
 	case VK_FORMAT_R16G16B16A16_SFLOAT: