Fix some more UBSan errors in swiftshader

VkPipelineLayout.cpp had a memcpy call that could be passed NULL, 0.
Switch to std::copy which avoids this and also a manual sizeof
multiplication.

VkStructConversion.cpp is a bit more fun. It manually lays out a bunch
of arrays in a single structure. Almost all fields in the array provide
8-byte alignment, except for VkPipelineStageFlags. If there are an odd
number of VkPipelineStageFlags, subsequent fields become misaligned.
This is undefined not only by C/C++ (it is forbidden to cast unaligned
pointers), but also Vulkan itself if I'm understanding the spec right:
https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#fundamentals-validusage-pointers

(Although Vulkan saying it is somewhat redundant because merely having
those pointer types in the type signature means you must keep them
aligned by C/C++'s rules.)

Fix this by rounding all the array sizes up to a multiple of 8. This is
a no-op for everything but the VkPipelineStageFlags fields, but I
applied it to all fields so it is more obviously correct. The compiler
is smart enough to optimize this out per [0].

[0] https://godbolt.org/z/nfM3Y6xfo

Bug: chromium:1394755
Change-Id: I950869e4fed969351ac1aefddd72a898b43cfdc1
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/72628
Tested-by: Shahbaz Youssefi <syoussefi@google.com>
Presubmit-Ready: Shahbaz Youssefi <syoussefi@google.com>
Commit-Queue: Shahbaz Youssefi <syoussefi@google.com>
Reviewed-by: Shahbaz Youssefi <syoussefi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
diff --git a/src/Vulkan/VkPipelineLayout.cpp b/src/Vulkan/VkPipelineLayout.cpp
index f6d1917..f972496 100644
--- a/src/Vulkan/VkPipelineLayout.cpp
+++ b/src/Vulkan/VkPipelineLayout.cpp
@@ -14,8 +14,8 @@
 
 #include "VkPipelineLayout.hpp"
 
+#include <algorithm>
 #include <atomic>
-#include <cstring>
 
 namespace vk {
 
@@ -57,9 +57,8 @@
 		}
 	}
 
-	size_t pushConstantRangesSize = pCreateInfo->pushConstantRangeCount * sizeof(VkPushConstantRange);
 	pushConstantRanges = reinterpret_cast<VkPushConstantRange *>(bindingStorage);
-	memcpy(pushConstantRanges, pCreateInfo->pPushConstantRanges, pushConstantRangesSize);
+	std::copy(pCreateInfo->pPushConstantRanges, pCreateInfo->pPushConstantRanges + pCreateInfo->pushConstantRangeCount, pushConstantRanges);
 
 	incRefCount();
 }
diff --git a/src/Vulkan/VkStructConversion.hpp b/src/Vulkan/VkStructConversion.hpp
index 94563cb..7c970e3 100644
--- a/src/Vulkan/VkStructConversion.hpp
+++ b/src/Vulkan/VkStructConversion.hpp
@@ -17,6 +17,9 @@
 
 #include "VkMemory.hpp"
 #include "VkStringify.hpp"
+
+#include "System/Debug.hpp"
+
 #include <cstring>
 #include <vector>
 
@@ -362,13 +365,13 @@
 	static SubmitInfo *Allocate(uint32_t submitCount, const VkSubmitInfo *pSubmits)
 	{
 		size_t submitSize = sizeof(SubmitInfo) * submitCount;
-		size_t totalSize = submitSize;
+		size_t totalSize = Align8(submitSize);
 		for(uint32_t i = 0; i < submitCount; i++)
 		{
-			totalSize += pSubmits[i].waitSemaphoreCount * sizeof(VkSemaphore);
-			totalSize += pSubmits[i].waitSemaphoreCount * sizeof(VkPipelineStageFlags);
-			totalSize += pSubmits[i].signalSemaphoreCount * sizeof(VkSemaphore);
-			totalSize += pSubmits[i].commandBufferCount * sizeof(VkCommandBuffer);
+			totalSize += Align8(pSubmits[i].waitSemaphoreCount * sizeof(VkSemaphore));
+			totalSize += Align8(pSubmits[i].waitSemaphoreCount * sizeof(VkPipelineStageFlags));
+			totalSize += Align8(pSubmits[i].signalSemaphoreCount * sizeof(VkSemaphore));
+			totalSize += Align8(pSubmits[i].commandBufferCount * sizeof(VkCommandBuffer));
 
 			for(const auto *extension = reinterpret_cast<const VkBaseInStructure *>(pSubmits[i].pNext);
 			    extension != nullptr; extension = reinterpret_cast<const VkBaseInStructure *>(extension->pNext))
@@ -378,8 +381,8 @@
 				case VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO:
 					{
 						const auto *tlsSubmitInfo = reinterpret_cast<const VkTimelineSemaphoreSubmitInfo *>(extension);
-						totalSize += tlsSubmitInfo->waitSemaphoreValueCount * sizeof(uint64_t);
-						totalSize += tlsSubmitInfo->signalSemaphoreValueCount * sizeof(uint64_t);
+						totalSize += Align8(tlsSubmitInfo->waitSemaphoreValueCount * sizeof(uint64_t));
+						totalSize += Align8(tlsSubmitInfo->signalSemaphoreValueCount * sizeof(uint64_t));
 					}
 					break;
 				case VK_STRUCTURE_TYPE_DEVICE_GROUP_SUBMIT_INFO:
@@ -396,11 +399,12 @@
 			}
 		}
 
-		uint8_t *mem = static_cast<uint8_t *>(
+		uint8_t *buffer = static_cast<uint8_t *>(
 		    vk::allocateHostMemory(totalSize, vk::HOST_MEMORY_ALLOCATION_ALIGNMENT, vk::NULL_ALLOCATION_CALLBACKS, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT));
+		uint8_t *mem = buffer;
 
 		auto submits = new(mem) SubmitInfo[submitCount];
-		mem += submitSize;
+		mem += Align8(submitSize);
 
 		for(uint32_t i = 0; i < submitCount; i++)
 		{
@@ -418,12 +422,12 @@
 				size_t size = pSubmits[i].waitSemaphoreCount * sizeof(VkSemaphore);
 				submits[i].pWaitSemaphores = reinterpret_cast<VkSemaphore *>(mem);
 				memcpy(mem, pSubmits[i].pWaitSemaphores, size);
-				mem += size;
+				mem += Align8(size);
 
 				size = pSubmits[i].waitSemaphoreCount * sizeof(VkPipelineStageFlags);
 				submits[i].pWaitDstStageMask = reinterpret_cast<VkPipelineStageFlags *>(mem);
 				memcpy(mem, pSubmits[i].pWaitDstStageMask, size);
-				mem += size;
+				mem += Align8(size);
 			}
 
 			if(pSubmits[i].signalSemaphoreCount > 0)
@@ -431,7 +435,7 @@
 				size_t size = pSubmits[i].signalSemaphoreCount * sizeof(VkSemaphore);
 				submits[i].pSignalSemaphores = reinterpret_cast<VkSemaphore *>(mem);
 				memcpy(mem, pSubmits[i].pSignalSemaphores, size);
-				mem += size;
+				mem += Align8(size);
 			}
 
 			if(pSubmits[i].commandBufferCount > 0)
@@ -439,7 +443,7 @@
 				size_t size = pSubmits[i].commandBufferCount * sizeof(VkCommandBuffer);
 				submits[i].pCommandBuffers = reinterpret_cast<VkCommandBuffer *>(mem);
 				memcpy(mem, pSubmits[i].pCommandBuffers, size);
-				mem += size;
+				mem += Align8(size);
 			}
 
 			submits[i].waitSemaphoreValueCount = 0;
@@ -462,7 +466,7 @@
 							size_t size = tlsSubmitInfo->waitSemaphoreValueCount * sizeof(uint64_t);
 							submits[i].pWaitSemaphoreValues = reinterpret_cast<uint64_t *>(mem);
 							memcpy(mem, tlsSubmitInfo->pWaitSemaphoreValues, size);
-							mem += size;
+							mem += Align8(size);
 						}
 
 						if(tlsSubmitInfo->signalSemaphoreValueCount > 0)
@@ -471,7 +475,7 @@
 							size_t size = tlsSubmitInfo->signalSemaphoreValueCount * sizeof(uint64_t);
 							submits[i].pSignalSemaphoreValues = reinterpret_cast<uint64_t *>(mem);
 							memcpy(mem, tlsSubmitInfo->pSignalSemaphoreValues, size);
-							mem += size;
+							mem += Align8(size);
 						}
 					}
 					break;
@@ -489,21 +493,22 @@
 			}
 		}
 
+		ASSERT(static_cast<size_t>(mem - buffer) == totalSize);
 		return submits;
 	}
 
 	static SubmitInfo *Allocate(uint32_t submitCount, const VkSubmitInfo2 *pSubmits)
 	{
 		size_t submitSize = sizeof(SubmitInfo) * submitCount;
-		size_t totalSize = submitSize;
+		size_t totalSize = Align8(submitSize);
 		for(uint32_t i = 0; i < submitCount; i++)
 		{
-			totalSize += pSubmits[i].waitSemaphoreInfoCount * sizeof(VkSemaphore);
-			totalSize += pSubmits[i].waitSemaphoreInfoCount * sizeof(VkPipelineStageFlags);
-			totalSize += pSubmits[i].waitSemaphoreInfoCount * sizeof(uint64_t);
-			totalSize += pSubmits[i].signalSemaphoreInfoCount * sizeof(VkSemaphore);
-			totalSize += pSubmits[i].signalSemaphoreInfoCount * sizeof(uint64_t);
-			totalSize += pSubmits[i].commandBufferInfoCount * sizeof(VkCommandBuffer);
+			totalSize += Align8(pSubmits[i].waitSemaphoreInfoCount * sizeof(VkSemaphore));
+			totalSize += Align8(pSubmits[i].waitSemaphoreInfoCount * sizeof(VkPipelineStageFlags));
+			totalSize += Align8(pSubmits[i].waitSemaphoreInfoCount * sizeof(uint64_t));
+			totalSize += Align8(pSubmits[i].signalSemaphoreInfoCount * sizeof(VkSemaphore));
+			totalSize += Align8(pSubmits[i].signalSemaphoreInfoCount * sizeof(uint64_t));
+			totalSize += Align8(pSubmits[i].commandBufferInfoCount * sizeof(VkCommandBuffer));
 
 			for(const auto *extension = reinterpret_cast<const VkBaseInStructure *>(pSubmits[i].pNext);
 			    extension != nullptr; extension = reinterpret_cast<const VkBaseInStructure *>(extension->pNext))
@@ -523,11 +528,12 @@
 			}
 		}
 
-		uint8_t *mem = static_cast<uint8_t *>(
+		uint8_t *buffer = static_cast<uint8_t *>(
 		    vk::allocateHostMemory(totalSize, vk::HOST_MEMORY_ALLOCATION_ALIGNMENT, vk::NULL_ALLOCATION_CALLBACKS, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT));
+		uint8_t *mem = buffer;
 
 		auto submits = new(mem) SubmitInfo[submitCount];
-		mem += submitSize;
+		mem += Align8(submitSize);
 
 		for(uint32_t i = 0; i < submitCount; i++)
 		{
@@ -549,15 +555,15 @@
 			{
 				size_t size = submits[i].waitSemaphoreCount * sizeof(VkSemaphore);
 				submits[i].pWaitSemaphores = reinterpret_cast<VkSemaphore *>(mem);
-				mem += size;
+				mem += Align8(size);
 
 				size = submits[i].waitSemaphoreCount * sizeof(VkPipelineStageFlags);
 				submits[i].pWaitDstStageMask = reinterpret_cast<VkPipelineStageFlags *>(mem);
-				mem += size;
+				mem += Align8(size);
 
 				size = submits[i].waitSemaphoreCount * sizeof(uint64_t);
 				submits[i].pWaitSemaphoreValues = reinterpret_cast<uint64_t *>(mem);
-				mem += size;
+				mem += Align8(size);
 
 				for(uint32_t j = 0; j < submits[i].waitSemaphoreCount; j++)
 				{
@@ -571,11 +577,11 @@
 			{
 				size_t size = submits[i].signalSemaphoreCount * sizeof(VkSemaphore);
 				submits[i].pSignalSemaphores = reinterpret_cast<VkSemaphore *>(mem);
-				mem += size;
+				mem += Align8(size);
 
 				size = submits[i].signalSemaphoreCount * sizeof(uint64_t);
 				submits[i].pSignalSemaphoreValues = reinterpret_cast<uint64_t *>(mem);
-				mem += size;
+				mem += Align8(size);
 
 				for(uint32_t j = 0; j < submits[i].signalSemaphoreCount; j++)
 				{
@@ -588,7 +594,7 @@
 			{
 				size_t size = submits[i].commandBufferCount * sizeof(VkCommandBuffer);
 				submits[i].pCommandBuffers = reinterpret_cast<VkCommandBuffer *>(mem);
-				mem += size;
+				mem += Align8(size);
 
 				for(uint32_t j = 0; j < submits[i].commandBufferCount; j++)
 				{
@@ -597,6 +603,7 @@
 			}
 		}
 
+		ASSERT(static_cast<size_t>(mem - buffer) == totalSize);
 		return submits;
 	}
 
@@ -616,8 +623,17 @@
 	uint64_t *pWaitSemaphoreValues;
 	uint32_t signalSemaphoreValueCount;
 	uint64_t *pSignalSemaphoreValues;
+
+private:
+	static size_t Align8(size_t size)
+	{
+		// Keep all arrays 8-byte aligned, so that an odd number of `VkPipelineStageFlags` does not break the
+		// alignment of the other fields.
+		constexpr size_t align = 8;
+		return (size + align - 1) & ~(align - 1);
+	}
 };
 
 }  // namespace vk
 
-#endif  // VK_STRUCT_CONVERSION_HPP_
\ No newline at end of file
+#endif  // VK_STRUCT_CONVERSION_HPP_