third_party/marl/src/thread.cpp - SwiftShader - Git at Google

 // Copyright 2019 The Marl Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "marl/thread.h"

 #include "marl/debug.h"
 #include "marl/defer.h"
 #include "marl/trace.h"

 #include <algorithm>  // std::sort

 #include <cstdarg>
 #include <cstdio>

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN 1
 #include <windows.h>
 #include <array>
 #include <cstdlib>  // mbstowcs
 #include <limits>   // std::numeric_limits
 #include <vector>
 #undef max
 #elif defined(__APPLE__)
 #include <mach/thread_act.h>
 #include <pthread.h>
 #include <unistd.h>
 #include <thread>
 #elif defined(__FreeBSD__)
 #include <pthread.h>
 #include <pthread_np.h>
 #include <unistd.h>
 #include <thread>
 #else
 #include <pthread.h>
 #include <unistd.h>
 #include <thread>
 #endif

 namespace {

 // Hash functor for marl::Thread::Core, used as the hasher of the
 // unordered_set instances built in this file.
 struct CoreHasher {
   // Returns the pthread.index field of |c| widened to 64 bits.
   inline uint64_t operator()(const marl::Thread::Core& c) const {
     const uint64_t hash = c.pthread.index;
     return hash;
   }
 };

 }  // anonymous namespace

 namespace marl {

 #if defined(_WIN32)
 // Number of distinct values representable by Thread::Core::windows.index.
 static constexpr size_t MaxCoreCount =
     std::numeric_limits<decltype(Thread::Core::windows.index)>::max() + 1ULL;
 // Number of distinct values representable by Thread::Core::windows.group.
 static constexpr size_t MaxGroupCount =
     std::numeric_limits<decltype(Thread::Core::windows.group)>::max() + 1ULL;
 // Every bit position of a KAFFINITY mask must be expressible as a core index.
 static_assert(sizeof(KAFFINITY) * 8ULL <= MaxCoreCount,
               "Thread::Core::windows.index is too small");

 namespace {
 // CHECK_WIN32 evaluates expr once and raises a MARL_ASSERT if the result is
 // not equal to TRUE. The assertion message contains the stringified
 // expression and the value of GetLastError().
 #define CHECK_WIN32(expr)                                    \
   do {                                                       \
     auto res = expr;                                         \
     (void)res;                                               \
     MARL_ASSERT(res == TRUE, #expr " failed with error: %d", \
                 (int)GetLastError());                        \
   } while (false)

 // Description of a single Windows processor group, as filled in by
 // getProcessorGroups().
 struct ProcessorGroup {
   unsigned int count;  // number of logical processors in this group.
   KAFFINITY affinity;  // affinity mask.
 };

 // Fixed-capacity table of processor groups. Only the first |count| entries
 // of |groups| are populated.
 struct ProcessorGroups {
   // Storage for up to MaxGroupCount groups.
   std::array<ProcessorGroup, MaxGroupCount> groups;
   // Number of valid entries at the front of |groups|.
   size_t count;
 };

 // Returns the processor-group table for this system. The table is built on
 // the first call by querying GetLogicalProcessorInformationEx(RelationGroup)
 // and is kept in a function-local static for all later calls.
 const ProcessorGroups& getProcessorGroups() {
   static ProcessorGroups groups = [] {
     ProcessorGroups out = {};
     SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info[32] = {};
     DWORD size = sizeof(info);
     CHECK_WIN32(GetLogicalProcessorInformationEx(RelationGroup, info, &size));
     // NOTE(review): the API documents its output as variable-sized records;
     // this loop indexes them with a fixed stride - confirm that is adequate
     // for the RelationGroup query.
     DWORD count = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX);
     for (DWORD i = 0; i < count; i++) {
       if (info[i].Relationship != RelationGroup) {
         continue;
       }
       auto groupCount = info[i].Group.ActiveGroupCount;
       for (WORD groupIdx = 0; groupIdx < groupCount; groupIdx++) {
         // Validate the destination slot *before* writing to it, so that an
         // unexpectedly large group count can never write past the array.
         MARL_ASSERT(out.count < MaxGroupCount, "Group index overflow");
         if (out.count >= MaxGroupCount) {
           // Assertions may be compiled out; stop instead of overflowing.
           return out;
         }
         auto const& groupInfo = info[i].Group.GroupInfo[groupIdx];
         out.groups[out.count++] = ProcessorGroup{
             groupInfo.ActiveProcessorCount, groupInfo.ActiveProcessorMask};
       }
     }
     return out;
   }();
   return groups;
 }
 }  // namespace
 #endif  // defined(_WIN32)

 ////////////////////////////////////////////////////////////////////////////////
 // Thread::Affinity
 ////////////////////////////////////////////////////////////////////////////////

 // Constructs an affinity whose core list is constructed with |allocator|.
 Thread::Affinity::Affinity(Allocator* allocator) : cores(allocator) {}
 // Move constructor: the core list is move-constructed from |other|'s list.
 Thread::Affinity::Affinity(Affinity&& other) : cores(std::move(other.cores)) {}
 // Constructs the core list from |other|'s core list together with
 // |allocator|.
 Thread::Affinity::Affinity(const Affinity& other, Allocator* allocator)
     : cores(other.cores, allocator) {}

 // Constructs an affinity holding the cores of |list|, in list order. The
 // core list is constructed with |allocator| and sized up-front for the
 // number of entries in |list|.
 Thread::Affinity::Affinity(std::initializer_list<Core> list,
                            Allocator* allocator)
     : cores(allocator) {
   cores.reserve(list.size());
   for (auto it = list.begin(); it != list.end(); ++it) {
     cores.push_back(*it);
   }
 }

 // Returns an Affinity listing the cores available for use. The core list is
 // constructed with |allocator|.
 //  - Windows: for each processor group reported by getProcessorGroups(),
 //    every core index below the group's processor count whose bit is set in
 //    the group's affinity mask.
 //  - Linux (non-Android) and FreeBSD: every CPU that is set in the calling
 //    thread's affinity mask. The list is left empty if the mask cannot be
 //    queried.
 //  - Other platforms: only compiles when Affinity::supported is false, and
 //    returns an empty Affinity.
 Thread::Affinity Thread::Affinity::all(
     Allocator* allocator /* = Allocator::Default */) {
   Thread::Affinity affinity(allocator);

 #if defined(_WIN32)
   const auto& groups = getProcessorGroups();
   for (size_t groupIdx = 0; groupIdx < groups.count; groupIdx++) {
     const auto& group = groups.groups[groupIdx];
     Core core;
     core.windows.group = static_cast<decltype(Core::windows.group)>(groupIdx);
     for (unsigned int coreIdx = 0; coreIdx < group.count; coreIdx++) {
       if ((group.affinity >> coreIdx) & 1) {
         core.windows.index = static_cast<decltype(core.windows.index)>(coreIdx);
         affinity.cores.emplace_back(std::move(core));
       }
     }
   }
 #elif defined(__linux__) && !defined(__ANDROID__)
   auto thread = pthread_self();
   cpu_set_t cpuset;
   CPU_ZERO(&cpuset);
   if (pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset) == 0) {
     // Report the indices of the CPUs that are actually set in the mask.
     // Counting the set bits and emitting 0..count-1 would name the wrong
     // CPUs whenever the mask is sparse (for example CPUs 2 and 3 only).
     for (int i = 0; i < CPU_SETSIZE; i++) {
       if (CPU_ISSET(i, &cpuset)) {
         Core core;
         core.pthread.index = static_cast<uint16_t>(i);
         affinity.cores.emplace_back(std::move(core));
       }
     }
   }
 #elif defined(__FreeBSD__)
   auto thread = pthread_self();
   cpuset_t cpuset;
   CPU_ZERO(&cpuset);
   if (pthread_getaffinity_np(thread, sizeof(cpuset_t), &cpuset) == 0) {
     // As on Linux: enumerate the set bits rather than assuming the mask is
     // a contiguous run starting at CPU 0.
     for (int i = 0; i < CPU_SETSIZE; i++) {
       if (CPU_ISSET(i, &cpuset)) {
         Core core;
         core.pthread.index = static_cast<uint16_t>(i);
         affinity.cores.emplace_back(std::move(core));
       }
     }
   }
 #else
   static_assert(!supported,
                 "marl::Thread::Affinity::supported is true, but "
                 "Thread::Affinity::all() is not implemented for this platform");
 #endif

   return affinity;
 }

 // Creates a policy that owns |affinity| and hands out affinities derived
 // from it. The policy object itself is created through |allocator|.
 //
 // On Windows, get() picks the processor group of the core at index
 // (threadId % core count) and returns only the cores belonging to that
 // group. On every other platform, and whenever the affinity is empty, get()
 // returns a copy of the whole affinity.
 std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::anyOf(
     Affinity&& affinity,
     Allocator* allocator /* = Allocator::Default */) {
   struct AnyOfPolicy : public Thread::Affinity::Policy {
     Affinity affinity;
     AnyOfPolicy(Affinity&& source) : affinity(std::move(source)) {}

     Affinity get(uint32_t threadId, Allocator* allocator) const override {
 #if defined(_WIN32)
       const auto total = affinity.count();
       if (total != 0) {
         // All cores handed to one thread must share a processor group.
         const auto wanted = affinity[threadId % total].windows.group;
         Affinity selected(allocator);
         selected.cores.reserve(total);
         for (const auto& candidate : affinity.cores) {
           if (candidate.windows.group == wanted) {
             selected.cores.push_back(candidate);
           }
         }
         return selected;
       }
 #endif
       return Affinity(affinity, allocator);
     }
   };

   return allocator->make_shared<AnyOfPolicy>(std::move(affinity));
 }

 // Creates a policy that owns |affinity| and assigns each thread exactly one
 // of its cores. The policy object itself is created through |allocator|.
 //
 // get() returns an affinity holding only the core at index
 // (threadId % core count). If the owned affinity is empty, get() returns a
 // copy of that (empty) affinity instead.
 std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::oneOf(
     Affinity&& affinity,
     Allocator* allocator /* = Allocator::Default */) {
   struct OneOfPolicy : public Thread::Affinity::Policy {
     Affinity affinity;
     OneOfPolicy(Affinity&& source) : affinity(std::move(source)) {}

     Affinity get(uint32_t threadId, Allocator* allocator) const override {
       const auto total = affinity.count();
       if (total > 0) {
         const auto chosen = affinity[threadId % total];
         return Affinity({chosen}, allocator);
       }
       return Affinity(affinity, allocator);
     }
   };

   return allocator->make_shared<OneOfPolicy>(std::move(affinity));
 }

 // Returns the number of cores held by this affinity.
 size_t Thread::Affinity::count() const {
   return cores.size();
 }

 // Returns a copy of the core stored at position |index|. No bounds check is
 // performed here.
 Thread::Core Thread::Affinity::operator[](size_t index) const {
   return cores[index];
 }

 // Appends to this affinity every core of |other| that was not already held
 // by this affinity when the call started, then sorts the core list.
 // Returns *this to allow chaining.
 Thread::Affinity& Thread::Affinity::add(const Thread::Affinity& other) {
   // Snapshot of the cores held before anything is appended.
   containers::unordered_set<Core, CoreHasher> present(cores.allocator);
   for (const auto& existing : cores) {
     present.emplace(existing);
   }

   for (const auto& candidate : other.cores) {
     if (present.count(candidate) != 0) {
       continue;
     }
     cores.push_back(candidate);
   }

   std::sort(cores.begin(), cores.end());
   return *this;
 }

 // Removes from this affinity every core that is also present in |other|,
 // then sorts the remaining cores. Returns *this to allow chaining.
 Thread::Affinity& Thread::Affinity::remove(const Thread::Affinity& other) {
   containers::unordered_set<Core, CoreHasher> set(cores.allocator);
   for (auto core : other.cores) {
     set.emplace(core);
   }
   // Unordered removal: overwrite the doomed slot with the last element and
   // shrink by one. The element moved into slot i has not been examined yet,
   // so i only advances when nothing was removed.
   for (size_t i = 0; i < cores.size();) {
     if (set.count(cores[i]) != 0) {
       cores[i] = cores.back();
       cores.resize(cores.size() - 1);
     } else {
       i++;
     }
   }
   std::sort(cores.begin(), cores.end());
   return *this;
 }

 #if defined(_WIN32)

 // Windows implementation state for a Thread: the function to run and the
 // handle of the OS thread.
 class Thread::Impl {
  public:
   // Stores |func|. |handle| is not initialized here; the Thread constructor
   // assigns it after creating the OS thread.
   Impl(Func&& func) : func(std::move(func)) {}
   // Thread entry point. |self| is the Impl* supplied at thread creation;
   // runs its function and returns exit code 0.
   static DWORD WINAPI run(void* self) {
     reinterpret_cast<Impl*>(self)->func();
     return 0;
   }

   // Function executed by run().
   Func func;
   // Thread handle, closed by Thread::~Thread().
   HANDLE handle;
 };

 // Creates and starts a new OS thread that runs |func|.
 // If |affinity| is non-empty the thread is created with a group affinity
 // built from its cores; all of those cores must belong to the same
 // processor group. An empty |affinity| applies no affinity attribute.
 Thread::Thread(Affinity&& affinity, Func&& func) {
   // First call only queries the buffer size needed for one attribute.
   SIZE_T size = 0;
   InitializeProcThreadAttributeList(nullptr, 1, 0, &size);
   MARL_ASSERT(size > 0,
               "InitializeProcThreadAttributeList() did not give a size");

   std::vector<uint8_t> buffer(size);
   LPPROC_THREAD_ATTRIBUTE_LIST attributes =
       reinterpret_cast<LPPROC_THREAD_ATTRIBUTE_LIST>(buffer.data());
   CHECK_WIN32(InitializeProcThreadAttributeList(attributes, 1, 0, &size));
   defer(DeleteProcThreadAttributeList(attributes));

   GROUP_AFFINITY groupAffinity = {};

   auto count = affinity.count();
   if (count > 0) {
     groupAffinity.Group = affinity[0].windows.group;
     for (size_t i = 0; i < count; i++) {
       auto core = affinity[i];
       MARL_ASSERT(groupAffinity.Group == core.windows.group,
                   "Cannot create thread that uses multiple affinity groups");
       // A shift by the mask width or more is undefined behaviour and could
       // not be represented in the mask anyway.
       MARL_ASSERT(core.windows.index < sizeof(KAFFINITY) * 8,
                   "Core index %d does not fit in the group affinity mask",
                   (int)core.windows.index);
       groupAffinity.Mask |= (1ULL << core.windows.index);
     }
     CHECK_WIN32(UpdateProcThreadAttribute(
         attributes, 0, PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY, &groupAffinity,
         sizeof(groupAffinity), nullptr, nullptr));
   }

   impl = new Impl(std::move(func));
   impl->handle = CreateRemoteThreadEx(GetCurrentProcess(), nullptr, 0,
                                       &Impl::run, impl, 0, attributes, nullptr);
   // A null handle would otherwise be passed on silently to join() and to
   // the destructor.
   MARL_ASSERT(impl->handle != nullptr,
               "CreateRemoteThreadEx() failed with error: %d",
               (int)GetLastError());
 }

 // Releases the thread handle and the implementation object, if this Thread
 // still owns one (a moved-from Thread has a null impl).
 Thread::~Thread() {
   if (impl) {
     CloseHandle(impl->handle);
     delete impl;
   }
 }

 // Blocks, with no timeout, until the thread's handle becomes signaled.
 // Asserts that this Thread owns an implementation object.
 void Thread::join() {
   MARL_ASSERT(impl != nullptr, "join() called on unjoinable thread");
   WaitForSingleObject(impl->handle, INFINITE);
 }

 // Names the calling thread using a printf-style format string.
 // The formatted name is truncated to fit a 1024-byte buffer. The name is
 // applied through SetThreadDescription, which is looked up at runtime in
 // kernelbase.dll; if that lookup fails this function does nothing at all.
 void Thread::setName(const char* fmt, ...) {
   static auto setThreadDescription =
       reinterpret_cast<HRESULT(WINAPI*)(HANDLE, PCWSTR)>(GetProcAddress(
           GetModuleHandle("kernelbase.dll"), "SetThreadDescription"));
   if (setThreadDescription == nullptr) {
     return;
   }

   char name[1024];
   va_list vararg;
   va_start(vararg, fmt);
   vsnprintf(name, sizeof(name), fmt, vararg);
   va_end(vararg);

   // Zero-filled, and converted with room left for the terminator, so the
   // wide string is always null-terminated.
   wchar_t wname[1024] = {};
   const size_t converted = mbstowcs(wname, name, 1023);
   // mbstowcs reports an invalid multibyte sequence with (size_t)-1; in that
   // case the wide buffer is not a usable string, so skip the OS call.
   if (converted != static_cast<size_t>(-1)) {
     setThreadDescription(GetCurrentThread(), wname);
   }
   MARL_NAME_THREAD("%s", name);
 }

 // Returns the sum of the logical processor counts of all processor groups
 // reported by getProcessorGroups().
 unsigned int Thread::numLogicalCPUs() {
   const auto& table = getProcessorGroups();
   unsigned int total = 0;
   for (size_t i = 0; i != table.count; ++i) {
     total += table.groups[i].count;
   }
   return total;
 }

 #else

 // Non-Windows implementation state for a Thread: the requested affinity,
 // the function to run, and the std::thread that runs it.
 class Thread::Impl {
  public:
   // Declaration order matters: |thread| is declared last so that |affinity|
   // and |func| are constructed before the new thread starts using them.
   Affinity affinity;
   Func func;
   std::thread thread;

   // Takes ownership of the affinity and the function, then immediately
   // starts a thread that applies the affinity and calls the function.
   Impl(Affinity&& a, Thread::Func&& f)
       : affinity(std::move(a)), func(std::move(f)), thread([this] {
           setAffinity();
           func();
         }) {}

   // Applies |affinity| to the calling thread. Does nothing when the
   // affinity holds no cores. The result of pthread_setaffinity_np() is not
   // inspected.
   void setAffinity() {
     const auto numCores = affinity.count();
     if (numCores == 0) {
       return;
     }

 #if defined(__linux__) && !defined(__ANDROID__)
     cpu_set_t mask;
     CPU_ZERO(&mask);
     for (size_t i = 0; i < numCores; i++) {
       CPU_SET(affinity[i].pthread.index, &mask);
     }
     pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask);
 #elif defined(__FreeBSD__)
     cpuset_t mask;
     CPU_ZERO(&mask);
     for (size_t i = 0; i < numCores; i++) {
       CPU_SET(affinity[i].pthread.index, &mask);
     }
     pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), &mask);
 #else
     MARL_ASSERT(!marl::Thread::Affinity::supported,
                 "Attempting to use thread affinity on a unsupported platform");
 #endif
   }
 };

 // Creates the implementation object from |affinity| and |func|; the Impl
 // constructor starts the underlying thread.
 Thread::Thread(Affinity&& affinity, Func&& func)
     : impl(new Thread::Impl(std::move(affinity), std::move(func))) {}

 // Asserts that impl is null at destruction. join() is what deletes and
 // clears impl, so a still-set impl means join() was never called.
 Thread::~Thread() {
   MARL_ASSERT(!impl, "Thread::join() was not called before destruction");
 }

 void Thread::join() {
   impl->thread.join();
   delete impl;
   impl = nullptr;
 }

 // Names the calling thread using a printf-style format string.
 // The formatted name is truncated to fit a 1024-byte buffer, handed to the
 // platform's pthread naming function (skipped on Fuchsia), and finally
 // passed to MARL_NAME_THREAD.
 void Thread::setName(const char* fmt, ...) {
   char formatted[1024];
   va_list args;
   va_start(args, fmt);
   vsnprintf(formatted, sizeof(formatted), fmt, args);
   va_end(args);

 #if defined(__APPLE__)
   // The Apple variant takes only the name.
   pthread_setname_np(formatted);
 #elif defined(__FreeBSD__)
   pthread_set_name_np(pthread_self(), formatted);
 #elif !defined(__Fuchsia__)
   pthread_setname_np(pthread_self(), formatted);
 #endif

   MARL_NAME_THREAD("%s", formatted);
 }

 // Returns the value of sysconf(_SC_NPROCESSORS_ONLN) converted to unsigned.
 // NOTE(review): sysconf() reports errors as -1, which this cast would turn
 // into a very large count - confirm whether callers need a guard.
 unsigned int Thread::numLogicalCPUs() {
   return static_cast<unsigned int>(sysconf(_SC_NPROCESSORS_ONLN));
 }

 #endif  // OS

 // Move constructor: takes over the implementation pointer of |rhs| and
 // leaves |rhs| with a null impl.
 Thread::Thread(Thread&& rhs) : impl(rhs.impl) {
   rhs.impl = nullptr;
 }

 // Move assignment: releases any implementation currently owned by this
 // Thread, takes over the one owned by |rhs|, and leaves |rhs| with a null
 // impl. Assigning a Thread to itself is a no-op.
 Thread& Thread::operator=(Thread&& rhs) {
   // Without this guard a self-move would delete the implementation that is
   // about to be adopted.
   if (this == &rhs) {
     return *this;
   }
   if (impl) {
 #if defined(_WIN32)
     // Mirror the destructor: the thread handle must be closed before the
     // Impl that stores it is deleted, or the handle leaks.
     CloseHandle(impl->handle);
 #endif
     // NOTE(review): on non-Windows builds Impl holds a std::thread, so the
     // old thread presumably has to be join()ed before reaching this point -
     // confirm against callers.
     delete impl;
     impl = nullptr;
   }
   impl = rhs.impl;
   rhs.impl = nullptr;
   return *this;
 }

 }  // namespace marl
	// Copyright 2019 The Marl Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "marl/thread.h"

	#include "marl/debug.h"
	#include "marl/defer.h"
	#include "marl/trace.h"

	#include <algorithm> // std::sort

	#include <cstdarg>
	#include <cstdio>

	#if defined(_WIN32)
	#define WIN32_LEAN_AND_MEAN 1
	#include <windows.h>
	#include <array>
	#include <cstdlib> // mbstowcs
	#include <limits> // std::numeric_limits
	#include <vector>
	#undef max
	#elif defined(__APPLE__)
	#include <mach/thread_act.h>
	#include <pthread.h>
	#include <unistd.h>
	#include <thread>
	#elif defined(__FreeBSD__)
	#include <pthread.h>
	#include <pthread_np.h>
	#include <unistd.h>
	#include <thread>
	#else
	#include <pthread.h>
	#include <unistd.h>
	#include <thread>
	#endif

	namespace {

	// Hash functor for marl::Thread::Core, used as the hasher of the
	// unordered_set instances built in this file.
	struct CoreHasher {
	  // Returns the pthread.index field of |c| widened to 64 bits.
	  inline uint64_t operator()(const marl::Thread::Core& c) const {
	    const uint64_t hash = c.pthread.index;
	    return hash;
	  }
	};

	} // anonymous namespace

	namespace marl {

	#if defined(_WIN32)
	// Number of distinct values representable by Thread::Core::windows.index.
	static constexpr size_t MaxCoreCount =
	std::numeric_limits<decltype(Thread::Core::windows.index)>::max() + 1ULL;
	// Number of distinct values representable by Thread::Core::windows.group.
	static constexpr size_t MaxGroupCount =
	std::numeric_limits<decltype(Thread::Core::windows.group)>::max() + 1ULL;
	// Every bit position of a KAFFINITY mask must be expressible as a core index.
	static_assert(sizeof(KAFFINITY) * 8ULL <= MaxCoreCount,
	"Thread::Core::windows.index is too small");

	namespace {
	// CHECK_WIN32 evaluates expr once and raises a MARL_ASSERT if the result is
	// not equal to TRUE. The assertion message contains the stringified
	// expression and the value of GetLastError().
	#define CHECK_WIN32(expr) \
	do { \
	auto res = expr; \
	(void)res; \
	MARL_ASSERT(res == TRUE, #expr " failed with error: %d", \
	(int)GetLastError()); \
	} while (false)

	// Description of a single Windows processor group, as filled in by
	// getProcessorGroups().
	struct ProcessorGroup {
	unsigned int count; // number of logical processors in this group.
	KAFFINITY affinity; // affinity mask.
	};

	// Fixed-capacity table of processor groups. Only the first |count| entries
	// of |groups| are populated.
	struct ProcessorGroups {
	// Storage for up to MaxGroupCount groups.
	std::array<ProcessorGroup, MaxGroupCount> groups;
	// Number of valid entries at the front of |groups|.
	size_t count;
	};

	// Returns the processor-group table for this system. The table is built on
	// the first call by querying GetLogicalProcessorInformationEx(RelationGroup)
	// and is kept in a function-local static for all later calls.
	const ProcessorGroups& getProcessorGroups() {
	  static ProcessorGroups groups = [] {
	    ProcessorGroups out = {};
	    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info[32] = {};
	    DWORD size = sizeof(info);
	    CHECK_WIN32(GetLogicalProcessorInformationEx(RelationGroup, info, &size));
	    // NOTE(review): the API documents its output as variable-sized records;
	    // this loop indexes them with a fixed stride - confirm that is adequate
	    // for the RelationGroup query.
	    DWORD count = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX);
	    for (DWORD i = 0; i < count; i++) {
	      if (info[i].Relationship != RelationGroup) {
	        continue;
	      }
	      auto groupCount = info[i].Group.ActiveGroupCount;
	      for (WORD groupIdx = 0; groupIdx < groupCount; groupIdx++) {
	        // Validate the destination slot *before* writing to it, so that an
	        // unexpectedly large group count can never write past the array.
	        MARL_ASSERT(out.count < MaxGroupCount, "Group index overflow");
	        if (out.count >= MaxGroupCount) {
	          // Assertions may be compiled out; stop instead of overflowing.
	          return out;
	        }
	        auto const& groupInfo = info[i].Group.GroupInfo[groupIdx];
	        out.groups[out.count++] = ProcessorGroup{
	            groupInfo.ActiveProcessorCount, groupInfo.ActiveProcessorMask};
	      }
	    }
	    return out;
	  }();
	  return groups;
	}
	} // namespace
	#endif // defined(_WIN32)

	////////////////////////////////////////////////////////////////////////////////
	// Thread::Affinity
	////////////////////////////////////////////////////////////////////////////////

	// Constructs an affinity whose core list is constructed with |allocator|.
	Thread::Affinity::Affinity(Allocator* allocator) : cores(allocator) {}
	// Move constructor: the core list is move-constructed from |other|'s list.
	Thread::Affinity::Affinity(Affinity&& other) : cores(std::move(other.cores)) {}
	// Constructs the core list from |other|'s core list together with
	// |allocator|.
	Thread::Affinity::Affinity(const Affinity& other, Allocator* allocator)
	: cores(other.cores, allocator) {}

	// Constructs an affinity holding the cores of |list|, in list order. The
	// core list is constructed with |allocator| and sized up-front for the
	// number of entries in |list|.
	Thread::Affinity::Affinity(std::initializer_list<Core> list,
	                           Allocator* allocator)
	    : cores(allocator) {
	  cores.reserve(list.size());
	  for (auto it = list.begin(); it != list.end(); ++it) {
	    cores.push_back(*it);
	  }
	}

	// Returns an Affinity listing the cores available for use. The core list is
	// constructed with |allocator|.
	//  - Windows: for each processor group reported by getProcessorGroups(),
	//    every core index below the group's processor count whose bit is set in
	//    the group's affinity mask.
	//  - Linux (non-Android) and FreeBSD: every CPU that is set in the calling
	//    thread's affinity mask. The list is left empty if the mask cannot be
	//    queried.
	//  - Other platforms: only compiles when Affinity::supported is false, and
	//    returns an empty Affinity.
	Thread::Affinity Thread::Affinity::all(
	    Allocator* allocator /* = Allocator::Default */) {
	  Thread::Affinity affinity(allocator);

	#if defined(_WIN32)
	  const auto& groups = getProcessorGroups();
	  for (size_t groupIdx = 0; groupIdx < groups.count; groupIdx++) {
	    const auto& group = groups.groups[groupIdx];
	    Core core;
	    core.windows.group = static_cast<decltype(Core::windows.group)>(groupIdx);
	    for (unsigned int coreIdx = 0; coreIdx < group.count; coreIdx++) {
	      if ((group.affinity >> coreIdx) & 1) {
	        core.windows.index = static_cast<decltype(core.windows.index)>(coreIdx);
	        affinity.cores.emplace_back(std::move(core));
	      }
	    }
	  }
	#elif defined(__linux__) && !defined(__ANDROID__)
	  auto thread = pthread_self();
	  cpu_set_t cpuset;
	  CPU_ZERO(&cpuset);
	  if (pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset) == 0) {
	    // Report the indices of the CPUs that are actually set in the mask.
	    // Counting the set bits and emitting 0..count-1 would name the wrong
	    // CPUs whenever the mask is sparse (for example CPUs 2 and 3 only).
	    for (int i = 0; i < CPU_SETSIZE; i++) {
	      if (CPU_ISSET(i, &cpuset)) {
	        Core core;
	        core.pthread.index = static_cast<uint16_t>(i);
	        affinity.cores.emplace_back(std::move(core));
	      }
	    }
	  }
	#elif defined(__FreeBSD__)
	  auto thread = pthread_self();
	  cpuset_t cpuset;
	  CPU_ZERO(&cpuset);
	  if (pthread_getaffinity_np(thread, sizeof(cpuset_t), &cpuset) == 0) {
	    // As on Linux: enumerate the set bits rather than assuming the mask is
	    // a contiguous run starting at CPU 0.
	    for (int i = 0; i < CPU_SETSIZE; i++) {
	      if (CPU_ISSET(i, &cpuset)) {
	        Core core;
	        core.pthread.index = static_cast<uint16_t>(i);
	        affinity.cores.emplace_back(std::move(core));
	      }
	    }
	  }
	#else
	  static_assert(!supported,
	                "marl::Thread::Affinity::supported is true, but "
	                "Thread::Affinity::all() is not implemented for this platform");
	#endif

	  return affinity;
	}

	// Creates a policy that owns |affinity| and hands out affinities derived
	// from it. The policy object itself is created through |allocator|.
	//
	// On Windows, get() picks the processor group of the core at index
	// (threadId % core count) and returns only the cores belonging to that
	// group. On every other platform, and whenever the affinity is empty, get()
	// returns a copy of the whole affinity.
	std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::anyOf(
	    Affinity&& affinity,
	    Allocator* allocator /* = Allocator::Default */) {
	  struct AnyOfPolicy : public Thread::Affinity::Policy {
	    Affinity affinity;
	    AnyOfPolicy(Affinity&& source) : affinity(std::move(source)) {}

	    Affinity get(uint32_t threadId, Allocator* allocator) const override {
	#if defined(_WIN32)
	      const auto total = affinity.count();
	      if (total != 0) {
	        // All cores handed to one thread must share a processor group.
	        const auto wanted = affinity[threadId % total].windows.group;
	        Affinity selected(allocator);
	        selected.cores.reserve(total);
	        for (const auto& candidate : affinity.cores) {
	          if (candidate.windows.group == wanted) {
	            selected.cores.push_back(candidate);
	          }
	        }
	        return selected;
	      }
	#endif
	      return Affinity(affinity, allocator);
	    }
	  };

	  return allocator->make_shared<AnyOfPolicy>(std::move(affinity));
	}

	// Creates a policy that owns |affinity| and assigns each thread exactly one
	// of its cores. The policy object itself is created through |allocator|.
	//
	// get() returns an affinity holding only the core at index
	// (threadId % core count). If the owned affinity is empty, get() returns a
	// copy of that (empty) affinity instead.
	std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::oneOf(
	    Affinity&& affinity,
	    Allocator* allocator /* = Allocator::Default */) {
	  struct OneOfPolicy : public Thread::Affinity::Policy {
	    Affinity affinity;
	    OneOfPolicy(Affinity&& source) : affinity(std::move(source)) {}

	    Affinity get(uint32_t threadId, Allocator* allocator) const override {
	      const auto total = affinity.count();
	      if (total > 0) {
	        const auto chosen = affinity[threadId % total];
	        return Affinity({chosen}, allocator);
	      }
	      return Affinity(affinity, allocator);
	    }
	  };

	  return allocator->make_shared<OneOfPolicy>(std::move(affinity));
	}

	// Returns the number of cores held by this affinity.
	size_t Thread::Affinity::count() const {
	return cores.size();
	}

	// Returns a copy of the core stored at position |index|. No bounds check is
	// performed here.
	Thread::Core Thread::Affinity::operator[](size_t index) const {
	return cores[index];
	}

	// Appends to this affinity every core of |other| that was not already held
	// by this affinity when the call started, then sorts the core list.
	// Returns *this to allow chaining.
	Thread::Affinity& Thread::Affinity::add(const Thread::Affinity& other) {
	  // Snapshot of the cores held before anything is appended.
	  containers::unordered_set<Core, CoreHasher> present(cores.allocator);
	  for (const auto& existing : cores) {
	    present.emplace(existing);
	  }

	  for (const auto& candidate : other.cores) {
	    if (present.count(candidate) != 0) {
	      continue;
	    }
	    cores.push_back(candidate);
	  }

	  std::sort(cores.begin(), cores.end());
	  return *this;
	}

	// Removes from this affinity every core that is also present in |other|,
	// then sorts the remaining cores. Returns *this to allow chaining.
	Thread::Affinity& Thread::Affinity::remove(const Thread::Affinity& other) {
	  containers::unordered_set<Core, CoreHasher> set(cores.allocator);
	  for (auto core : other.cores) {
	    set.emplace(core);
	  }
	  // Unordered removal: overwrite the doomed slot with the last element and
	  // shrink by one. The element moved into slot i has not been examined yet,
	  // so i only advances when nothing was removed.
	  for (size_t i = 0; i < cores.size();) {
	    if (set.count(cores[i]) != 0) {
	      cores[i] = cores.back();
	      cores.resize(cores.size() - 1);
	    } else {
	      i++;
	    }
	  }
	  std::sort(cores.begin(), cores.end());
	  return *this;
	}

	#if defined(_WIN32)

	// Windows implementation state for a Thread: the function to run and the
	// handle of the OS thread.
	class Thread::Impl {
	public:
	// Stores |func|. |handle| is not initialized here; the Thread constructor
	// assigns it after creating the OS thread.
	Impl(Func&& func) : func(std::move(func)) {}
	// Thread entry point. |self| is the Impl* supplied at thread creation;
	// runs its function and returns exit code 0.
	static DWORD WINAPI run(void* self) {
	reinterpret_cast<Impl*>(self)->func();
	return 0;
	}

	// Function executed by run().
	Func func;
	// Thread handle, closed by Thread::~Thread().
	HANDLE handle;
	};

	// Creates and starts a new OS thread that runs |func|.
	// If |affinity| is non-empty the thread is created with a group affinity
	// built from its cores; all of those cores must belong to the same
	// processor group. An empty |affinity| applies no affinity attribute.
	Thread::Thread(Affinity&& affinity, Func&& func) {
	  // First call only queries the buffer size needed for one attribute.
	  SIZE_T size = 0;
	  InitializeProcThreadAttributeList(nullptr, 1, 0, &size);
	  MARL_ASSERT(size > 0,
	              "InitializeProcThreadAttributeList() did not give a size");

	  std::vector<uint8_t> buffer(size);
	  LPPROC_THREAD_ATTRIBUTE_LIST attributes =
	      reinterpret_cast<LPPROC_THREAD_ATTRIBUTE_LIST>(buffer.data());
	  CHECK_WIN32(InitializeProcThreadAttributeList(attributes, 1, 0, &size));
	  defer(DeleteProcThreadAttributeList(attributes));

	  GROUP_AFFINITY groupAffinity = {};

	  auto count = affinity.count();
	  if (count > 0) {
	    groupAffinity.Group = affinity[0].windows.group;
	    for (size_t i = 0; i < count; i++) {
	      auto core = affinity[i];
	      MARL_ASSERT(groupAffinity.Group == core.windows.group,
	                  "Cannot create thread that uses multiple affinity groups");
	      // A shift by the mask width or more is undefined behaviour and could
	      // not be represented in the mask anyway.
	      MARL_ASSERT(core.windows.index < sizeof(KAFFINITY) * 8,
	                  "Core index %d does not fit in the group affinity mask",
	                  (int)core.windows.index);
	      // Plain compound OR-assignment; the stray backslash before the
	      // operator was not valid C++.
	      groupAffinity.Mask |= (1ULL << core.windows.index);
	    }
	    CHECK_WIN32(UpdateProcThreadAttribute(
	        attributes, 0, PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY, &groupAffinity,
	        sizeof(groupAffinity), nullptr, nullptr));
	  }

	  impl = new Impl(std::move(func));
	  impl->handle = CreateRemoteThreadEx(GetCurrentProcess(), nullptr, 0,
	                                      &Impl::run, impl, 0, attributes, nullptr);
	  // A null handle would otherwise be passed on silently to join() and to
	  // the destructor.
	  MARL_ASSERT(impl->handle != nullptr,
	              "CreateRemoteThreadEx() failed with error: %d",
	              (int)GetLastError());
	}

	// Releases the thread handle and the implementation object, if this Thread
	// still owns one (a moved-from Thread has a null impl).
	Thread::~Thread() {
	if (impl) {
	CloseHandle(impl->handle);
	delete impl;
	}
	}

	// Blocks, with no timeout, until the thread's handle becomes signaled.
	// Asserts that this Thread owns an implementation object.
	void Thread::join() {
	MARL_ASSERT(impl != nullptr, "join() called on unjoinable thread");
	WaitForSingleObject(impl->handle, INFINITE);
	}

	// Names the calling thread using a printf-style format string.
	// The formatted name is truncated to fit a 1024-byte buffer. The name is
	// applied through SetThreadDescription, which is looked up at runtime in
	// kernelbase.dll; if that lookup fails this function does nothing at all.
	void Thread::setName(const char* fmt, ...) {
	  static auto setThreadDescription =
	      reinterpret_cast<HRESULT(WINAPI*)(HANDLE, PCWSTR)>(GetProcAddress(
	          GetModuleHandle("kernelbase.dll"), "SetThreadDescription"));
	  if (setThreadDescription == nullptr) {
	    return;
	  }

	  char name[1024];
	  va_list vararg;
	  va_start(vararg, fmt);
	  vsnprintf(name, sizeof(name), fmt, vararg);
	  va_end(vararg);

	  // Zero-filled, and converted with room left for the terminator, so the
	  // wide string is always null-terminated.
	  wchar_t wname[1024] = {};
	  const size_t converted = mbstowcs(wname, name, 1023);
	  // mbstowcs reports an invalid multibyte sequence with (size_t)-1; in that
	  // case the wide buffer is not a usable string, so skip the OS call.
	  if (converted != static_cast<size_t>(-1)) {
	    setThreadDescription(GetCurrentThread(), wname);
	  }
	  MARL_NAME_THREAD("%s", name);
	}

	// Returns the sum of the logical processor counts of all processor groups
	// reported by getProcessorGroups().
	unsigned int Thread::numLogicalCPUs() {
	  const auto& table = getProcessorGroups();
	  unsigned int total = 0;
	  for (size_t i = 0; i != table.count; ++i) {
	    total += table.groups[i].count;
	  }
	  return total;
	}

	#else

	// Non-Windows implementation state for a Thread: the requested affinity,
	// the function to run, and the std::thread that runs it.
	class Thread::Impl {
	 public:
	  // Declaration order matters: |thread| is declared last so that |affinity|
	  // and |func| are constructed before the new thread starts using them.
	  Affinity affinity;
	  Func func;
	  std::thread thread;

	  // Takes ownership of the affinity and the function, then immediately
	  // starts a thread that applies the affinity and calls the function.
	  Impl(Affinity&& a, Thread::Func&& f)
	      : affinity(std::move(a)), func(std::move(f)), thread([this] {
	          setAffinity();
	          func();
	        }) {}

	  // Applies |affinity| to the calling thread. Does nothing when the
	  // affinity holds no cores. The result of pthread_setaffinity_np() is not
	  // inspected.
	  void setAffinity() {
	    const auto numCores = affinity.count();
	    if (numCores == 0) {
	      return;
	    }

	#if defined(__linux__) && !defined(__ANDROID__)
	    cpu_set_t mask;
	    CPU_ZERO(&mask);
	    for (size_t i = 0; i < numCores; i++) {
	      CPU_SET(affinity[i].pthread.index, &mask);
	    }
	    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask);
	#elif defined(__FreeBSD__)
	    cpuset_t mask;
	    CPU_ZERO(&mask);
	    for (size_t i = 0; i < numCores; i++) {
	      CPU_SET(affinity[i].pthread.index, &mask);
	    }
	    pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), &mask);
	#else
	    MARL_ASSERT(!marl::Thread::Affinity::supported,
	                "Attempting to use thread affinity on a unsupported platform");
	#endif
	  }
	};

	// Creates the implementation object from |affinity| and |func|; the Impl
	// constructor starts the underlying thread.
	Thread::Thread(Affinity&& affinity, Func&& func)
	: impl(new Thread::Impl(std::move(affinity), std::move(func))) {}

	// Asserts that impl is null at destruction. join() is what deletes and
	// clears impl, so a still-set impl means join() was never called.
	Thread::~Thread() {
	MARL_ASSERT(!impl, "Thread::join() was not called before destruction");
	}

	void Thread::join() {
	impl->thread.join();
	delete impl;
	impl = nullptr;
	}

	// Names the calling thread using a printf-style format string.
	// The formatted name is truncated to fit a 1024-byte buffer, handed to the
	// platform's pthread naming function (skipped on Fuchsia), and finally
	// passed to MARL_NAME_THREAD.
	void Thread::setName(const char* fmt, ...) {
	  char formatted[1024];
	  va_list args;
	  va_start(args, fmt);
	  vsnprintf(formatted, sizeof(formatted), fmt, args);
	  va_end(args);

	#if defined(__APPLE__)
	  // The Apple variant takes only the name.
	  pthread_setname_np(formatted);
	#elif defined(__FreeBSD__)
	  pthread_set_name_np(pthread_self(), formatted);
	#elif !defined(__Fuchsia__)
	  pthread_setname_np(pthread_self(), formatted);
	#endif

	  MARL_NAME_THREAD("%s", formatted);
	}

	// Returns the value of sysconf(_SC_NPROCESSORS_ONLN) converted to unsigned.
	// NOTE(review): sysconf() reports errors as -1, which this cast would turn
	// into a very large count - confirm whether callers need a guard.
	unsigned int Thread::numLogicalCPUs() {
	return static_cast<unsigned int>(sysconf(_SC_NPROCESSORS_ONLN));
	}

	#endif // OS

	// Move constructor: takes over the implementation pointer of |rhs| and
	// leaves |rhs| with a null impl.
	Thread::Thread(Thread&& rhs) : impl(rhs.impl) {
	rhs.impl = nullptr;
	}

	// Move assignment: releases any implementation currently owned by this
	// Thread, takes over the one owned by |rhs|, and leaves |rhs| with a null
	// impl. Assigning a Thread to itself is a no-op.
	Thread& Thread::operator=(Thread&& rhs) {
	  // Without this guard a self-move would delete the implementation that is
	  // about to be adopted.
	  if (this == &rhs) {
	    return *this;
	  }
	  if (impl) {
	#if defined(_WIN32)
	    // Mirror the destructor: the thread handle must be closed before the
	    // Impl that stores it is deleted, or the handle leaks.
	    CloseHandle(impl->handle);
	#endif
	    // NOTE(review): on non-Windows builds Impl holds a std::thread, so the
	    // old thread presumably has to be join()ed before reaching this point -
	    // confirm against callers.
	    delete impl;
	    impl = nullptr;
	  }
	  impl = rhs.impl;
	  rhs.impl = nullptr;
	  return *this;
	}

	} // namespace marl