|  | // Copyright 2019 The Marl Authors. | 
|  | // | 
|  | // Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | // you may not use this file except in compliance with the License. | 
|  | // You may obtain a copy of the License at | 
|  | // | 
|  | //     https://www.apache.org/licenses/LICENSE-2.0 | 
|  | // | 
|  | // Unless required by applicable law or agreed to in writing, software | 
|  | // distributed under the License is distributed on an "AS IS" BASIS, | 
|  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | // See the License for the specific language governing permissions and | 
|  | // limitations under the License. | 
|  |  | 
|  | #include "marl/thread.h" | 
|  |  | 
|  | #include "marl/debug.h" | 
|  | #include "marl/defer.h" | 
|  | #include "marl/trace.h" | 
|  |  | 
|  | #include <algorithm>  // std::sort | 
|  |  | 
|  | #include <cstdarg> | 
|  | #include <cstdio> | 
|  |  | 
|  | #if defined(_WIN32) | 
|  | #define WIN32_LEAN_AND_MEAN 1 | 
|  | #include <windows.h> | 
|  | #include <array> | 
|  | #include <cstdlib>  // mbstowcs | 
|  | #include <limits>   // std::numeric_limits | 
|  | #include <vector> | 
|  | #undef max | 
|  | #elif defined(__APPLE__) | 
|  | #include <mach/thread_act.h> | 
|  | #include <pthread.h> | 
|  | #include <unistd.h> | 
|  | #include <thread> | 
|  | #elif defined(__FreeBSD__) | 
|  | #include <pthread.h> | 
|  | #include <pthread_np.h> | 
|  | #include <unistd.h> | 
|  | #include <thread> | 
|  | #else | 
|  | #include <pthread.h> | 
|  | #include <unistd.h> | 
|  | #include <thread> | 
|  | #endif | 
|  |  | 
namespace {

// Hash functor for marl::Thread::Core, used by the containers::unordered_set
// instances in Thread::Affinity::add() / remove().
// Only core.pthread.index feeds the hash; any distinguishing state outside
// that field (e.g. a Windows processor-group id) would merely cause hash
// collisions, which is still correct for a hash — just slower.
struct CoreHasher {
inline uint64_t operator()(const marl::Thread::Core& core) const {
return core.pthread.index;
}
};

}  // anonymous namespace
|  |  | 
|  | namespace marl { | 
|  |  | 
#if defined(_WIN32)
// Maximum core / group counts representable by the widths of the
// Core::windows.index / Core::windows.group fields (derived from the field
// types, so they track any future header change).
static constexpr size_t MaxCoreCount =
std::numeric_limits<decltype(Thread::Core::windows.index)>::max() + 1ULL;
static constexpr size_t MaxGroupCount =
std::numeric_limits<decltype(Thread::Core::windows.group)>::max() + 1ULL;
// A KAFFINITY mask holds one bit per logical processor in a group;
// Core::windows.index must be able to encode every bit position.
static_assert(sizeof(KAFFINITY) * 8ULL <= MaxCoreCount,
"Thread::Core::windows.index is too small");

namespace {
// Asserts that a BOOL-returning Win32 call succeeded, reporting
// GetLastError() in the failure message. |expr| is evaluated exactly once.
#define CHECK_WIN32(expr)                                    \
do {                                                       \
auto res = expr;                                         \
(void)res;                                               \
MARL_ASSERT(res == TRUE, #expr " failed with error: %d", \
(int)GetLastError());                        \
} while (false)

// Description of a single Windows processor group.
struct ProcessorGroup {
unsigned int count;  // number of logical processors in this group.
KAFFINITY affinity;  // affinity mask.
};

struct ProcessorGroups {
std::array<ProcessorGroup, MaxGroupCount> groups;  // valid range: [0, count)
size_t count;
};

// Queries the system's processor groups once (thread-safe via the
// static-local initializer) and caches the result for the process lifetime.
// NOTE(review): the fixed 32-entry |info| buffer is assumed large enough for
// GetLogicalProcessorInformationEx(RelationGroup); CHECK_WIN32 fires if the
// call fails (e.g. for lack of space) — confirm this bound is sufficient.
const ProcessorGroups& getProcessorGroups() {
static ProcessorGroups groups = [] {
ProcessorGroups out = {};
SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info[32] = {};
DWORD size = sizeof(info);
CHECK_WIN32(GetLogicalProcessorInformationEx(RelationGroup, info, &size));
DWORD count = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX);
for (DWORD i = 0; i < count; i++) {
if (info[i].Relationship == RelationGroup) {
auto groupCount = info[i].Group.ActiveGroupCount;
for (WORD groupIdx = 0; groupIdx < groupCount; groupIdx++) {
auto const& groupInfo = info[i].Group.GroupInfo[groupIdx];
out.groups[out.count++] = ProcessorGroup{
groupInfo.ActiveProcessorCount, groupInfo.ActiveProcessorMask};
MARL_ASSERT(out.count <= MaxGroupCount, "Group index overflow");
}
}
}
return out;
}();
return groups;
}
}  // namespace
#endif  // defined(_WIN32)
|  |  | 
|  | //////////////////////////////////////////////////////////////////////////////// | 
|  | // Thread::Affinty | 
|  | //////////////////////////////////////////////////////////////////////////////// | 
|  |  | 
// Constructs an empty affinity set whose core list allocates from |allocator|.
Thread::Affinity::Affinity(Allocator* allocator) : cores(allocator) {}
// Move constructor: steals the core list (and its allocator) from |other|.
Thread::Affinity::Affinity(Affinity&& other) : cores(std::move(other.cores)) {}
// Copying constructor that re-homes the copied cores onto |allocator|.
Thread::Affinity::Affinity(const Affinity& other, Allocator* allocator)
    : cores(other.cores, allocator) {}
|  |  | 
|  | Thread::Affinity::Affinity(std::initializer_list<Core> list, | 
|  | Allocator* allocator) | 
|  | : cores(allocator) { | 
|  | cores.reserve(list.size()); | 
|  | for (auto core : list) { | 
|  | cores.push_back(core); | 
|  | } | 
|  | } | 
|  |  | 
|  | Thread::Affinity Thread::Affinity::all( | 
|  | Allocator* allocator /* = Allocator::Default */) { | 
|  | Thread::Affinity affinity(allocator); | 
|  |  | 
|  | #if defined(_WIN32) | 
|  | const auto& groups = getProcessorGroups(); | 
|  | for (size_t groupIdx = 0; groupIdx < groups.count; groupIdx++) { | 
|  | const auto& group = groups.groups[groupIdx]; | 
|  | Core core; | 
|  | core.windows.group = static_cast<decltype(Core::windows.group)>(groupIdx); | 
|  | for (unsigned int coreIdx = 0; coreIdx < group.count; coreIdx++) { | 
|  | if ((group.affinity >> coreIdx) & 1) { | 
|  | core.windows.index = static_cast<decltype(core.windows.index)>(coreIdx); | 
|  | affinity.cores.emplace_back(std::move(core)); | 
|  | } | 
|  | } | 
|  | } | 
|  | #elif defined(__linux__) && !defined(__ANDROID__) | 
|  | auto thread = pthread_self(); | 
|  | cpu_set_t cpuset; | 
|  | CPU_ZERO(&cpuset); | 
|  | if (pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset) == 0) { | 
|  | int count = CPU_COUNT(&cpuset); | 
|  | for (int i = 0; i < count; i++) { | 
|  | Core core; | 
|  | core.pthread.index = static_cast<uint16_t>(i); | 
|  | affinity.cores.emplace_back(std::move(core)); | 
|  | } | 
|  | } | 
|  | #elif defined(__FreeBSD__) | 
|  | auto thread = pthread_self(); | 
|  | cpuset_t cpuset; | 
|  | CPU_ZERO(&cpuset); | 
|  | if (pthread_getaffinity_np(thread, sizeof(cpuset_t), &cpuset) == 0) { | 
|  | int count = CPU_COUNT(&cpuset); | 
|  | for (int i = 0; i < count; i++) { | 
|  | Core core; | 
|  | core.pthread.index = static_cast<uint16_t>(i); | 
|  | affinity.cores.emplace_back(std::move(core)); | 
|  | } | 
|  | } | 
|  | #else | 
|  | static_assert(!supported, | 
|  | "marl::Thread::Affinity::supported is true, but " | 
|  | "Thread::Affinity::all() is not implemented for this platform"); | 
|  | #endif | 
|  |  | 
|  | return affinity; | 
|  | } | 
|  |  | 
// Returns a policy that lets each thread run on any core of the supplied set.
// On Windows the set returned for a given thread is narrowed to a single
// processor group (chosen round-robin by threadId), because a Windows thread
// cannot span processor groups; elsewhere the full set is returned.
std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::anyOf(
    Affinity&& affinity,
    Allocator* allocator /* = Allocator::Default */) {
  struct Policy : public Thread::Affinity::Policy {
    Affinity affinity;
    Policy(Affinity&& affinity) : affinity(std::move(affinity)) {}

    Affinity get(uint32_t threadId, Allocator* allocator) const override {
#if defined(_WIN32)
      auto count = affinity.count();
      if (count == 0) {
        // Nothing to filter — return a copy of the (empty) set.
        return Affinity(affinity, allocator);
      }
      // Pick a core round-robin by thread id, then keep only the cores that
      // share its processor group.
      auto group = affinity[threadId % affinity.count()].windows.group;
      Affinity out(allocator);
      out.cores.reserve(count);
      for (auto core : affinity.cores) {
        if (core.windows.group == group) {
          out.cores.push_back(core);
        }
      }
      return out;
#else
      // Non-Windows: no group restriction — hand back the whole set.
      return Affinity(affinity, allocator);
#endif
    }
  };

  return allocator->make_shared<Policy>(std::move(affinity));
}
|  |  | 
|  | std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::oneOf( | 
|  | Affinity&& affinity, | 
|  | Allocator* allocator /* = Allocator::Default */) { | 
|  | struct Policy : public Thread::Affinity::Policy { | 
|  | Affinity affinity; | 
|  | Policy(Affinity&& affinity) : affinity(std::move(affinity)) {} | 
|  |  | 
|  | Affinity get(uint32_t threadId, Allocator* allocator) const override { | 
|  | auto count = affinity.count(); | 
|  | if (count == 0) { | 
|  | return Affinity(affinity, allocator); | 
|  | } | 
|  | return Affinity({affinity[threadId % affinity.count()]}, allocator); | 
|  | } | 
|  | }; | 
|  |  | 
|  | return allocator->make_shared<Policy>(std::move(affinity)); | 
|  | } | 
|  |  | 
// Returns the number of cores in this affinity set.
size_t Thread::Affinity::count() const {
  return cores.size();
}
|  |  | 
// Returns (by value) the core at |index|. No bounds checking is performed;
// |index| must be < count().
Thread::Core Thread::Affinity::operator[](size_t index) const {
  return cores[index];
}
|  |  | 
|  | Thread::Affinity& Thread::Affinity::add(const Thread::Affinity& other) { | 
|  | containers::unordered_set<Core, CoreHasher> set(cores.allocator); | 
|  | for (auto core : cores) { | 
|  | set.emplace(core); | 
|  | } | 
|  | for (auto core : other.cores) { | 
|  | if (set.count(core) == 0) { | 
|  | cores.push_back(core); | 
|  | } | 
|  | } | 
|  | std::sort(cores.begin(), cores.end()); | 
|  | return *this; | 
|  | } | 
|  |  | 
|  | Thread::Affinity& Thread::Affinity::remove(const Thread::Affinity& other) { | 
|  | containers::unordered_set<Core, CoreHasher> set(cores.allocator); | 
|  | for (auto core : other.cores) { | 
|  | set.emplace(core); | 
|  | } | 
|  | for (size_t i = 0; i < cores.size(); i++) { | 
|  | if (set.count(cores[i]) != 0) { | 
|  | cores[i] = cores.back(); | 
|  | cores.resize(cores.size() - 1); | 
|  | } | 
|  | } | 
|  | std::sort(cores.begin(), cores.end()); | 
|  | return *this; | 
|  | } | 
|  |  | 
|  | #if defined(_WIN32) | 
|  |  | 
// Windows thread state: owns the user entry point and the Win32 thread
// handle. Allocated by Thread's constructor, freed by its destructor.
class Thread::Impl {
 public:
  Impl(Func&& func) : func(std::move(func)) {}
  // Win32 thread entry point: |self| is the Impl* passed to
  // CreateRemoteThreadEx().
  static DWORD WINAPI run(void* self) {
    reinterpret_cast<Impl*>(self)->func();
    return 0;
  }

  Func func;
  HANDLE handle;
};
|  |  | 
// Creates and starts a thread running |func|, optionally bound to the cores
// in |affinity| (which, on Windows, must all share one processor group).
Thread::Thread(Affinity&& affinity, Func&& func) {
  // First call deliberately "fails" and writes the required attribute-list
  // byte count into |size|.
  SIZE_T size = 0;
  InitializeProcThreadAttributeList(nullptr, 1, 0, &size);
  MARL_ASSERT(size > 0,
              "InitializeProcThreadAttributeList() did not give a size");

  std::vector<uint8_t> buffer(size);
  LPPROC_THREAD_ATTRIBUTE_LIST attributes =
      reinterpret_cast<LPPROC_THREAD_ATTRIBUTE_LIST>(buffer.data());
  CHECK_WIN32(InitializeProcThreadAttributeList(attributes, 1, 0, &size));
  defer(DeleteProcThreadAttributeList(attributes));

  GROUP_AFFINITY groupAffinity = {};

  auto count = affinity.count();
  if (count > 0) {
    // Build a single-group affinity mask; a Windows thread cannot span
    // processor groups, so every core must agree on the group.
    groupAffinity.Group = affinity[0].windows.group;
    for (size_t i = 0; i < count; i++) {
      auto core = affinity[i];
      MARL_ASSERT(groupAffinity.Group == core.windows.group,
                  "Cannot create thread that uses multiple affinity groups");
      groupAffinity.Mask |= (1ULL << core.windows.index);
    }
    CHECK_WIN32(UpdateProcThreadAttribute(
        attributes, 0, PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY, &groupAffinity,
        sizeof(groupAffinity), nullptr, nullptr));
  }

  impl = new Impl(std::move(func));
  // CreateRemoteThreadEx() targeting our own process is used (rather than
  // CreateThread()) because it accepts the attribute list that carries the
  // group affinity. NOTE(review): the returned handle is not checked for
  // nullptr — a failed creation would only surface later; confirm intended.
  impl->handle = CreateRemoteThreadEx(GetCurrentProcess(), nullptr, 0,
                                      &Impl::run, impl, 0, attributes, nullptr);
}
|  |  | 
// Releases the thread handle and state. Does not wait for the thread:
// presumably callers are expected to join() first — confirm against the
// class's documented contract.
Thread::~Thread() {
  if (impl) {  // may be null after a move
    CloseHandle(impl->handle);
    delete impl;
  }
}
|  |  | 
// Blocks until the thread's entry function returns.
void Thread::join() {
  MARL_ASSERT(impl != nullptr, "join() called on unjoinable thread");
  WaitForSingleObject(impl->handle, INFINITE);
}
|  |  | 
// Sets the current thread's name (printf-style) for debuggers/profilers.
void Thread::setName(const char* fmt, ...) {
  // SetThreadDescription() only exists on newer Windows versions, so it is
  // looked up dynamically (once) rather than linked directly.
  static auto setThreadDescription =
      reinterpret_cast<HRESULT(WINAPI*)(HANDLE, PCWSTR)>(GetProcAddress(
          GetModuleHandle("kernelbase.dll"), "SetThreadDescription"));
  if (setThreadDescription == nullptr) {
    return;  // API unavailable on this OS — silently skip naming.
  }

  char name[1024];
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(name, sizeof(name), fmt, vararg);
  va_end(vararg);

  // The API takes a wide string; convert the formatted narrow name.
  wchar_t wname[1024];
  mbstowcs(wname, name, 1024);
  setThreadDescription(GetCurrentThread(), wname);
  MARL_NAME_THREAD("%s", name);
}
|  |  | 
|  | unsigned int Thread::numLogicalCPUs() { | 
|  | unsigned int count = 0; | 
|  | const auto& groups = getProcessorGroups(); | 
|  | for (size_t groupIdx = 0; groupIdx < groups.count; groupIdx++) { | 
|  | const auto& group = groups.groups[groupIdx]; | 
|  | count += group.count; | 
|  | } | 
|  | return count; | 
|  | } | 
|  |  | 
|  | #else | 
|  |  | 
// POSIX thread state: owns the requested affinity, the user entry point and
// the std::thread. The thread is started immediately by the constructor.
class Thread::Impl {
 public:
  // Note: |thread| is declared last, so |affinity| and |func| are fully
  // constructed before the lambda (running on the new thread) can touch them.
  Impl(Affinity&& affinity, Thread::Func&& f)
      : affinity(std::move(affinity)), func(std::move(f)), thread([this] {
          setAffinity();  // pin the new thread before running user code
          func();
        }) {}

  Affinity affinity;  // requested core set; empty means unrestricted.
  Func func;          // user entry point.
  std::thread thread;

  // Applies |affinity| to the calling thread. No-op when the set is empty,
  // and asserts on platforms without affinity support.
  void setAffinity() {
    auto count = affinity.count();
    if (count == 0) {
      return;
    }

#if defined(__linux__) && !defined(__ANDROID__)
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    for (size_t i = 0; i < count; i++) {
      CPU_SET(affinity[i].pthread.index, &cpuset);
    }
    auto thread = pthread_self();
    // Return value intentionally ignored: failure leaves the default mask.
    pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
#elif defined(__FreeBSD__)
    cpuset_t cpuset;
    CPU_ZERO(&cpuset);
    for (size_t i = 0; i < count; i++) {
      CPU_SET(affinity[i].pthread.index, &cpuset);
    }
    auto thread = pthread_self();
    pthread_setaffinity_np(thread, sizeof(cpuset_t), &cpuset);
#else
    MARL_ASSERT(!marl::Thread::Affinity::supported,
                "Attempting to use thread affinity on a unsupported platform");
#endif
  }
};
|  |  | 
// Creates and immediately starts a thread running |func|, pinned to the
// cores in |affinity| (if non-empty).
Thread::Thread(Affinity&& affinity, Func&& func)
    : impl(new Thread::Impl(std::move(affinity), std::move(func))) {}
|  |  | 
// Destroys the thread state. NOTE(review): std::thread's destructor calls
// std::terminate() if the thread is still joinable — presumably callers must
// join() before destruction; confirm against the class's documented contract.
Thread::~Thread() {
  delete impl;  // safe when impl is null (moved-from Thread)
}
|  |  | 
// Blocks until the thread's entry function returns.
void Thread::join() {
  impl->thread.join();
}
|  |  | 
// Sets the current thread's name (printf-style) for debuggers/profilers.
void Thread::setName(const char* fmt, ...) {
  char name[1024];
  va_list vararg;
  va_start(vararg, fmt);
  vsnprintf(name, sizeof(name), fmt, vararg);
  va_end(vararg);

  // Each platform spells the pthread naming API differently.
#if defined(__APPLE__)
  // macOS can only name the calling thread (no thread argument).
  pthread_setname_np(name);
#elif defined(__FreeBSD__)
  pthread_set_name_np(pthread_self(), name);
#elif !defined(__Fuchsia__)
  // Fuchsia has no pthread naming API; all other platforms take (tid, name).
  pthread_setname_np(pthread_self(), name);
#endif

  MARL_NAME_THREAD("%s", name);
}
|  |  | 
|  | unsigned int Thread::numLogicalCPUs() { | 
|  | return static_cast<unsigned int>(sysconf(_SC_NPROCESSORS_ONLN)); | 
|  | } | 
|  |  | 
|  | #endif  // OS | 
|  |  | 
// Move constructor: takes ownership of rhs's thread state, leaving rhs
// unjoinable (impl == nullptr).
Thread::Thread(Thread&& rhs) : impl(rhs.impl) {
  rhs.impl = nullptr;
}
|  |  | 
|  | Thread& Thread::operator=(Thread&& rhs) { | 
|  | if (impl) { | 
|  | delete impl; | 
|  | impl = nullptr; | 
|  | } | 
|  | impl = rhs.impl; | 
|  | rhs.impl = nullptr; | 
|  | return *this; | 
|  | } | 
|  |  | 
|  | }  // namespace marl |