mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Thread pool
This commit is contained in:
committed by
Rasmus Munk Larsen
parent
9eb8e2afba
commit
94f57867fe
69
Eigen/src/ThreadPool/Barrier.h
Normal file
69
Eigen/src/ThreadPool/Barrier.h
Normal file
@@ -0,0 +1,69 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Barrier is an object that allows one or more threads to wait until
|
||||
// Notify has been called a specified number of times.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
|
||||
#define EIGEN_CXX11_THREADPOOL_BARRIER_H
|
||||
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
class Barrier {
|
||||
public:
|
||||
Barrier(unsigned int count) : state_(count << 1), notified_(false) {
|
||||
eigen_plain_assert(((count << 1) >> 1) == count);
|
||||
}
|
||||
~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
|
||||
|
||||
void Notify() {
|
||||
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
|
||||
if (v != 1) {
|
||||
// Clear the lowest bit (waiter flag) and check that the original state
|
||||
// value was not zero. If it was zero, it means that notify was called
|
||||
// more times than the original count.
|
||||
eigen_plain_assert(((v + 2) & ~1) != 0);
|
||||
return; // either count has not dropped to 0, or waiter is not waiting
|
||||
}
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
eigen_plain_assert(!notified_);
|
||||
notified_ = true;
|
||||
cv_.notify_all();
|
||||
}
|
||||
|
||||
void Wait() {
|
||||
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
|
||||
if ((v >> 1) == 0) return;
|
||||
std::unique_lock<std::mutex> l(mu_);
|
||||
while (!notified_) {
|
||||
cv_.wait(l);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mu_;
|
||||
std::condition_variable cv_;
|
||||
std::atomic<unsigned int> state_; // low bit is waiter flag
|
||||
bool notified_;
|
||||
};
|
||||
|
||||
// Notification is an object that allows a user to wait for another
|
||||
// thread to signal a notification that an event has occurred.
|
||||
//
|
||||
// Multiple threads can wait on the same Notification object,
|
||||
// but only one caller must call Notify() on the object.
|
||||
struct Notification : Barrier {
|
||||
Notification() : Barrier(1){};
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_BARRIER_H
|
||||
251
Eigen/src/ThreadPool/EventCount.h
Normal file
251
Eigen/src/ThreadPool/EventCount.h
Normal file
@@ -0,0 +1,251 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
|
||||
#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
|
||||
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// EventCount allows to wait for arbitrary predicates in non-blocking
|
||||
// algorithms. Think of condition variable, but wait predicate does not need to
|
||||
// be protected by a mutex. Usage:
|
||||
// Waiting thread does:
|
||||
//
|
||||
// if (predicate)
|
||||
// return act();
|
||||
// EventCount::Waiter& w = waiters[my_index];
|
||||
// ec.Prewait();
|
||||
// if (predicate) {
|
||||
// ec.CancelWait();
|
||||
// return act();
|
||||
// }
|
||||
// ec.CommitWait(&w);
|
||||
//
|
||||
// Notifying thread does:
|
||||
//
|
||||
// predicate = true;
|
||||
// ec.Notify(true);
|
||||
//
|
||||
// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
|
||||
// cheap, but they are executed only if the preceding predicate check has
|
||||
// failed.
|
||||
//
|
||||
// Algorithm outline:
|
||||
// There are two main variables: predicate (managed by user) and state_.
|
||||
// Operation closely resembles Dekker's mutual exclusion algorithm:
|
||||
// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
|
||||
// Waiting thread sets state_ then checks predicate, Notifying thread sets
|
||||
// predicate then checks state_. Due to seq_cst fences in between these
|
||||
// operations it is guaranteed that either waiter will see predicate change
|
||||
// and won't block, or notifying thread will see state_ change and will unblock
|
||||
// the waiter, or both. But it can't happen that both threads don't see each
|
||||
// other's changes, which would lead to deadlock.
|
||||
class EventCount {
|
||||
public:
|
||||
class Waiter;
|
||||
|
||||
EventCount(MaxSizeVector<Waiter>& waiters)
|
||||
: state_(kStackMask), waiters_(waiters) {
|
||||
eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
|
||||
}
|
||||
|
||||
~EventCount() {
|
||||
// Ensure there are no waiters.
|
||||
eigen_plain_assert(state_.load() == kStackMask);
|
||||
}
|
||||
|
||||
// Prewait prepares for waiting.
|
||||
// After calling Prewait, the thread must re-check the wait predicate
|
||||
// and then call either CancelWait or CommitWait.
|
||||
void Prewait() {
|
||||
uint64_t state = state_.load(std::memory_order_relaxed);
|
||||
for (;;) {
|
||||
CheckState(state);
|
||||
uint64_t newstate = state + kWaiterInc;
|
||||
CheckState(newstate);
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_seq_cst))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// CommitWait commits waiting after Prewait.
|
||||
void CommitWait(Waiter* w) {
|
||||
eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
|
||||
w->state = Waiter::kNotSignaled;
|
||||
const uint64_t me = (w - &waiters_[0]) | w->epoch;
|
||||
uint64_t state = state_.load(std::memory_order_seq_cst);
|
||||
for (;;) {
|
||||
CheckState(state, true);
|
||||
uint64_t newstate;
|
||||
if ((state & kSignalMask) != 0) {
|
||||
// Consume the signal and return immediately.
|
||||
newstate = state - kWaiterInc - kSignalInc;
|
||||
} else {
|
||||
// Remove this thread from pre-wait counter and add to the waiter stack.
|
||||
newstate = ((state & kWaiterMask) - kWaiterInc) | me;
|
||||
w->next.store(state & (kStackMask | kEpochMask),
|
||||
std::memory_order_relaxed);
|
||||
}
|
||||
CheckState(newstate);
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_acq_rel)) {
|
||||
if ((state & kSignalMask) == 0) {
|
||||
w->epoch += kEpochInc;
|
||||
Park(w);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CancelWait cancels effects of the previous Prewait call.
|
||||
void CancelWait() {
|
||||
uint64_t state = state_.load(std::memory_order_relaxed);
|
||||
for (;;) {
|
||||
CheckState(state, true);
|
||||
uint64_t newstate = state - kWaiterInc;
|
||||
// We don't know if the thread was also notified or not,
|
||||
// so we should not consume a signal unconditionally.
|
||||
// Only if number of waiters is equal to number of signals,
|
||||
// we know that the thread was notified and we must take away the signal.
|
||||
if (((state & kWaiterMask) >> kWaiterShift) ==
|
||||
((state & kSignalMask) >> kSignalShift))
|
||||
newstate -= kSignalInc;
|
||||
CheckState(newstate);
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_acq_rel))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Notify wakes one or all waiting threads.
|
||||
// Must be called after changing the associated wait predicate.
|
||||
void Notify(bool notifyAll) {
|
||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
||||
uint64_t state = state_.load(std::memory_order_acquire);
|
||||
for (;;) {
|
||||
CheckState(state);
|
||||
const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
|
||||
const uint64_t signals = (state & kSignalMask) >> kSignalShift;
|
||||
// Easy case: no waiters.
|
||||
if ((state & kStackMask) == kStackMask && waiters == signals) return;
|
||||
uint64_t newstate;
|
||||
if (notifyAll) {
|
||||
// Empty wait stack and set signal to number of pre-wait threads.
|
||||
newstate =
|
||||
(state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
|
||||
} else if (signals < waiters) {
|
||||
// There is a thread in pre-wait state, unblock it.
|
||||
newstate = state + kSignalInc;
|
||||
} else {
|
||||
// Pop a waiter from list and unpark it.
|
||||
Waiter* w = &waiters_[state & kStackMask];
|
||||
uint64_t next = w->next.load(std::memory_order_relaxed);
|
||||
newstate = (state & (kWaiterMask | kSignalMask)) | next;
|
||||
}
|
||||
CheckState(newstate);
|
||||
if (state_.compare_exchange_weak(state, newstate,
|
||||
std::memory_order_acq_rel)) {
|
||||
if (!notifyAll && (signals < waiters))
|
||||
return; // unblocked pre-wait thread
|
||||
if ((state & kStackMask) == kStackMask) return;
|
||||
Waiter* w = &waiters_[state & kStackMask];
|
||||
if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
|
||||
Unpark(w);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class Waiter {
|
||||
friend class EventCount;
|
||||
// Align to 128 byte boundary to prevent false sharing with other Waiter
|
||||
// objects in the same vector.
|
||||
EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
|
||||
std::mutex mu;
|
||||
std::condition_variable cv;
|
||||
uint64_t epoch = 0;
|
||||
unsigned state = kNotSignaled;
|
||||
enum {
|
||||
kNotSignaled,
|
||||
kWaiting,
|
||||
kSignaled,
|
||||
};
|
||||
};
|
||||
|
||||
private:
|
||||
// State_ layout:
|
||||
// - low kWaiterBits is a stack of waiters committed wait
|
||||
// (indexes in waiters_ array are used as stack elements,
|
||||
// kStackMask means empty stack).
|
||||
// - next kWaiterBits is count of waiters in prewait state.
|
||||
// - next kWaiterBits is count of pending signals.
|
||||
// - remaining bits are ABA counter for the stack.
|
||||
// (stored in Waiter node and incremented on push).
|
||||
static const uint64_t kWaiterBits = 14;
|
||||
static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
|
||||
static const uint64_t kWaiterShift = kWaiterBits;
|
||||
static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
|
||||
<< kWaiterShift;
|
||||
static const uint64_t kWaiterInc = 1ull << kWaiterShift;
|
||||
static const uint64_t kSignalShift = 2 * kWaiterBits;
|
||||
static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
|
||||
<< kSignalShift;
|
||||
static const uint64_t kSignalInc = 1ull << kSignalShift;
|
||||
static const uint64_t kEpochShift = 3 * kWaiterBits;
|
||||
static const uint64_t kEpochBits = 64 - kEpochShift;
|
||||
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
|
||||
static const uint64_t kEpochInc = 1ull << kEpochShift;
|
||||
std::atomic<uint64_t> state_;
|
||||
MaxSizeVector<Waiter>& waiters_;
|
||||
|
||||
static void CheckState(uint64_t state, bool waiter = false) {
|
||||
static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
|
||||
const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
|
||||
const uint64_t signals = (state & kSignalMask) >> kSignalShift;
|
||||
eigen_plain_assert(waiters >= signals);
|
||||
eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
|
||||
eigen_plain_assert(!waiter || waiters > 0);
|
||||
(void)waiters;
|
||||
(void)signals;
|
||||
}
|
||||
|
||||
void Park(Waiter* w) {
|
||||
std::unique_lock<std::mutex> lock(w->mu);
|
||||
while (w->state != Waiter::kSignaled) {
|
||||
w->state = Waiter::kWaiting;
|
||||
w->cv.wait(lock);
|
||||
}
|
||||
}
|
||||
|
||||
void Unpark(Waiter* w) {
|
||||
for (Waiter* next; w; w = next) {
|
||||
uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
|
||||
next = wnext == kStackMask ? nullptr : &waiters_[wnext];
|
||||
unsigned state;
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(w->mu);
|
||||
state = w->state;
|
||||
w->state = Waiter::kSignaled;
|
||||
}
|
||||
// Avoid notifying if it wasn't waiting.
|
||||
if (state == Waiter::kWaiting) w->cv.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
EventCount(const EventCount&) = delete;
|
||||
void operator=(const EventCount&) = delete;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
|
||||
3
Eigen/src/ThreadPool/InternalHeaderCheck.h
Normal file
3
Eigen/src/ThreadPool/InternalHeaderCheck.h
Normal file
@@ -0,0 +1,3 @@
|
||||
#ifndef EIGEN_THREADPOOL_MODULE_H
|
||||
#error "Please include unsupported/Eigen/CXX11/ThreadPool instead of including headers inside the src directory directly."
|
||||
#endif
|
||||
488
Eigen/src/ThreadPool/NonBlockingThreadPool.h
Normal file
488
Eigen/src/ThreadPool/NonBlockingThreadPool.h
Normal file
@@ -0,0 +1,488 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
||||
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template <typename Environment>
|
||||
class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
|
||||
public:
|
||||
typedef typename Environment::Task Task;
|
||||
typedef RunQueue<Task, 1024> Queue;
|
||||
|
||||
ThreadPoolTempl(int num_threads, Environment env = Environment())
|
||||
: ThreadPoolTempl(num_threads, true, env) {}
|
||||
|
||||
ThreadPoolTempl(int num_threads, bool allow_spinning,
|
||||
Environment env = Environment())
|
||||
: env_(env),
|
||||
num_threads_(num_threads),
|
||||
allow_spinning_(allow_spinning),
|
||||
thread_data_(num_threads),
|
||||
all_coprimes_(num_threads),
|
||||
waiters_(num_threads),
|
||||
global_steal_partition_(EncodePartition(0, num_threads_)),
|
||||
blocked_(0),
|
||||
spinning_(0),
|
||||
done_(false),
|
||||
cancelled_(false),
|
||||
ec_(waiters_) {
|
||||
waiters_.resize(num_threads_);
|
||||
// Calculate coprimes of all numbers [1, num_threads].
|
||||
// Coprimes are used for random walks over all threads in Steal
|
||||
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
|
||||
// a random starting thread index t and calculate num_threads - 1 subsequent
|
||||
// indices as (t + coprime) % num_threads, we will cover all threads without
|
||||
// repetitions (effectively getting a presudo-random permutation of thread
|
||||
// indices).
|
||||
eigen_plain_assert(num_threads_ < kMaxThreads);
|
||||
for (int i = 1; i <= num_threads_; ++i) {
|
||||
all_coprimes_.emplace_back(i);
|
||||
ComputeCoprimes(i, &all_coprimes_.back());
|
||||
}
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
init_barrier_.reset(new Barrier(num_threads_));
|
||||
#endif
|
||||
thread_data_.resize(num_threads_);
|
||||
for (int i = 0; i < num_threads_; i++) {
|
||||
SetStealPartition(i, EncodePartition(0, num_threads_));
|
||||
thread_data_[i].thread.reset(
|
||||
env_.CreateThread([this, i]() { WorkerLoop(i); }));
|
||||
}
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
// Wait for workers to initialize per_thread_map_. Otherwise we might race
|
||||
// with them in Schedule or CurrentThreadId.
|
||||
init_barrier_->Wait();
|
||||
#endif
|
||||
}
|
||||
|
||||
~ThreadPoolTempl() {
|
||||
done_ = true;
|
||||
|
||||
// Now if all threads block without work, they will start exiting.
|
||||
// But note that threads can continue to work arbitrary long,
|
||||
// block, submit new work, unblock and otherwise live full life.
|
||||
if (!cancelled_) {
|
||||
ec_.Notify(true);
|
||||
} else {
|
||||
// Since we were cancelled, there might be entries in the queues.
|
||||
// Empty them to prevent their destructor from asserting.
|
||||
for (size_t i = 0; i < thread_data_.size(); i++) {
|
||||
thread_data_[i].queue.Flush();
|
||||
}
|
||||
}
|
||||
// Join threads explicitly (by destroying) to avoid destruction order within
|
||||
// this class.
|
||||
for (size_t i = 0; i < thread_data_.size(); ++i)
|
||||
thread_data_[i].thread.reset();
|
||||
}
|
||||
|
||||
void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
|
||||
eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
|
||||
|
||||
// Pass this information to each thread queue.
|
||||
for (int i = 0; i < num_threads_; i++) {
|
||||
const auto& pair = partitions[i];
|
||||
unsigned start = pair.first, end = pair.second;
|
||||
AssertBounds(start, end);
|
||||
unsigned val = EncodePartition(start, end);
|
||||
SetStealPartition(i, val);
|
||||
}
|
||||
}
|
||||
|
||||
void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
|
||||
ScheduleWithHint(std::move(fn), 0, num_threads_);
|
||||
}
|
||||
|
||||
void ScheduleWithHint(std::function<void()> fn, int start,
|
||||
int limit) override {
|
||||
Task t = env_.CreateTask(std::move(fn));
|
||||
PerThread* pt = GetPerThread();
|
||||
if (pt->pool == this) {
|
||||
// Worker thread of this pool, push onto the thread's queue.
|
||||
Queue& q = thread_data_[pt->thread_id].queue;
|
||||
t = q.PushFront(std::move(t));
|
||||
} else {
|
||||
// A free-standing thread (or worker of another pool), push onto a random
|
||||
// queue.
|
||||
eigen_plain_assert(start < limit);
|
||||
eigen_plain_assert(limit <= num_threads_);
|
||||
int num_queues = limit - start;
|
||||
int rnd = Rand(&pt->rand) % num_queues;
|
||||
eigen_plain_assert(start + rnd < limit);
|
||||
Queue& q = thread_data_[start + rnd].queue;
|
||||
t = q.PushBack(std::move(t));
|
||||
}
|
||||
// Note: below we touch this after making w available to worker threads.
|
||||
// Strictly speaking, this can lead to a racy-use-after-free. Consider that
|
||||
// Schedule is called from a thread that is neither main thread nor a worker
|
||||
// thread of this pool. Then, execution of w directly or indirectly
|
||||
// completes overall computations, which in turn leads to destruction of
|
||||
// this. We expect that such scenario is prevented by program, that is,
|
||||
// this is kept alive while any threads can potentially be in Schedule.
|
||||
if (!t.f) {
|
||||
ec_.Notify(false);
|
||||
} else {
|
||||
env_.ExecuteTask(t); // Push failed, execute directly.
|
||||
}
|
||||
}
|
||||
|
||||
void Cancel() EIGEN_OVERRIDE {
|
||||
cancelled_ = true;
|
||||
done_ = true;
|
||||
|
||||
// Let each thread know it's been cancelled.
|
||||
#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
|
||||
for (size_t i = 0; i < thread_data_.size(); i++) {
|
||||
thread_data_[i].thread->OnCancel();
|
||||
}
|
||||
#endif
|
||||
|
||||
// Wake up the threads without work to let them exit on their own.
|
||||
ec_.Notify(true);
|
||||
}
|
||||
|
||||
int NumThreads() const EIGEN_FINAL { return num_threads_; }
|
||||
|
||||
int CurrentThreadId() const EIGEN_FINAL {
|
||||
const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
|
||||
if (pt->pool == this) {
|
||||
return pt->thread_id;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Create a single atomic<int> that encodes start and limit information for
|
||||
// each thread.
|
||||
// We expect num_threads_ < 65536, so we can store them in a single
|
||||
// std::atomic<unsigned>.
|
||||
// Exposed publicly as static functions so that external callers can reuse
|
||||
// this encode/decode logic for maintaining their own thread-safe copies of
|
||||
// scheduling and steal domain(s).
|
||||
static const int kMaxPartitionBits = 16;
|
||||
static const int kMaxThreads = 1 << kMaxPartitionBits;
|
||||
|
||||
inline unsigned EncodePartition(unsigned start, unsigned limit) {
|
||||
return (start << kMaxPartitionBits) | limit;
|
||||
}
|
||||
|
||||
inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
|
||||
*limit = val & (kMaxThreads - 1);
|
||||
val >>= kMaxPartitionBits;
|
||||
*start = val;
|
||||
}
|
||||
|
||||
void AssertBounds(int start, int end) {
|
||||
eigen_plain_assert(start >= 0);
|
||||
eigen_plain_assert(start < end); // non-zero sized partition
|
||||
eigen_plain_assert(end <= num_threads_);
|
||||
}
|
||||
|
||||
inline void SetStealPartition(size_t i, unsigned val) {
|
||||
thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
inline unsigned GetStealPartition(int i) {
|
||||
return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
|
||||
for (int i = 1; i <= N; i++) {
|
||||
unsigned a = i;
|
||||
unsigned b = N;
|
||||
// If GCD(a, b) == 1, then a and b are coprimes.
|
||||
while (b != 0) {
|
||||
unsigned tmp = a;
|
||||
a = b;
|
||||
b = tmp % b;
|
||||
}
|
||||
if (a == 1) {
|
||||
coprimes->push_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename Environment::EnvThread Thread;
|
||||
|
||||
struct PerThread {
|
||||
constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
|
||||
ThreadPoolTempl* pool; // Parent pool, or null for normal threads.
|
||||
uint64_t rand; // Random generator state.
|
||||
int thread_id; // Worker thread index in pool.
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
// Prevent false sharing.
|
||||
char pad_[128];
|
||||
#endif
|
||||
};
|
||||
|
||||
struct ThreadData {
|
||||
constexpr ThreadData() : thread(), steal_partition(0), queue() {}
|
||||
std::unique_ptr<Thread> thread;
|
||||
std::atomic<unsigned> steal_partition;
|
||||
Queue queue;
|
||||
};
|
||||
|
||||
Environment env_;
|
||||
const int num_threads_;
|
||||
const bool allow_spinning_;
|
||||
MaxSizeVector<ThreadData> thread_data_;
|
||||
MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
|
||||
MaxSizeVector<EventCount::Waiter> waiters_;
|
||||
unsigned global_steal_partition_;
|
||||
std::atomic<unsigned> blocked_;
|
||||
std::atomic<bool> spinning_;
|
||||
std::atomic<bool> done_;
|
||||
std::atomic<bool> cancelled_;
|
||||
EventCount ec_;
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
std::unique_ptr<Barrier> init_barrier_;
|
||||
std::mutex per_thread_map_mutex_; // Protects per_thread_map_.
|
||||
std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
|
||||
#endif
|
||||
|
||||
// Main worker thread loop.
|
||||
void WorkerLoop(int thread_id) {
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
std::unique_ptr<PerThread> new_pt(new PerThread());
|
||||
per_thread_map_mutex_.lock();
|
||||
bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
|
||||
eigen_plain_assert(insertOK);
|
||||
EIGEN_UNUSED_VARIABLE(insertOK);
|
||||
per_thread_map_mutex_.unlock();
|
||||
init_barrier_->Notify();
|
||||
init_barrier_->Wait();
|
||||
#endif
|
||||
PerThread* pt = GetPerThread();
|
||||
pt->pool = this;
|
||||
pt->rand = GlobalThreadIdHash();
|
||||
pt->thread_id = thread_id;
|
||||
Queue& q = thread_data_[thread_id].queue;
|
||||
EventCount::Waiter* waiter = &waiters_[thread_id];
|
||||
// TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
|
||||
// proportional to num_threads_ and we assume that new work is scheduled at
|
||||
// a constant rate, so we set spin_count to 5000 / num_threads_. The
|
||||
// constant was picked based on a fair dice roll, tune it.
|
||||
const int spin_count =
|
||||
allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
|
||||
if (num_threads_ == 1) {
|
||||
// For num_threads_ == 1 there is no point in going through the expensive
|
||||
// steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
|
||||
// victim queues it might reverse the order in which ops are executed
|
||||
// compared to the order in which they are scheduled, which tends to be
|
||||
// counter-productive for the types of I/O workloads the single thread
|
||||
// pools tend to be used for.
|
||||
while (!cancelled_) {
|
||||
Task t = q.PopFront();
|
||||
for (int i = 0; i < spin_count && !t.f; i++) {
|
||||
if (!cancelled_.load(std::memory_order_relaxed)) {
|
||||
t = q.PopFront();
|
||||
}
|
||||
}
|
||||
if (!t.f) {
|
||||
if (!WaitForWork(waiter, &t)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (t.f) {
|
||||
env_.ExecuteTask(t);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while (!cancelled_) {
|
||||
Task t = q.PopFront();
|
||||
if (!t.f) {
|
||||
t = LocalSteal();
|
||||
if (!t.f) {
|
||||
t = GlobalSteal();
|
||||
if (!t.f) {
|
||||
// Leave one thread spinning. This reduces latency.
|
||||
if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
|
||||
for (int i = 0; i < spin_count && !t.f; i++) {
|
||||
if (!cancelled_.load(std::memory_order_relaxed)) {
|
||||
t = GlobalSteal();
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
spinning_ = false;
|
||||
}
|
||||
if (!t.f) {
|
||||
if (!WaitForWork(waiter, &t)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (t.f) {
|
||||
env_.ExecuteTask(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Steal tries to steal work from other worker threads in the range [start,
|
||||
// limit) in best-effort manner.
|
||||
Task Steal(unsigned start, unsigned limit) {
|
||||
PerThread* pt = GetPerThread();
|
||||
const size_t size = limit - start;
|
||||
unsigned r = Rand(&pt->rand);
|
||||
// Reduce r into [0, size) range, this utilizes trick from
|
||||
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
|
||||
eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
|
||||
unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
|
||||
unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
|
||||
unsigned inc = all_coprimes_[size - 1][index];
|
||||
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
eigen_plain_assert(start + victim < limit);
|
||||
Task t = thread_data_[start + victim].queue.PopBack();
|
||||
if (t.f) {
|
||||
return t;
|
||||
}
|
||||
victim += inc;
|
||||
if (victim >= size) {
|
||||
victim -= size;
|
||||
}
|
||||
}
|
||||
return Task();
|
||||
}
|
||||
|
||||
// Steals work within threads belonging to the partition.
|
||||
Task LocalSteal() {
|
||||
PerThread* pt = GetPerThread();
|
||||
unsigned partition = GetStealPartition(pt->thread_id);
|
||||
// If thread steal partition is the same as global partition, there is no
|
||||
// need to go through the steal loop twice.
|
||||
if (global_steal_partition_ == partition) return Task();
|
||||
unsigned start, limit;
|
||||
DecodePartition(partition, &start, &limit);
|
||||
AssertBounds(start, limit);
|
||||
|
||||
return Steal(start, limit);
|
||||
}
|
||||
|
||||
// Steals work from any other thread in the pool.
|
||||
Task GlobalSteal() {
|
||||
return Steal(0, num_threads_);
|
||||
}
|
||||
|
||||
|
||||
// WaitForWork blocks until new work is available (returns true), or if it is
|
||||
// time to exit (returns false). Can optionally return a task to execute in t
|
||||
// (in such case t.f != nullptr on return).
|
||||
bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
|
||||
eigen_plain_assert(!t->f);
|
||||
// We already did best-effort emptiness check in Steal, so prepare for
|
||||
// blocking.
|
||||
ec_.Prewait();
|
||||
// Now do a reliable emptiness check.
|
||||
int victim = NonEmptyQueueIndex();
|
||||
if (victim != -1) {
|
||||
ec_.CancelWait();
|
||||
if (cancelled_) {
|
||||
return false;
|
||||
} else {
|
||||
*t = thread_data_[victim].queue.PopBack();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Number of blocked threads is used as termination condition.
|
||||
// If we are shutting down and all worker threads blocked without work,
|
||||
// that's we are done.
|
||||
blocked_++;
|
||||
// TODO is blocked_ required to be unsigned?
|
||||
if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
|
||||
ec_.CancelWait();
|
||||
// Almost done, but need to re-check queues.
|
||||
// Consider that all queues are empty and all worker threads are preempted
|
||||
// right after incrementing blocked_ above. Now a free-standing thread
|
||||
// submits work and calls destructor (which sets done_). If we don't
|
||||
// re-check queues, we will exit leaving the work unexecuted.
|
||||
if (NonEmptyQueueIndex() != -1) {
|
||||
// Note: we must not pop from queues before we decrement blocked_,
|
||||
// otherwise the following scenario is possible. Consider that instead
|
||||
// of checking for emptiness we popped the only element from queues.
|
||||
// Now other worker threads can start exiting, which is bad if the
|
||||
// work item submits other work. So we just check emptiness here,
|
||||
// which ensures that all worker threads exit at the same time.
|
||||
blocked_--;
|
||||
return true;
|
||||
}
|
||||
// Reached stable termination state.
|
||||
ec_.Notify(true);
|
||||
return false;
|
||||
}
|
||||
ec_.CommitWait(waiter);
|
||||
blocked_--;
|
||||
return true;
|
||||
}
|
||||
|
||||
int NonEmptyQueueIndex() {
|
||||
PerThread* pt = GetPerThread();
|
||||
// We intentionally design NonEmptyQueueIndex to steal work from
|
||||
// anywhere in the queue so threads don't block in WaitForWork() forever
|
||||
// when all threads in their partition go to sleep. Steal is still local.
|
||||
const size_t size = thread_data_.size();
|
||||
unsigned r = Rand(&pt->rand);
|
||||
unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
|
||||
unsigned victim = r % size;
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
if (!thread_data_[victim].queue.Empty()) {
|
||||
return victim;
|
||||
}
|
||||
victim += inc;
|
||||
if (victim >= size) {
|
||||
victim -= size;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
|
||||
return std::hash<std::thread::id>()(std::this_thread::get_id());
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE PerThread* GetPerThread() {
|
||||
#ifndef EIGEN_THREAD_LOCAL
|
||||
static PerThread dummy;
|
||||
auto it = per_thread_map_.find(GlobalThreadIdHash());
|
||||
if (it == per_thread_map_.end()) {
|
||||
return &dummy;
|
||||
} else {
|
||||
return it->second.get();
|
||||
}
|
||||
#else
|
||||
EIGEN_THREAD_LOCAL PerThread per_thread_;
|
||||
PerThread* pt = &per_thread_;
|
||||
return pt;
|
||||
#endif
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
|
||||
uint64_t current = *state;
|
||||
// Update the internal state
|
||||
*state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
|
||||
// Generate the random output (using the PCG-XSH-RS scheme)
|
||||
return static_cast<unsigned>((current ^ (current >> 22)) >>
|
||||
(22 + (current >> 61)));
|
||||
}
|
||||
};
|
||||
|
||||
typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
|
||||
238
Eigen/src/ThreadPool/RunQueue.h
Normal file
238
Eigen/src/ThreadPool/RunQueue.h
Normal file
@@ -0,0 +1,238 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
|
||||
#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
|
||||
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// RunQueue is a fixed-size, partially non-blocking deque of Work items.
|
||||
// Operations on front of the queue must be done by a single thread (owner),
|
||||
// operations on back of the queue can be done by multiple threads concurrently.
|
||||
//
|
||||
// Algorithm outline:
|
||||
// All remote threads operating on the queue back are serialized by a mutex.
|
||||
// This ensures that at most two threads access state: owner and one remote
|
||||
// thread (Size aside). The algorithm ensures that the occupied region of the
|
||||
// underlying array is logically continuous (can wraparound, but no stray
|
||||
// occupied elements). Owner operates on one end of this region, remote thread
|
||||
// operates on the other end. Synchronization between these threads
|
||||
// (potential consumption of the last element and take up of the last empty
|
||||
// element) happens by means of state variable in each element. States are:
|
||||
// empty, busy (in process of insertion or removal) and ready. Threads claim
|
||||
// elements (empty->busy and ready->busy transitions) by means of a CAS
|
||||
// operation. The finishing transition (busy->empty and busy->ready) are done
|
||||
// with plain store as the element is exclusively owned by the current thread.
|
||||
//
|
||||
// Note: we could permit only pointers as elements, then we would not need
|
||||
// separate state variable as null/non-null pointer value would serve as state,
|
||||
// but that would require malloc/free per operation for large, complex values
|
||||
// (and this is designed to store std::function<()>).
|
||||
template <typename Work, unsigned kSize>
|
||||
class RunQueue {
|
||||
public:
|
||||
RunQueue() : front_(0), back_(0) {
|
||||
// require power-of-two for fast masking
|
||||
eigen_plain_assert((kSize & (kSize - 1)) == 0);
|
||||
eigen_plain_assert(kSize > 2); // why would you do this?
|
||||
eigen_plain_assert(kSize <= (64 << 10)); // leave enough space for counter
|
||||
for (unsigned i = 0; i < kSize; i++)
|
||||
array_[i].state.store(kEmpty, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
~RunQueue() { eigen_plain_assert(Size() == 0); }
|
||||
|
||||
// PushFront inserts w at the beginning of the queue.
|
||||
// If queue is full returns w, otherwise returns default-constructed Work.
|
||||
Work PushFront(Work w) {
|
||||
unsigned front = front_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[front & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kEmpty ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return w;
|
||||
front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
|
||||
e->w = std::move(w);
|
||||
e->state.store(kReady, std::memory_order_release);
|
||||
return Work();
|
||||
}
|
||||
|
||||
// PopFront removes and returns the first element in the queue.
|
||||
// If the queue was empty returns default-constructed Work.
|
||||
Work PopFront() {
|
||||
unsigned front = front_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[(front - 1) & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kReady ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return Work();
|
||||
Work w = std::move(e->w);
|
||||
e->state.store(kEmpty, std::memory_order_release);
|
||||
front = ((front - 1) & kMask2) | (front & ~kMask2);
|
||||
front_.store(front, std::memory_order_relaxed);
|
||||
return w;
|
||||
}
|
||||
|
||||
// PushBack adds w at the end of the queue.
|
||||
// If queue is full returns w, otherwise returns default-constructed Work.
|
||||
Work PushBack(Work w) {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
unsigned back = back_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[(back - 1) & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kEmpty ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return w;
|
||||
back = ((back - 1) & kMask2) | (back & ~kMask2);
|
||||
back_.store(back, std::memory_order_relaxed);
|
||||
e->w = std::move(w);
|
||||
e->state.store(kReady, std::memory_order_release);
|
||||
return Work();
|
||||
}
|
||||
|
||||
// PopBack removes and returns the last elements in the queue.
|
||||
Work PopBack() {
|
||||
if (Empty()) return Work();
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
unsigned back = back_.load(std::memory_order_relaxed);
|
||||
Elem* e = &array_[back & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (s != kReady ||
|
||||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
|
||||
return Work();
|
||||
Work w = std::move(e->w);
|
||||
e->state.store(kEmpty, std::memory_order_release);
|
||||
back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
|
||||
return w;
|
||||
}
|
||||
|
||||
// PopBackHalf removes and returns half last elements in the queue.
|
||||
// Returns number of elements removed.
|
||||
unsigned PopBackHalf(std::vector<Work>* result) {
|
||||
if (Empty()) return 0;
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
unsigned back = back_.load(std::memory_order_relaxed);
|
||||
unsigned size = Size();
|
||||
unsigned mid = back;
|
||||
if (size > 1) mid = back + (size - 1) / 2;
|
||||
unsigned n = 0;
|
||||
unsigned start = 0;
|
||||
for (; static_cast<int>(mid - back) >= 0; mid--) {
|
||||
Elem* e = &array_[mid & kMask];
|
||||
uint8_t s = e->state.load(std::memory_order_relaxed);
|
||||
if (n == 0) {
|
||||
if (s != kReady || !e->state.compare_exchange_strong(
|
||||
s, kBusy, std::memory_order_acquire))
|
||||
continue;
|
||||
start = mid;
|
||||
} else {
|
||||
// Note: no need to store temporal kBusy, we exclusively own these
|
||||
// elements.
|
||||
eigen_plain_assert(s == kReady);
|
||||
}
|
||||
result->push_back(std::move(e->w));
|
||||
e->state.store(kEmpty, std::memory_order_release);
|
||||
n++;
|
||||
}
|
||||
if (n != 0)
|
||||
back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
|
||||
return n;
|
||||
}
|
||||
|
||||
// Size returns current queue size.
|
||||
// Can be called by any thread at any time.
|
||||
unsigned Size() const { return SizeOrNotEmpty<true>(); }
|
||||
|
||||
// Empty tests whether container is empty.
|
||||
// Can be called by any thread at any time.
|
||||
bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
|
||||
|
||||
// Delete all the elements from the queue.
|
||||
void Flush() {
|
||||
while (!Empty()) {
|
||||
PopFront();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static const unsigned kMask = kSize - 1;
|
||||
static const unsigned kMask2 = (kSize << 1) - 1;
|
||||
struct Elem {
|
||||
std::atomic<uint8_t> state;
|
||||
Work w;
|
||||
};
|
||||
enum {
|
||||
kEmpty,
|
||||
kBusy,
|
||||
kReady,
|
||||
};
|
||||
std::mutex mutex_;
|
||||
// Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
|
||||
// front/back, respectively. The remaining bits contain modification counters
|
||||
// that are incremented on Push operations. This allows us to (1) distinguish
|
||||
// between empty and full conditions (if we would use log(kSize) bits for
|
||||
// position, these conditions would be indistinguishable); (2) obtain
|
||||
// consistent snapshot of front_/back_ for Size operation using the
|
||||
// modification counters.
|
||||
std::atomic<unsigned> front_;
|
||||
std::atomic<unsigned> back_;
|
||||
Elem array_[kSize];
|
||||
|
||||
// SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
|
||||
// only whether the size is 0 is guaranteed to be correct.
|
||||
// Can be called by any thread at any time.
|
||||
template<bool NeedSizeEstimate>
|
||||
unsigned SizeOrNotEmpty() const {
|
||||
// Emptiness plays critical role in thread pool blocking. So we go to great
|
||||
// effort to not produce false positives (claim non-empty queue as empty).
|
||||
unsigned front = front_.load(std::memory_order_acquire);
|
||||
for (;;) {
|
||||
// Capture a consistent snapshot of front/tail.
|
||||
unsigned back = back_.load(std::memory_order_acquire);
|
||||
unsigned front1 = front_.load(std::memory_order_relaxed);
|
||||
if (front != front1) {
|
||||
front = front1;
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
continue;
|
||||
}
|
||||
if (NeedSizeEstimate) {
|
||||
return CalculateSize(front, back);
|
||||
} else {
|
||||
// This value will be 0 if the queue is empty, and undefined otherwise.
|
||||
unsigned maybe_zero = ((front ^ back) & kMask2);
|
||||
// Queue size estimate must agree with maybe zero check on the queue
|
||||
// empty/non-empty state.
|
||||
eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
|
||||
return maybe_zero;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE
|
||||
unsigned CalculateSize(unsigned front, unsigned back) const {
|
||||
int size = (front & kMask2) - (back & kMask2);
|
||||
// Fix overflow.
|
||||
if (size < 0) size += 2 * kSize;
|
||||
// Order of modification in push/pop is crafted to make the queue look
|
||||
// larger than it is during concurrent modifications. E.g. push can
|
||||
// increment size before the corresponding pop has decremented it.
|
||||
// So the computed size can be up to kSize + 1, fix it.
|
||||
if (size > static_cast<int>(kSize)) size = kSize;
|
||||
return static_cast<unsigned>(size);
|
||||
}
|
||||
|
||||
RunQueue(const RunQueue&) = delete;
|
||||
void operator=(const RunQueue&) = delete;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
|
||||
23
Eigen/src/ThreadPool/ThreadCancel.h
Normal file
23
Eigen/src/ThreadPool/ThreadCancel.h
Normal file
@@ -0,0 +1,23 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
|
||||
|
||||
// Try to come up with a portable way to cancel a thread
|
||||
#if EIGEN_OS_GNULINUX
// GNU/Linux: cancel through pthread_cancel on the thread's native handle.
// Note that the expansion already ends with a semicolon.
#define EIGEN_THREAD_CANCEL(t) pthread_cancel(t.native_handle());
// Defined only when EIGEN_THREAD_CANCEL actually does something.
#define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
#else
// Other platforms: cancellation is not available, the macro expands to nothing.
#define EIGEN_THREAD_CANCEL(t)
#endif
|
||||
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
|
||||
42
Eigen/src/ThreadPool/ThreadEnvironment.h
Normal file
42
Eigen/src/ThreadPool/ThreadEnvironment.h
Normal file
@@ -0,0 +1,42 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
|
||||
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// Thread environment built directly on std::thread and std::function.
struct StlThreadEnvironment {
  // Unit of work: wraps the callable that ExecuteTask() invokes.
  struct Task {
    std::function<void()> f;
  };

  // Owns one std::thread. The thread is started by the constructor and joined
  // by the destructor.
  class EnvThread {
    std::thread thr_;

   public:
    EnvThread(std::function<void()> entry) : thr_(std::move(entry)) {}
    ~EnvThread() { thr_.join(); }
    // Called when the thread pool is cancelled; nothing to do here.
    void OnCancel() {}
  };

  // Starts a new thread running `entry`. The returned object is allocated
  // with `new`.
  EnvThread* CreateThread(std::function<void()> entry) {
    EnvThread* const worker = new EnvThread(std::move(entry));
    return worker;
  }

  // Wraps `work` into a Task.
  Task CreateTask(std::function<void()> work) {
    Task task;
    task.f = std::move(work);
    return task;
  }

  // Runs the callable stored in `t`.
  void ExecuteTask(const Task& t) {
    const std::function<void()>& body = t.f;
    body();
  }
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
|
||||
299
Eigen/src/ThreadPool/ThreadLocal.h
Normal file
299
Eigen/src/ThreadPool/ThreadLocal.h
Normal file
@@ -0,0 +1,299 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
||||
|
||||
#ifdef EIGEN_AVOID_THREAD_LOCAL
|
||||
|
||||
#ifdef EIGEN_THREAD_LOCAL
|
||||
#undef EIGEN_THREAD_LOCAL
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if ((EIGEN_COMP_GNUC) || __has_feature(cxx_thread_local) || EIGEN_COMP_MSVC )
|
||||
#define EIGEN_THREAD_LOCAL static thread_local
|
||||
#endif
|
||||
|
||||
// Disable TLS for Apple and Android builds with older toolchains.
|
||||
#if defined(__APPLE__)
|
||||
// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
|
||||
// __IPHONE_8_0.
|
||||
#include <Availability.h>
|
||||
#include <TargetConditionals.h>
|
||||
#endif
|
||||
// Checks whether C++11's `thread_local` storage duration specifier is
|
||||
// supported.
|
||||
#if EIGEN_COMP_CLANGAPPLE && ((EIGEN_COMP_CLANGAPPLE < 8000042) || \
|
||||
(TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
|
||||
// Notes: Xcode's clang did not support `thread_local` until version
|
||||
// 8, and even then not for all iOS < 9.0.
|
||||
#undef EIGEN_THREAD_LOCAL
|
||||
|
||||
#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
|
||||
// There are platforms for which TLS should not be used even though the compiler
|
||||
// makes it seem like it's supported (Android NDK < r12b for example).
|
||||
// This is primarily because of linker problems and toolchain misconfiguration:
|
||||
// TLS isn't supported until NDK r12b per
|
||||
// https://developer.android.com/ndk/downloads/revision_history.html
|
||||
// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
|
||||
// <android/ndk-version.h>. For NDK < r16, users should define these macros,
|
||||
// e.g. `-D__NDK_MAJOR__=11 -D__NDK_MINOR__=0` for NDK r11.
|
||||
#if __has_include(<android/ndk-version.h>)
|
||||
#include <android/ndk-version.h>
|
||||
#endif // __has_include(<android/ndk-version.h>)
|
||||
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
|
||||
defined(__NDK_MINOR__) && \
|
||||
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
|
||||
#undef EIGEN_THREAD_LOCAL
|
||||
#endif
|
||||
#endif // defined(__ANDROID__) && defined(__clang__)
|
||||
|
||||
#endif // EIGEN_AVOID_THREAD_LOCAL
|
||||
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {

// Default `Initialize` functor for ThreadLocal: leaves the value untouched.
template <class Value>
struct ThreadLocalNoOpInitialize {
  void operator()(Value& /*unused*/) const {}
};

// Default `Release` functor for ThreadLocal: leaves the value untouched.
template <class Value>
struct ThreadLocalNoOpRelease {
  void operator()(Value& /*unused*/) const {}
};

}  // namespace internal
|
||||
|
||||
// Thread local container for elements of type T, that does not use thread local
|
||||
// storage. As long as the number of unique threads accessing this storage
|
||||
// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
|
||||
// use a mutex for synchronization.
|
||||
//
|
||||
// Type `T` has to be default constructible, and by default each thread will get
|
||||
// a default constructed value. It is possible to specify custom `initialize`
|
||||
// callable, that will be called lazily from each thread accessing this object,
|
||||
// and will be passed a default initialized object of type `T`. Also it's
|
||||
// possible to pass a custom `release` callable, that will be invoked before
|
||||
// calling ~T().
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// struct Counter {
|
||||
// int value = 0;
|
||||
// };
|
||||
//
|
||||
// Eigen::ThreadLocal<Counter> counter(10);
|
||||
//
|
||||
// // Each thread will have access to its own counter object.
|
||||
// Counter& cnt = counter.local();
|
||||
// cnt.value++;
|
||||
//
|
||||
// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
|
||||
// std::this_thread::get_id() to identify threads. This value is not guaranteed
|
||||
// to be unique except for the life of the thread. A newly created thread may
|
||||
// get an OS-specific ID equal to that of an already destroyed thread.
|
||||
//
|
||||
// Somewhat similar to TBB thread local storage, with similar restrictions:
|
||||
// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
|
||||
//
|
||||
template <typename T,
|
||||
typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
|
||||
typename Release = internal::ThreadLocalNoOpRelease<T>>
|
||||
class ThreadLocal {
|
||||
// We preallocate default constructed elements in MaxSizedVector.
|
||||
static_assert(std::is_default_constructible<T>::value,
|
||||
"ThreadLocal data type must be default constructible");
|
||||
|
||||
public:
|
||||
explicit ThreadLocal(int capacity)
|
||||
: ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
|
||||
internal::ThreadLocalNoOpRelease<T>()) {}
|
||||
|
||||
ThreadLocal(int capacity, Initialize initialize)
|
||||
: ThreadLocal(capacity, std::move(initialize),
|
||||
internal::ThreadLocalNoOpRelease<T>()) {}
|
||||
|
||||
ThreadLocal(int capacity, Initialize initialize, Release release)
|
||||
: initialize_(std::move(initialize)),
|
||||
release_(std::move(release)),
|
||||
capacity_(capacity),
|
||||
data_(capacity_),
|
||||
ptr_(capacity_),
|
||||
filled_records_(0) {
|
||||
eigen_assert(capacity_ >= 0);
|
||||
data_.resize(capacity_);
|
||||
for (int i = 0; i < capacity_; ++i) {
|
||||
ptr_.emplace_back(nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
T& local() {
|
||||
std::thread::id this_thread = std::this_thread::get_id();
|
||||
if (capacity_ == 0) return SpilledLocal(this_thread);
|
||||
|
||||
std::size_t h = std::hash<std::thread::id>()(this_thread);
|
||||
const int start_idx = h % capacity_;
|
||||
|
||||
// NOTE: From the definition of `std::this_thread::get_id()` it is
|
||||
// guaranteed that we never can have concurrent insertions with the same key
|
||||
// to our hash-map like data structure. If we didn't find an element during
|
||||
// the initial traversal, it's guaranteed that no one else could have
|
||||
// inserted it while we are in this function. This allows to massively
|
||||
// simplify out lock-free insert-only hash map.
|
||||
|
||||
// Check if we already have an element for `this_thread`.
|
||||
int idx = start_idx;
|
||||
while (ptr_[idx].load() != nullptr) {
|
||||
ThreadIdAndValue& record = *(ptr_[idx].load());
|
||||
if (record.thread_id == this_thread) return record.value;
|
||||
|
||||
idx += 1;
|
||||
if (idx >= capacity_) idx -= capacity_;
|
||||
if (idx == start_idx) break;
|
||||
}
|
||||
|
||||
// If we are here, it means that we found an insertion point in lookup
|
||||
// table at `idx`, or we did a full traversal and table is full.
|
||||
|
||||
// If lock-free storage is full, fallback on mutex.
|
||||
if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
|
||||
|
||||
// We double check that we still have space to insert an element into a lock
|
||||
// free storage. If old value in `filled_records_` is larger than the
|
||||
// records capacity, it means that some other thread added an element while
|
||||
// we were traversing lookup table.
|
||||
int insertion_index =
|
||||
filled_records_.fetch_add(1, std::memory_order_relaxed);
|
||||
if (insertion_index >= capacity_) return SpilledLocal(this_thread);
|
||||
|
||||
// At this point it's guaranteed that we can access to
|
||||
// data_[insertion_index_] without a data race.
|
||||
data_[insertion_index].thread_id = this_thread;
|
||||
initialize_(data_[insertion_index].value);
|
||||
|
||||
// That's the pointer we'll put into the lookup table.
|
||||
ThreadIdAndValue* inserted = &data_[insertion_index];
|
||||
|
||||
// We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
|
||||
ThreadIdAndValue* empty = nullptr;
|
||||
|
||||
// Now we have to find an insertion point into the lookup table. We start
|
||||
// from the `idx` that was identified as an insertion point above, it's
|
||||
// guaranteed that we will have an empty record somewhere in a lookup table
|
||||
// (because we created a record in the `data_`).
|
||||
const int insertion_idx = idx;
|
||||
|
||||
do {
|
||||
// Always start search from the original insertion candidate.
|
||||
idx = insertion_idx;
|
||||
while (ptr_[idx].load() != nullptr) {
|
||||
idx += 1;
|
||||
if (idx >= capacity_) idx -= capacity_;
|
||||
// If we did a full loop, it means that we don't have any free entries
|
||||
// in the lookup table, and this means that something is terribly wrong.
|
||||
eigen_assert(idx != insertion_idx);
|
||||
}
|
||||
// Atomic CAS of the pointer guarantees that any other thread, that will
|
||||
// follow this pointer will see all the mutations in the `data_`.
|
||||
} while (!ptr_[idx].compare_exchange_weak(empty, inserted));
|
||||
|
||||
return inserted->value;
|
||||
}
|
||||
|
||||
// WARN: It's not thread safe to call it concurrently with `local()`.
|
||||
void ForEach(std::function<void(std::thread::id, T&)> f) {
|
||||
// Reading directly from `data_` is unsafe, because only CAS to the
|
||||
// record in `ptr_` makes all changes visible to other threads.
|
||||
for (auto& ptr : ptr_) {
|
||||
ThreadIdAndValue* record = ptr.load();
|
||||
if (record == nullptr) continue;
|
||||
f(record->thread_id, record->value);
|
||||
}
|
||||
|
||||
// We did not spill into the map based storage.
|
||||
if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
|
||||
|
||||
// Adds a happens before edge from the last call to SpilledLocal().
|
||||
std::unique_lock<std::mutex> lock(mu_);
|
||||
for (auto& kv : per_thread_map_) {
|
||||
f(kv.first, kv.second);
|
||||
}
|
||||
}
|
||||
|
||||
// WARN: It's not thread safe to call it concurrently with `local()`.
|
||||
~ThreadLocal() {
|
||||
// Reading directly from `data_` is unsafe, because only CAS to the record
|
||||
// in `ptr_` makes all changes visible to other threads.
|
||||
for (auto& ptr : ptr_) {
|
||||
ThreadIdAndValue* record = ptr.load();
|
||||
if (record == nullptr) continue;
|
||||
release_(record->value);
|
||||
}
|
||||
|
||||
// We did not spill into the map based storage.
|
||||
if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
|
||||
|
||||
// Adds a happens before edge from the last call to SpilledLocal().
|
||||
std::unique_lock<std::mutex> lock(mu_);
|
||||
for (auto& kv : per_thread_map_) {
|
||||
release_(kv.second);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
struct ThreadIdAndValue {
|
||||
std::thread::id thread_id;
|
||||
T value;
|
||||
};
|
||||
|
||||
// Use unordered map guarded by a mutex when lock free storage is full.
|
||||
T& SpilledLocal(std::thread::id this_thread) {
|
||||
std::unique_lock<std::mutex> lock(mu_);
|
||||
|
||||
auto it = per_thread_map_.find(this_thread);
|
||||
if (it == per_thread_map_.end()) {
|
||||
auto result = per_thread_map_.emplace(this_thread, T());
|
||||
eigen_assert(result.second);
|
||||
initialize_((*result.first).second);
|
||||
return (*result.first).second;
|
||||
} else {
|
||||
return it->second;
|
||||
}
|
||||
}
|
||||
|
||||
Initialize initialize_;
|
||||
Release release_;
|
||||
const int capacity_;
|
||||
|
||||
// Storage that backs lock-free lookup table `ptr_`. Records stored in this
|
||||
// storage contiguously starting from index 0.
|
||||
MaxSizeVector<ThreadIdAndValue> data_;
|
||||
|
||||
// Atomic pointers to the data stored in `data_`. Used as a lookup table for
|
||||
// linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
|
||||
MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
|
||||
|
||||
// Number of records stored in the `data_`.
|
||||
std::atomic<int> filled_records_;
|
||||
|
||||
// We fallback on per thread map if lock-free storage is full. In practice
|
||||
// this should never happen, if `capacity_` is a reasonable estimate of the
|
||||
// number of threads running in a system.
|
||||
std::mutex mu_; // Protects per_thread_map_.
|
||||
std::unordered_map<std::thread::id, T> per_thread_map_;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
|
||||
50
Eigen/src/ThreadPool/ThreadPoolInterface.h
Normal file
50
Eigen/src/ThreadPool/ThreadPoolInterface.h
Normal file
@@ -0,0 +1,50 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
|
||||
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// This defines an interface that ThreadPoolDevice can take to use
|
||||
// custom thread pools underneath.
|
||||
// Abstract interface that lets ThreadPoolDevice run on a custom thread pool.
class ThreadPoolInterface {
 public:
  // Submits a closure to be run by a thread in the pool.
  virtual void Schedule(std::function<void()> fn) = 0;

  // Submits a closure to be run by threads in the range [start, end) in the
  // pool.
  virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
                                int /*end*/) {
    // Just defer to Schedule in case sub-classes aren't interested in
    // overriding this functionality. `fn` is our own by-value copy, so hand it
    // over by move instead of copying the closure a second time.
    Schedule(std::move(fn));
  }

  // If implemented, stop processing the closures that have been enqueued.
  // Currently running closures may still be processed.
  // If not implemented, does nothing.
  virtual void Cancel() {}

  // Returns the number of threads in the pool.
  virtual int NumThreads() const = 0;

  // Returns a logical thread index between 0 and NumThreads() - 1 if called
  // from one of the threads in the pool. Returns -1 otherwise.
  virtual int CurrentThreadId() const = 0;

  virtual ~ThreadPoolInterface() {}
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
|
||||
16
Eigen/src/ThreadPool/ThreadYield.h
Normal file
16
Eigen/src/ThreadPool/ThreadYield.h
Normal file
@@ -0,0 +1,16 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
|
||||
#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
|
||||
|
||||
// Try to come up with a portable way to yield
|
||||
// Expands to a call to std::this_thread::yield().
#define EIGEN_THREAD_YIELD() std::this_thread::yield()
|
||||
|
||||
#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
|
||||
Reference in New Issue
Block a user