Slightly simplify ForkJoin code, and make sure the test is actually run.

2026-04-10 11:34:33 +08:00 · 2025-02-25 17:22:43 +00:00
parent 6aebfa9acc
commit 72adf891d5
4 changed files with 69 additions and 88 deletions
--- a/Eigen/src/ThreadPool/ForkJoin.h
+++ b/Eigen/src/ThreadPool/ForkJoin.h
@@ -31,7 +31,7 @@ namespace Eigen {
 // where `s_{j+1} - s_{j}` and `end - s_n` are roughly within a factor of two of `granularity`. For a unary
 // task function `g(k)`, the same operation is applied with
 //
-//   f(i,j) = [&](){ for(int k = i; k < j; ++k) g(k); };
+//   f(i,j) = [&](){ for(Index k = i; k < j; ++k) g(k); };
 //
 // Note that the parameter `granularity` should be tuned by the user based on the trade-off of running the
 // given task function sequentially vs. scheduling individual tasks in parallel. An example of a partially
@@ -45,51 +45,50 @@ namespace Eigen {
 // ForkJoinScheduler::ParallelFor(0, num_tasks, granularity, std::move(parallel_task), &thread_pool);
 // ```
 //
-// Example usage #2 (asynchronous):
+// Example usage #2 (executing multiple tasks asynchronously, each one parallelized with ParallelFor):
 // ```
 // ThreadPool thread_pool(num_threads);
-// Barrier barrier(num_tasks * num_async_calls);
-// auto done = [&](){barrier.Notify();};
-// for (int k=0; k<num_async_calls; ++k) {
-//   thread_pool.Schedule([&](){
-//     ForkJoinScheduler::ParallelForAsync(0, num_tasks, granularity, parallel_task, done, &thread_pool);
-//   });
+// Barrier barrier(num_async_calls);
+// auto done = [&](){ barrier.Notify(); };
+// for (Index k=0; k<num_async_calls; ++k) {
+//   ForkJoinScheduler::ParallelForAsync(task_start[k], task_end[k], granularity[k], parallel_task[k], done,
+//   &thread_pool);
 // }
 // barrier.Wait();
 // ```
 class ForkJoinScheduler {
 public:
-  // Runs `do_func` asynchronously for the range [start, end) with a specified granularity. `do_func` should
-  // either be of type `std::function<void(int)>` or `std::function<void(int, int)`.
-  // If `end > start`, the `done` callback will be called `end - start` times when all tasks have been
-  // executed. Otherwise, `done` is called only once.
-  template <typename DoFnType>
-  static void ParallelForAsync(int start, int end, int granularity, DoFnType do_func, std::function<void()> done,
-                               Eigen::ThreadPool* thread_pool) {
+  // Runs `do_func` asynchronously for the range [start, end) with a specified
+  // granularity. `do_func` should be of type `std::function<void(Index,
+  // Index)`. `done()` is called exactly once after all tasks have been executed.
+  template <typename DoFnType, typename DoneFnType>
+  static void ParallelForAsync(Index start, Index end, Index granularity, DoFnType&& do_func, DoneFnType&& done,
+                               ThreadPool* thread_pool) {
    if (start >= end) {
      done();
      return;
    }
-    ForkJoinScheduler::RunParallelForAsync(start, end, granularity, do_func, done, thread_pool);
+    thread_pool->Schedule([start, end, granularity, thread_pool, do_func = std::forward<DoFnType>(do_func),
+                           done = std::forward<DoneFnType>(done)]() {
+      RunParallelFor(start, end, granularity, do_func, thread_pool);
+      done();
+    });
  }

  // Synchronous variant of ParallelForAsync.
  template <typename DoFnType>
-  static void ParallelFor(int start, int end, int granularity, DoFnType do_func, Eigen::ThreadPool* thread_pool) {
+  static void ParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
    if (start >= end) return;
-    auto dummy_done = []() {};
    Barrier barrier(1);
-    thread_pool->Schedule([start, end, granularity, thread_pool, &do_func, &dummy_done, &barrier]() {
-      ForkJoinScheduler::ParallelForAsync(start, end, granularity, do_func, dummy_done, thread_pool);
-      barrier.Notify();
-    });
+    auto done = [&barrier]() { barrier.Notify(); };
+    ParallelForAsync(start, end, granularity, do_func, done, thread_pool);
    barrier.Wait();
  }

 private:
  // Schedules `right_thunk`, runs `left_thunk`, and runs other tasks until `right_thunk` has finished.
  template <typename LeftType, typename RightType>
-  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, Eigen::ThreadPool* thread_pool) {
+  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, ThreadPool* thread_pool) {
    std::atomic<bool> right_done(false);
    auto execute_right = [&right_thunk, &right_done]() {
      std::forward<RightType>(right_thunk)();
@@ -97,47 +96,38 @@ class ForkJoinScheduler {
    };
    thread_pool->Schedule(execute_right);
    std::forward<LeftType>(left_thunk)();
-    Eigen::ThreadPool::Task task;
+    ThreadPool::Task task;
    while (!right_done.load(std::memory_order_acquire)) {
      thread_pool->MaybeGetTask(&task);
      if (task.f) task.f();
    }
  }

-  // Runs `do_func` in parallel for the range [start, end). The main recursive asynchronous runner that
-  // calls `ForkJoin`.
-  static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int)>& do_func,
-                                  std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
-    std::function<void(int, int)> wrapped_do_func = [&do_func](int start, int end) {
-      for (int i = start; i < end; ++i) do_func(i);
-    };
-    ForkJoinScheduler::RunParallelForAsync(start, end, granularity, wrapped_do_func, done, thread_pool);
+  static Index ComputeMidpoint(Index start, Index end, Index granularity) {
+    // Typical workloads choose initial values of `{start, end, granularity}` such that `start - end` and
+    // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
+    // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
+    // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
+    // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
+    const Index size = end - start;
+    const Index offset = numext::round_down(9 * (size + 1) / 16, granularity);
+    return start + offset;
  }

-  // Variant of `RunAsyncParallelFor` that uses a do function that operates on an index range.
-  // Specifically, `do_func` takes two arguments: the start and end of the range.
-  static void RunParallelForAsync(int start, int end, int granularity, std::function<void(int, int)>& do_func,
-                                  std::function<void()>& done, Eigen::ThreadPool* thread_pool) {
-    if ((end - start) <= granularity) {
+  template <typename DoFnType>
+  static void RunParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func, ThreadPool* thread_pool) {
+    Index mid = ComputeMidpoint(start, end, granularity);
+    if ((end - start) < granularity || mid == start || mid == end) {
      do_func(start, end);
-      for (int j = 0; j < end - start; ++j) done();
-    } else {
-      // Typical workloads choose initial values of `{start, end, granularity}` such that `start - end` and
-      // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
-      // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
-      // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
-      // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
-      const int size = end - start;
-      const int mid = start + 9 * (size + 1) / 16;
-      ForkJoinScheduler::ForkJoin(
-          [start, mid, granularity, &do_func, &done, thread_pool]() {
-            RunParallelForAsync(start, mid, granularity, do_func, done, thread_pool);
-          },
-          [mid, end, granularity, &do_func, &done, thread_pool]() {
-            RunParallelForAsync(mid, end, granularity, do_func, done, thread_pool);
-          },
-          thread_pool);
+      return;
    }
+    ForkJoin([start, mid, granularity, &do_func, thread_pool]() {
+               RunParallelFor(start, mid, granularity, do_func, thread_pool);
+             },
+             [mid, end, granularity, &do_func, thread_pool]() {
+               RunParallelFor(mid, end, granularity, do_func, thread_pool);
+             },
+             thread_pool);
  }
 };

--- a/Eigen/src/ThreadPool/NonBlockingThreadPool.h
+++ b/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@@ -156,7 +156,10 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
  // Tries to assign work to the current task.
  void MaybeGetTask(Task* t) {
    PerThread* pt = GetPerThread();
-    Queue& q = thread_data_[pt->thread_id].queue;
+    const int thread_id = pt->thread_id;
+    // If we are not a worker thread of this pool, we can't get any work.
+    if (thread_id < 0) return;
+    Queue& q = thread_data_[thread_id].queue;
    *t = q.PopFront();
    if (t->f) return;
    if (num_threads_ == 1) {