Clang-format tests, examples, libraries, benchmarks, etc.

2026-04-10 11:34:33 +08:00 · 2023-12-05 21:22:55 +00:00
parent 3252ecc7a4
commit 46e9cdb7fe
876 changed files with 33453 additions and 37795 deletions
--- a/bench/benchmark-blocking-sizes.cpp
+++ b/bench/benchmark-blocking-sizes.cpp
@@ -59,14 +59,12 @@ static_assert(maxsize > minsize, "maxsize must be larger than minsize");
 static_assert(maxsize < (minsize << 16), "maxsize must be less than (minsize<<16)");

 // just a helper to store a triple of K,M,N sizes for matrix product
-struct size_triple_t
-{
+struct size_triple_t {
  size_t k, m, n;
  size_triple_t() : k(0), m(0), n(0) {}
  size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
  size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
-  size_triple_t(uint16_t compact)
-  {
+  size_triple_t(uint16_t compact) {
    k = 1 << ((compact & 0xf00) >> 8);
    m = 1 << ((compact & 0x0f0) >> 4);
    n = 1 << ((compact & 0x00f) >> 0);
@@ -82,50 +80,35 @@ uint8_t log2_pot(size_t x) {
 // Convert between size tripes and a compact form fitting in 12 bits
 // where each size, which must be a POT, is encoded as its log2, on 4 bits
 // so the largest representable size is 2^15 == 32k  ... big enough.
-uint16_t compact_size_triple(size_t k, size_t m, size_t n)
-{
+uint16_t compact_size_triple(size_t k, size_t m, size_t n) {
  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
 }

-uint16_t compact_size_triple(const size_triple_t& t)
-{
-  return compact_size_triple(t.k, t.m, t.n);
-}
+uint16_t compact_size_triple(const size_triple_t& t) { return compact_size_triple(t.k, t.m, t.n); }

 // A single benchmark. Initially only contains benchmark params.
 // Then call run(), which stores the result in the gflops field.
-struct benchmark_t
-{
+struct benchmark_t {
  uint16_t compact_product_size;
  uint16_t compact_block_size;
  bool use_default_block_size;
  float gflops;
-  benchmark_t()
-    : compact_product_size(0)
-    , compact_block_size(0)
-    , use_default_block_size(false)
-    , gflops(0)
-  {
-  }
-  benchmark_t(size_t pk, size_t pm, size_t pn,
-              size_t bk, size_t bm, size_t bn)
-    : compact_product_size(compact_size_triple(pk, pm, pn))
-    , compact_block_size(compact_size_triple(bk, bm, bn))
-    , use_default_block_size(false)
-    , gflops(0)
-  {}
+  benchmark_t() : compact_product_size(0), compact_block_size(0), use_default_block_size(false), gflops(0) {}
+  benchmark_t(size_t pk, size_t pm, size_t pn, size_t bk, size_t bm, size_t bn)
+      : compact_product_size(compact_size_triple(pk, pm, pn)),
+        compact_block_size(compact_size_triple(bk, bm, bn)),
+        use_default_block_size(false),
+        gflops(0) {}
  benchmark_t(size_t pk, size_t pm, size_t pn)
-    : compact_product_size(compact_size_triple(pk, pm, pn))
-    , compact_block_size(0)
-    , use_default_block_size(true)
-    , gflops(0)
-  {}
+      : compact_product_size(compact_size_triple(pk, pm, pn)),
+        compact_block_size(0),
+        use_default_block_size(true),
+        gflops(0) {}

  void run();
 };

-ostream& operator<<(ostream& s, const benchmark_t& b)
-{
+ostream& operator<<(ostream& s, const benchmark_t& b) {
  s << hex << b.compact_product_size << dec;
  if (b.use_default_block_size) {
    size_triple_t t(b.compact_product_size);
@@ -141,17 +124,14 @@ ostream& operator<<(ostream& s, const benchmark_t& b)

 // We sort first by increasing benchmark parameters,
 // then by decreasing performance.
-bool operator<(const benchmark_t& b1, const benchmark_t& b2)
-{ 
+bool operator<(const benchmark_t& b1, const benchmark_t& b2) {
  return b1.compact_product_size < b2.compact_product_size ||
-           (b1.compact_product_size == b2.compact_product_size && (
-             (b1.compact_block_size < b2.compact_block_size || (
-               b1.compact_block_size == b2.compact_block_size &&
-                 b1.gflops > b2.gflops))));
+         (b1.compact_product_size == b2.compact_product_size &&
+          ((b1.compact_block_size < b2.compact_block_size ||
+            (b1.compact_block_size == b2.compact_block_size && b1.gflops > b2.gflops))));
 }

-void benchmark_t::run()
-{
+void benchmark_t::run() {
  size_triple_t productsizes(compact_product_size);

  if (use_default_block_size) {
@@ -168,26 +148,22 @@ void benchmark_t::run()
  // set up the matrix pool

  const size_t combined_three_matrices_sizes =
-    sizeof(Scalar) *
-      (productsizes.k * productsizes.m +
-       productsizes.k * productsizes.n +
-       productsizes.m * productsizes.n);
+      sizeof(Scalar) *
+      (productsizes.k * productsizes.m + productsizes.k * productsizes.n + productsizes.m * productsizes.n);

  // 64 M is large enough that nobody has a cache bigger than that,
  // while still being small enough that everybody has this much RAM,
  // so conveniently we don't need to special-case platforms here.
  const size_t unlikely_large_cache_size = 64 << 20;

-  const size_t working_set_size =
-    min_working_set_size ? min_working_set_size : unlikely_large_cache_size;
+  const size_t working_set_size = min_working_set_size ? min_working_set_size : unlikely_large_cache_size;

-  const size_t matrix_pool_size =
-    1 + working_set_size / combined_three_matrices_sizes;
+  const size_t matrix_pool_size = 1 + working_set_size / combined_three_matrices_sizes;
+
+  MatrixType* lhs = new MatrixType[matrix_pool_size];
+  MatrixType* rhs = new MatrixType[matrix_pool_size];
+  MatrixType* dst = new MatrixType[matrix_pool_size];

-  MatrixType *lhs = new MatrixType[matrix_pool_size];
-  MatrixType *rhs = new MatrixType[matrix_pool_size];
-  MatrixType *dst = new MatrixType[matrix_pool_size];
-  
  for (size_t i = 0; i < matrix_pool_size; i++) {
    lhs[i] = MatrixType::Zero(productsizes.m, productsizes.k);
    rhs[i] = MatrixType::Zero(productsizes.k, productsizes.n);
@@ -200,7 +176,6 @@ void benchmark_t::run()
  float time_per_iter = 0.0f;
  size_t matrix_index = 0;
  while (true) {
-
    double starttime = timer.getCpuTime();
    for (int i = 0; i < iters_at_a_time; i++) {
      dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index];
@@ -228,8 +203,7 @@ void benchmark_t::run()
  gflops = 2e-9 * productsizes.k * productsizes.m * productsizes.n / time_per_iter;
 }

-void print_cpuinfo()
-{
+void print_cpuinfo() {
 #ifdef __linux__
  cout << "contents of /proc/cpuinfo:" << endl;
  string line;
@@ -249,33 +223,30 @@ void print_cpuinfo()
 }

 template <typename T>
-string type_name()
-{
+string type_name() {
  return "unknown";
 }

-template<>
-string type_name<float>()
-{
+template <>
+string type_name<float>() {
  return "float";
 }

-template<>
-string type_name<double>()
-{
+template <>
+string type_name<double>() {
  return "double";
 }

-struct action_t
-{
-  virtual const char* invokation_name() const { abort(); return nullptr; }
+struct action_t {
+  virtual const char* invokation_name() const {
+    abort();
+    return nullptr;
+  }
  virtual void run() const { abort(); }
  virtual ~action_t() {}
 };

-void show_usage_and_exit(int /*argc*/, char* argv[],
-                         const vector<unique_ptr<action_t>>& available_actions)
-{
+void show_usage_and_exit(int /*argc*/, char* argv[], const vector<unique_ptr<action_t>>& available_actions) {
  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
  cerr << "available actions:" << endl << endl;
  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
@@ -293,11 +264,10 @@ void show_usage_and_exit(int /*argc*/, char* argv[],
  cerr << "       avoid warm caches." << endl;
  exit(1);
 }
-     
-float measure_clock_speed()
-{
+
+float measure_clock_speed() {
  cerr << "Measuring clock speed...                              \r" << flush;
-          
+
  vector<float> all_gflops;
  for (int i = 0; i < 8; i++) {
    benchmark_t b(1024, 1024, 1024);
@@ -315,14 +285,12 @@ float measure_clock_speed()
  return result;
 }

-struct human_duration_t
-{
+struct human_duration_t {
  int seconds;
  human_duration_t(int s) : seconds(s) {}
 };

-ostream& operator<<(ostream& s, const human_duration_t& d)
-{
+ostream& operator<<(ostream& s, const human_duration_t& d) {
  int remainder = d.seconds;
  if (remainder > 3600) {
    int hours = remainder / 3600;
@@ -342,8 +310,7 @@ ostream& operator<<(ostream& s, const human_duration_t& d)

 const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data";

-void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run)
-{
+void serialize_benchmarks(const char* filename, const vector<benchmark_t>& benchmarks, size_t first_benchmark_to_run) {
  FILE* file = fopen(filename, "w");
  if (!file) {
    cerr << "Could not open file " << filename << " for writing." << endl;
@@ -358,8 +325,7 @@ void serialize_benchmarks(const char* filename, const vector<benchmark_t>& bench
  fclose(file);
 }

-bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run)
-{
+bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmarks, size_t& first_benchmark_to_run) {
  FILE* file = fopen(filename, "r");
  if (!file) {
    return false;
@@ -382,11 +348,7 @@ bool deserialize_benchmarks(const char* filename, vector<benchmark_t>& benchmark
  return true;
 }

-void try_run_some_benchmarks(
-  vector<benchmark_t>& benchmarks,
-  double time_start,
-  size_t& first_benchmark_to_run)
-{
+void try_run_some_benchmarks(vector<benchmark_t>& benchmarks, double time_start, size_t& first_benchmark_to_run) {
  if (first_benchmark_to_run == benchmarks.size()) {
    return;
  }
@@ -402,9 +364,7 @@ void try_run_some_benchmarks(
    time_now = timer.getRealTime();

    // We check clock speed every minute and at the end.
-    if (benchmark_index == benchmarks.size() ||
-        time_now > time_last_clock_speed_measurement + 60.0f)
-    {
+    if (benchmark_index == benchmarks.size() || time_now > time_last_clock_speed_measurement + 60.0f) {
      time_last_clock_speed_measurement = time_now;

      // Ensure that clock speed is as expected
@@ -425,8 +385,7 @@ void try_run_some_benchmarks(
        // which invalidates all benchmark results collected so far.
        // Either way, we better restart all over again now.
        if (benchmark_index) {
-          cerr << "Restarting at " << 100.0f * ratio_done
-               << " % because clock speed increased.          " << endl;
+          cerr << "Restarting at " << 100.0f * ratio_done << " % because clock speed increased.          " << endl;
        }
        max_clock_speed = current_clock_speed;
        first_benchmark_to_run = 0;
@@ -436,12 +395,9 @@ void try_run_some_benchmarks(
      bool rerun_last_tests = false;

      if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
-        cerr << "Measurements completed so far: "
-             << 100.0f * ratio_done
-             << " %                             " << endl;
-        cerr << "Clock speed seems to be only "
-             << current_clock_speed/max_clock_speed
-             << " times what it used to be." << endl;
+        cerr << "Measurements completed so far: " << 100.0f * ratio_done << " %                             " << endl;
+        cerr << "Clock speed seems to be only " << current_clock_speed / max_clock_speed << " times what it used to be."
+             << endl;

        unsigned int seconds_to_sleep_if_lower_clock_speed = 1;

@@ -454,9 +410,8 @@ void try_run_some_benchmarks(
            exit(2);
          }
          rerun_last_tests = true;
-          cerr << "Sleeping "
-               << seconds_to_sleep_if_lower_clock_speed
-               << " s...                                   \r" << endl;
+          cerr << "Sleeping " << seconds_to_sleep_if_lower_clock_speed << " s...                                   \r"
+               << endl;
          sleep(seconds_to_sleep_if_lower_clock_speed);
          current_clock_speed = measure_clock_speed();
          seconds_to_sleep_if_lower_clock_speed *= 2;
@@ -464,8 +419,7 @@ void try_run_some_benchmarks(
      }

      if (rerun_last_tests) {
-        cerr << "Redoing the last "
-             << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
+        cerr << "Redoing the last " << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
             << " % because clock speed had been low.   " << endl;
        return;
      }
@@ -486,8 +440,7 @@ void try_run_some_benchmarks(
    // Display progress info on stderr
    if (time_now > time_last_progress_update + 1.0f) {
      time_last_progress_update = time_now;
-      cerr << "Measurements... " << 100.0f * ratio_done
-           << " %, ETA "
+      cerr << "Measurements... " << 100.0f * ratio_done << " %, ETA "
           << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
           << "                          \r" << flush;
    }
@@ -498,19 +451,15 @@ void try_run_some_benchmarks(
  }
 }

-void run_benchmarks(vector<benchmark_t>& benchmarks)
-{
+void run_benchmarks(vector<benchmark_t>& benchmarks) {
  size_t first_benchmark_to_run;
  vector<benchmark_t> deserialized_benchmarks;
  bool use_deserialized_benchmarks = false;
  if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
-    cerr << "Found serialized session with "
-         << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
+    cerr << "Found serialized session with " << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
         << " % already done" << endl;
-    if (deserialized_benchmarks.size() == benchmarks.size() &&
-        first_benchmark_to_run > 0 &&
-        first_benchmark_to_run < benchmarks.size())
-    {
+    if (deserialized_benchmarks.size() == benchmarks.size() && first_benchmark_to_run > 0 &&
+        first_benchmark_to_run < benchmarks.size()) {
      use_deserialized_benchmarks = true;
    }
  }
@@ -531,15 +480,13 @@ void run_benchmarks(vector<benchmark_t>& benchmarks)
  for (int i = 0; i < 4; i++) {
    max_clock_speed = max(max_clock_speed, measure_clock_speed());
  }
-  
+
  double time_start = 0.0;
  while (first_benchmark_to_run < benchmarks.size()) {
    if (first_benchmark_to_run == 0) {
      time_start = timer.getRealTime();
    }
-    try_run_some_benchmarks(benchmarks,
-                            time_start,
-                            first_benchmark_to_run);
+    try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run);
  }

  // Sort timings by increasing benchmark parameters, and decreasing gflops.
@@ -550,10 +497,8 @@ void run_benchmarks(vector<benchmark_t>& benchmarks)
  // Collect best (i.e. now first) results for each parameter values.
  vector<benchmark_t> best_benchmarks;
  for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
-    if (best_benchmarks.empty() ||
-        best_benchmarks.back().compact_product_size != it->compact_product_size ||
-        best_benchmarks.back().compact_block_size != it->compact_block_size)
-    {
+    if (best_benchmarks.empty() || best_benchmarks.back().compact_product_size != it->compact_product_size ||
+        best_benchmarks.back().compact_block_size != it->compact_block_size) {
      best_benchmarks.push_back(*it);
    }
  }
@@ -562,11 +507,9 @@ void run_benchmarks(vector<benchmark_t>& benchmarks)
  benchmarks = best_benchmarks;
 }

-struct measure_all_pot_sizes_action_t : action_t
-{
+struct measure_all_pot_sizes_action_t : action_t {
  virtual const char* invokation_name() const { return "all-pot-sizes"; }
-  virtual void run() const
-  {
+  virtual void run() const {
    vector<benchmark_t> benchmarks;
    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
@@ -593,11 +536,9 @@ struct measure_all_pot_sizes_action_t : action_t
  }
 };

-struct measure_default_sizes_action_t : action_t
-{
+struct measure_default_sizes_action_t : action_t {
  virtual const char* invokation_name() const { return "default-sizes"; }
-  virtual void run() const
-  {
+  virtual void run() const {
    vector<benchmark_t> benchmarks;
    for (int repetition = 0; repetition < measurement_repetitions; repetition++) {
      for (size_t ksize = minsize; ksize <= maxsize; ksize *= 2) {
@@ -618,8 +559,7 @@ struct measure_default_sizes_action_t : action_t
  }
 };

-int main(int argc, char* argv[])
-{
+int main(int argc, char* argv[]) {
  double time_start = timer.getRealTime();
  cout.precision(4);
  cerr.precision(4);
@@ -647,7 +587,7 @@ int main(int argc, char* argv[])
  for (int i = 2; i < argc; i++) {
    if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
      const char* equals_sign = strchr(argv[i], '=');
-      min_working_set_size = strtoul(equals_sign+1, nullptr, 10);
+      min_working_set_size = strtoul(equals_sign + 1, nullptr, 10);
    } else {
      cerr << "unrecognized option: " << argv[i] << endl << endl;
      show_usage_and_exit(argc, argv, available_actions);
@@ -657,7 +597,7 @@ int main(int argc, char* argv[])
  print_cpuinfo();

  cout << "benchmark parameters:" << endl;
-  cout << "pointer size: " << 8*sizeof(void*) << " bits" << endl;
+  cout << "pointer size: " << 8 * sizeof(void*) << " bits" << endl;
  cout << "scalar type: " << type_name<Scalar>() << endl;
  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
  cout << "minsize = " << minsize << endl;