/* * benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen * * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Description: * ------------ * This benchmark suite evaluates the performance of Eigen mathematical * operations when integrated with AMD Optimizing CPU Libraries (AOCL). It * tests: * * 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt, * log, etc.) using AOCL Vector Math Library (VML) for optimized * double-precision operations * * 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library * with support for both single-threaded and multithreaded execution * * 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using * libflame * * 4. Real-world Scenarios: Financial risk computation simulating covariance * matrix calculations and eigenvalue analysis for portfolio optimization * * The benchmark automatically detects AOCL configuration and adjusts test * execution accordingly, providing performance comparisons between standard * Eigen operations and AOCL-accelerated implementations. * * Compilation: * ------------ * # Using AOCC compiler (recommended for best AOCL compatibility): * clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I * -I${AOCL_ROOT}/include \ * -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \ * -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \ * -o build/eigen_aocl_benchmark * * # Alternative: Using GCC with proper library paths: * g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I * -I${AOCL_ROOT}/include \ * -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \ * -lamdlibm -lm -lblis -lflame -lpthread -lrt \ * -o build/eigen_aocl_benchmark * * # For multithreaded BLIS support: * clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I \ * -I${AOCL_ROOT}/include -Wno-parentheses src/benchmark_aocl.cpp \ * -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \ * -o build/eigen_aocl_benchmark_mt * * Usage: * ------ * export AOCL_ROOT=/path/to/aocl/installation * export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH * ./build/eigen_aocl_benchmark * * Developer: * ---------- * Name: Sharad Saurabh Bhaskar * Email: shbhaska@amd.com * Organization: Advanced Micro Devices, Inc. */ #include #include #include #include #include // Simple - just include Eigen headers #include #include #include // Only include CBLAS if AOCL BLIS is available #ifdef EIGEN_USE_AOCL_ALL #include #endif using namespace std; using namespace std::chrono; using namespace Eigen; void benchmarkVectorMath(int size) { VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0); VectorXd result(size); double elapsed_ms = 0; cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl; auto start = high_resolution_clock::now(); result = v.array().exp(); auto end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "exp() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().sin(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "sin() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().cos(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "cos() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().sqrt(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "sqrt() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().cbrt(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "cbrt() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().abs(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "abs() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().log(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "log() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().log10(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "log10() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().exp2(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "exp2() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().asin(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "asin() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().sinh(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "sinh() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().acos(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "acos() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().cosh(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "cosh() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().tan(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "tan() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().atan(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "atan() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().tanh(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "tanh() time: " << elapsed_ms << " ms" << endl; VectorXd v2 = VectorXd::Random(size); start = high_resolution_clock::now(); result = v.array() + v2.array(); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "add() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().pow(2.0); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "pow() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().max(v2.array()); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "max() time: " << elapsed_ms << " ms" << endl; start = high_resolution_clock::now(); result = v.array().min(v2.array()); end = high_resolution_clock::now(); elapsed_ms = duration_cast(end - start).count(); cout << "min() time: " << elapsed_ms << " ms" << endl; } // Function to benchmark BLAS operation: Matrix multiplication. void benchmarkMatrixMultiplication(int matSize) { cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize << ") ---" << endl; MatrixXd A = MatrixXd::Random(matSize, matSize); MatrixXd B = MatrixXd::Random(matSize, matSize); MatrixXd C(matSize, matSize); auto start = high_resolution_clock::now(); C = A * B; auto end = high_resolution_clock::now(); double elapsed_ms = duration_cast(end - start).count(); cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl; } // Benchmark BLIS directly using its CBLAS interface if available. void benchmarkBlisMultithreaded(int matSize, int numThreads) { #if defined(EIGEN_AOCL_USE_BLIS_MT) cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize << ", threads=" << numThreads << ") ---" << endl; vector A(matSize * matSize); vector B(matSize * matSize); vector C(matSize * matSize); for (auto &v : A) v = static_cast(rand()) / RAND_MAX; for (auto &v : B) v = static_cast(rand()) / RAND_MAX; double alpha = 1.0, beta = 0.0; string th = to_string(numThreads); setenv("BLIS_NUM_THREADS", th.c_str(), 1); auto start = high_resolution_clock::now(); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize, matSize, alpha, A.data(), matSize, B.data(), matSize, beta, C.data(), matSize); auto end = high_resolution_clock::now(); double elapsed_ms = duration_cast(end - start).count(); cout << "BLIS dgemm time: " << elapsed_ms << " ms" << endl; #else (void)matSize; (void)numThreads; cout << "\nBLIS multithreaded support not enabled." << endl; #endif } // Function to benchmark LAPACK operation: Eigenvalue decomposition. void benchmarkEigenDecomposition(int matSize) { cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize << " x " << matSize << ") ---" << endl; MatrixXd M = MatrixXd::Random(matSize, matSize); // Make matrix symmetric (necessary for eigenvalue decomposition of // self-adjoint matrices) M = (M + M.transpose()) * 0.5; SelfAdjointEigenSolver eigensolver; auto start = high_resolution_clock::now(); eigensolver.compute(M); auto end = high_resolution_clock::now(); double elapsed_ms = duration_cast(end - start).count(); if (eigensolver.info() == Success) { cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl; } else { cout << "Eigenvalue decomposition failed." << endl; } } // Function simulating a real-world FSI risk computation scenario. // Example: Compute covariance matrix from simulated asset returns, then perform // eigenvalue decomposition. void benchmarkFSIRiskComputation(int numPeriods, int numAssets) { cout << "\n--- FSI Risk Computation Benchmark ---" << endl; cout << "Simulating " << numPeriods << " periods for " << numAssets << " assets." << endl; // Simulate asset returns: each column represents an asset's returns. MatrixXd returns = MatrixXd::Random(numPeriods, numAssets); // Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1) auto start = high_resolution_clock::now(); MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1); auto end = high_resolution_clock::now(); double cov_time = duration_cast(end - start).count(); cout << "Covariance matrix computation time: " << cov_time << " ms" << endl; // Eigenvalue decomposition on covariance matrix. SelfAdjointEigenSolver eigensolver; start = high_resolution_clock::now(); eigensolver.compute(cov); end = high_resolution_clock::now(); double eig_time = duration_cast(end - start).count(); if (eigensolver.info() == Success) { cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms" << endl; cout << "Top 3 Eigenvalues: " << eigensolver.eigenvalues().tail(3).transpose() << endl; } else { cout << "Eigenvalue decomposition failed." << endl; } } int main() { cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl; cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl; cout << "Organization: Advanced Micro Devices, Inc." << endl; cout << "License: Mozilla Public License 2.0" << endl << endl; // Print AOCL configuration #ifdef EIGEN_USE_AOCL_MT cout << "AOCL Mode: MULTITHREADED (MT)" << endl; cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl; #elif defined(EIGEN_USE_AOCL_ALL) cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl; cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl; #else cout << "AOCL Mode: DISABLED" << endl; cout << "Using standard Eigen implementation" << endl; #endif cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl; // Benchmark vector math functions with varying vector sizes. vector vectorSizes = {5000000, 10000000, 50000000}; for (int size : vectorSizes) { benchmarkVectorMath(size); } // Benchmark matrix multiplication for varying sizes. vector matrixSizes = {1024}; for (int msize : matrixSizes) { benchmarkMatrixMultiplication(msize); #if defined(EIGEN_AOCL_USE_BLIS_MT) benchmarkBlisMultithreaded(msize, thread::hardware_concurrency()); #endif } // Benchmark LAPACK: Eigenvalue Decomposition. for (int msize : matrixSizes) { benchmarkEigenDecomposition(msize); } // Benchmark a complex FSI risk computation scenario. // For example, simulate 10,000 time periods (days) for 500 assets. benchmarkFSIRiskComputation(10000, 500); cout << "\n=== Benchmark Complete ===" << endl; return 0; }