1467 lines
46 KiB
Plaintext
1467 lines
46 KiB
Plaintext
/***************************************************************************************************
|
|
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
|
* list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
* contributors may be used to endorse or promote products derived from
|
|
* this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
**************************************************************************************************/
|
|
|
|
/*! \file
|
|
\brief SYR2K Grouped Example.
|
|
|
|
This workload computes a batch of SYR2K operations with distinct problem sizes. This example closely
|
|
follows 24_gemm_grouped.
|
|
|
|
Examples:
|
|
|
|
# Runs a grouped SYR2K with 100 random problem sizes
|
|
$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100
|
|
|
|
# Runs a grouped SYR2K with 100 random problem sizes (with SYR2K-K dimension equal to 1024)
|
|
$ ./examples/38_syr2k_grouped/24_gemm_grouped --groups=100 --k=1024 --verbose=true
|
|
|
|
# Runs a grouped SYR2K that is equivalent to a batched SYR2K
|
|
$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 --n=1024 --k=1024 --verbose=true
|
|
|
|
# Execute grouped SYR2K and profile with NSight
|
|
$ nv-nsight-cu-cli ./examples/38_syr2k_grouped/38_syr2k_grouped --n=256 --k=256 --verbose=true \
|
|
--iterations=1 --reference-check=false
|
|
|
|
*/
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include <chrono>
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
#include "cutlass/blas3.h"
|
|
#include "cutlass/cutlass.h"
|
|
#include "cutlass/device_kernel.h"
|
|
#include "cutlass/gemm/gemm.h"
|
|
#include "cutlass/gemm/device/default_gemm_configuration.h"
|
|
#include "cutlass/gemm/kernel/rank_2k_grouped.h"
|
|
#include "cutlass/gemm/kernel/default_rank_2k_grouped.h"
|
|
#include "cutlass/gemm/device/rank_2k_grouped.h"
|
|
#include "cutlass/gemm/device/rank_2k.h"
|
|
|
|
#include "cutlass/util/command_line.h"
|
|
#include "cutlass/util/distribution.h"
|
|
#include "cutlass/util/device_memory.h"
|
|
#include "cutlass/util/tensor_view_io.h"
|
|
#include "cutlass/util/host_tensor.h"
|
|
#include "cutlass/util/reference/host/rank_2k_complex.h"
|
|
#include "cutlass/util/reference/host/tensor_compare.h"
|
|
#include "cutlass/util/reference/host/tensor_copy.h"
|
|
#include "cutlass/util/reference/host/tensor_fill.h"
|
|
#include "cutlass/util/reference/device/tensor_fill.h"
|
|
#include "cutlass/util/reference/host/tensor_norm.h"
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Result structure
|
|
struct Result {
|
|
|
|
double runtime_ms;
|
|
double initialization_time_ms;
|
|
double gflops;
|
|
cutlass::Status status;
|
|
cudaError_t error;
|
|
bool passed;
|
|
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
Result(
|
|
double runtime_ms = 0,
|
|
double initialization_time_ms = 0,
|
|
double gflops = 0,
|
|
cutlass::Status status = cutlass::Status::kSuccess,
|
|
cudaError_t error = cudaSuccess
|
|
):
|
|
runtime_ms(runtime_ms), initialization_time_ms(initialization_time_ms), gflops(gflops),
|
|
status(status), error(error), passed(true) { }
|
|
};
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Command line options parsing
|
|
struct Options {
|
|
|
|
bool help;
|
|
bool error;
|
|
bool reference_check;
|
|
bool profile_initialization;
|
|
bool sort_problems;
|
|
|
|
std::vector<cutlass::gemm::GemmCoord> problem_sizes;
|
|
|
|
int alignment;
|
|
int problem_count;
|
|
int iterations;
|
|
int cuda_streams;
|
|
bool verbose;
|
|
float alpha;
|
|
float beta;
|
|
std::string benchmark_path;
|
|
|
|
std::string output_tag;
|
|
std::ofstream output_file;
|
|
|
|
using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode;
|
|
std::vector<GroupScheduleMode> scheduler_modes;
|
|
|
|
std::unordered_map<std::string, GroupScheduleMode>
|
|
str_to_scheduler_mode = {
|
|
{"kDeviceOnly", GroupScheduleMode::kDeviceOnly},
|
|
{"kHostPrecompute", GroupScheduleMode::kHostPrecompute}
|
|
};
|
|
|
|
struct GroupScheduleModeHash {
|
|
size_t operator()(GroupScheduleMode m) const {
|
|
return static_cast<size_t>(m);
|
|
}
|
|
};
|
|
|
|
std::unordered_map<GroupScheduleMode, std::string, GroupScheduleModeHash>
|
|
scheduler_mode_to_str = {
|
|
{GroupScheduleMode::kDeviceOnly, "kDeviceOnly"},
|
|
{GroupScheduleMode::kHostPrecompute, "kHostPrecompute"}
|
|
};
|
|
|
|
std::vector<GroupScheduleMode> all_scheduler_modes = {GroupScheduleMode::kDeviceOnly, GroupScheduleMode::kHostPrecompute};
|
|
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
Options():
|
|
help(false),
|
|
error(false),
|
|
alignment(8),
|
|
reference_check(true),
|
|
profile_initialization(false),
|
|
sort_problems(false),
|
|
problem_count(5),
|
|
iterations(20),
|
|
cuda_streams(0),
|
|
verbose(false),
|
|
alpha(1),
|
|
beta(),
|
|
scheduler_modes({GroupScheduleMode::kDeviceOnly})
|
|
{ }
|
|
|
|
// Parses the command line
|
|
void parse(int argc, char const **args) {
|
|
cutlass::CommandLine cmd(argc, args);
|
|
|
|
if (cmd.check_cmd_line_flag("help")) {
|
|
help = true;
|
|
return;
|
|
}
|
|
|
|
cmd.get_cmd_line_argument("alignment", alignment, 8);
|
|
cmd.get_cmd_line_argument("groups", problem_count, 5);
|
|
cmd.get_cmd_line_argument("alpha", alpha, 1.0f);
|
|
cmd.get_cmd_line_argument("beta", beta, 0.0f);
|
|
cmd.get_cmd_line_argument("iterations", iterations, 20);
|
|
cmd.get_cmd_line_argument("streams", cuda_streams, 0);
|
|
cmd.get_cmd_line_argument("verbose", verbose, false);
|
|
cmd.get_cmd_line_argument("reference-check", reference_check, true);
|
|
cmd.get_cmd_line_argument("profile-initialization", profile_initialization, false);
|
|
cmd.get_cmd_line_argument("sort-problems", sort_problems, false);
|
|
cmd.get_cmd_line_argument("benchmark", benchmark_path);
|
|
|
|
std::vector<std::string> scheduler_mode_strs;
|
|
cmd.get_cmd_line_arguments("scheduler-modes", scheduler_mode_strs);
|
|
|
|
if (!scheduler_mode_strs.empty()) {
|
|
scheduler_modes.clear();
|
|
if (scheduler_mode_strs.size() == 1 && scheduler_mode_strs[0] == "all") {
|
|
scheduler_modes = all_scheduler_modes;
|
|
} else {
|
|
for (std::string precomp_str : scheduler_mode_strs) {
|
|
auto it = str_to_scheduler_mode.find(precomp_str);
|
|
if (it != str_to_scheduler_mode.end()) {
|
|
scheduler_modes.push_back(it->second);
|
|
} else if (precomp_str == "all") {
|
|
std::cerr << "Flag --scheduler-modes=all must not contain other scheduler modes in list." << std::endl;
|
|
error = true;
|
|
return;
|
|
} else {
|
|
std::cerr << "Unrecognized scheduler mode '" << precomp_str << "'" << std::endl;
|
|
error = true;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string output_path;
|
|
cmd.get_cmd_line_argument("tag", output_tag);
|
|
cmd.get_cmd_line_argument("output_file", output_path);
|
|
|
|
if (!output_path.empty()) {
|
|
|
|
std::ios_base::openmode open_mode = std::ios_base::out;
|
|
|
|
std::ifstream input_file(output_path.c_str());
|
|
|
|
if (input_file.good()) {
|
|
open_mode = std::ios_base::app;
|
|
input_file.close();
|
|
}
|
|
|
|
output_file.open(output_path.c_str(), open_mode);
|
|
|
|
if (output_file.good() && open_mode != std::ios_base::app) {
|
|
output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n";
|
|
}
|
|
}
|
|
|
|
// Decide how to initialize the problems
|
|
if (!benchmark_path.empty()) {
|
|
if (!benchmark_problems()) {
|
|
error = true;
|
|
problem_sizes.clear();
|
|
return;
|
|
}
|
|
}
|
|
else {
|
|
randomize_problems(cmd);
|
|
}
|
|
}
|
|
|
|
void randomize_problems(cutlass::CommandLine &cmd) {
|
|
|
|
//
|
|
// For now, randomly choose the problem sizes.
|
|
//
|
|
|
|
int cmd_line_m = -1;
|
|
int cmd_line_n = -1;
|
|
int cmd_line_k = -1;
|
|
|
|
cmd.get_cmd_line_argument("m", cmd_line_m);
|
|
cmd.get_cmd_line_argument("n", cmd_line_n);
|
|
cmd.get_cmd_line_argument("k", cmd_line_k);
|
|
|
|
// SYR2K is defined via only N and K.
|
|
if (cmd_line_m != -1) {
|
|
std::cerr << "Parameter M is ignored for SYR2K\n";
|
|
error = true;
|
|
return;
|
|
}
|
|
|
|
problem_sizes.reserve(problem_count);
|
|
|
|
for (int i = 0; i < problem_count; ++i) {
|
|
int n = cmd_line_n;
|
|
int k = cmd_line_k;
|
|
|
|
if (n < 1) {
|
|
n = alignment * ((rand() % 256) + 1);
|
|
}
|
|
|
|
if (k < 1) {
|
|
k = alignment * ((rand() % 256) + 1);
|
|
}
|
|
|
|
// SYR2K is defined only in terms of N and K. Replicate N into
|
|
// the SYR2K-N dimension.
|
|
cutlass::gemm::GemmCoord problem(n, n, k);
|
|
|
|
problem_sizes.push_back(problem);
|
|
}
|
|
}
|
|
|
|
/// Load a benchmark
|
|
bool benchmark_problems() {
|
|
std::ifstream file(benchmark_path);
|
|
if (!file.good()) {
|
|
return false;
|
|
}
|
|
|
|
while (file.good()) {
|
|
|
|
int idx = -1;
|
|
std::string extent_str;
|
|
|
|
file >> idx >> extent_str;
|
|
|
|
if (idx < 0 || extent_str.empty()) {
|
|
break;
|
|
}
|
|
|
|
cutlass::gemm::GemmCoord extent;
|
|
std::vector<std::string> tokens;
|
|
|
|
cutlass::CommandLine::tokenize(tokens, extent_str, 'x');
|
|
|
|
for (int i = 0; i < int(tokens.size()); ++i) {
|
|
int x = std::atoi(tokens.at(i).c_str());
|
|
|
|
// round up
|
|
if (x % alignment) {
|
|
x += (alignment - (x % alignment));
|
|
}
|
|
|
|
extent.at(i) = x;
|
|
}
|
|
|
|
if (extent.product()) {
|
|
problem_sizes.push_back(extent);
|
|
}
|
|
}
|
|
|
|
problem_count = int(problem_sizes.size());
|
|
return true;
|
|
}
|
|
|
|
/// Prints the usage statement.
|
|
std::ostream & print_usage(std::ostream &out) const {
|
|
|
|
out << "38_syr2k_grouped\n\n"
|
|
<< " This example profiles the performance of a 'grouped' SYR2K kernel. This example closely follows 24_gemm_grouped\n\n"
|
|
<< "Options:\n\n"
|
|
<< " --help If specified, displays this usage statement.\n\n"
|
|
<< " --benchmark=<str> Executes a benchmark problem size.\n"
|
|
<< " --output_file=<str> Path to a CSV file to output results. If it exists already, results are appended.\n"
|
|
<< " --tag=<str> String tag to prepend to the CSV file.\n"
|
|
<< " --groups=<int> Number of individual SYR2K problems (default: --groups=15)\n"
|
|
<< " --m=<int> Sets the M dimension for all groups. Otherwise, it is selected randomly\n"
|
|
<< " --n=<int> Sets the N dimension for all groups. Otherwise, it is selected randomly\n"
|
|
<< " --k=<int> Sets the K dimension for all groups. Otherwise, it is selected randomly\n"
|
|
<< " --alpha=<f32> Epilogue scalar alpha (real part)\n"
|
|
<< " --beta=<f32> Epilogue scalar beta (real part)\n"
|
|
<< " --scheduler-modes=<str> List of scheduler modes to be profile for grouped GEMM scheduler (default: --scheduler_modes=kDeviceOnly)\n"
|
|
<< " --iterations=<int> Number of profiling iterations to perform.\n"
|
|
<< " --reference-check=<bool> If true, performs reference check.\n"
|
|
<< " --verbose=<bool> If true, prints problem sizes and batching structure.\n"
|
|
<< " --profile-initialization=<bool> If true, profiles the device-level kernel's initialization.\n"
|
|
<< " --sort-problems=<bool> If true, sorts problem sizes in descending order of SYR2K-K dimension.\n";
|
|
|
|
out << "\n\nExamples:\n\n"
|
|
|
|
<< "# Runs a grouped SYR2K with 100 random problem sizes\n"
|
|
<< "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100\n\n"
|
|
|
|
<< "# Runs a grouped SYR2K with 100 random problem sizes (with K dimension equal to 1024)\n"
|
|
<< "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 --k=1024 --verbose=true\n\n"
|
|
|
|
<< "# Runs a grouped SYR2K that is equivalent to a batched SYR2K\n"
|
|
<< "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --groups=100 --n=1024 --k=1024 --verbose=true\n\n"
|
|
|
|
<< "# Runs a grouped SYR2K with each different scheduler mode\n"
|
|
<< "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --scheduler-modes=all\n\n"
|
|
|
|
<< "# Runs a grouped SYR2K with each different scheduler mode and profiles host-side initialization time\n"
|
|
<< "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --scheduler-modes=all --profile-initialization=true\n\n"
|
|
|
|
<< "# Runs a grouped SYR2K problem given an externally supplied benchmark file. This is a text file in which\n"
|
|
<< "# Each line contains a unique group index and an MxNxK triple indicating problemsize. NOTE that the\n"
|
|
<< "# GEMM-M and GEMM-N dimensions must match.\n"
|
|
<< "#\n"
|
|
<< "# For example, assume the following are the contents of 'problems.txt'\n"
|
|
<< "#\n"
|
|
<< "# 0 256x256x520\n"
|
|
<< "# 1 264x264x1024\n"
|
|
<< "# 2 48x48x1024\n"
|
|
<< "#\n"
|
|
<< "$ ./examples/38_syr2k_grouped/38_syr2k_grouped --benchmark=problems.txt\n\n"
|
|
|
|
<< "# Execute Grouped SYR2K and profile with NSight\n"
|
|
<< "$ nv-nsight-cu-cli ./examples/38_syr2k_grouped/38_syr2k_grouped --n=256 --k=256 --verbose=true --iterations=1 --reference-check=false\n\n";
|
|
|
|
return out;
|
|
}
|
|
|
|
/// Compute performance in GFLOP/s
|
|
double gflops(double runtime_s) const {
|
|
|
|
// Number of real-valued multiply-adds
|
|
int64_t fmas = int64_t();
|
|
|
|
for (auto const & problem : problem_sizes) {
|
|
fmas += problem.product();
|
|
}
|
|
|
|
// SYR2K is defined as (A x BT) + (B x AT), so the number of FMAs is twice that in a GEMM
|
|
fmas *= 2;
|
|
|
|
// Two flops per multiply-add
|
|
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
|
|
}
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
template <typename Rank2K>
|
|
class BaseTestbed {
|
|
public:
|
|
//
|
|
// Type definitions
|
|
//
|
|
|
|
using ElementA = typename Rank2K::ElementA;
|
|
using ElementB = typename Rank2K::ElementB;
|
|
using ElementC = typename Rank2K::ElementC;
|
|
using ElementAccumulator = typename Rank2K::ElementAccumulator;
|
|
|
|
using EpilogueOutputOp = typename Rank2K::Rank2Kkernel::Epilogue::OutputOp;
|
|
using ElementCompute = typename EpilogueOutputOp::ElementCompute;
|
|
|
|
using LayoutA = typename Rank2K::LayoutA;
|
|
using LayoutB = typename Rank2K::LayoutB;
|
|
using LayoutC = typename Rank2K::LayoutC;
|
|
|
|
using MatrixCoord = typename LayoutC::TensorCoord;
|
|
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
Options & options;
|
|
|
|
/// Initialization
|
|
cutlass::Distribution::Kind init_A;
|
|
cutlass::Distribution::Kind init_B;
|
|
cutlass::Distribution::Kind init_C;
|
|
uint32_t seed;
|
|
|
|
cutlass::DeviceAllocation<cutlass::gemm::GemmCoord> problem_sizes_device;
|
|
|
|
std::vector<int64_t> offset_A;
|
|
std::vector<int64_t> offset_B;
|
|
std::vector<int64_t> offset_C;
|
|
std::vector<int64_t> offset_D;
|
|
|
|
std::vector<int64_t> lda_host;
|
|
std::vector<int64_t> ldb_host;
|
|
std::vector<int64_t> ldc_host;
|
|
std::vector<int64_t> ldd_host;
|
|
|
|
cutlass::DeviceAllocation<int64_t> lda;
|
|
cutlass::DeviceAllocation<int64_t> ldb;
|
|
cutlass::DeviceAllocation<int64_t> ldc;
|
|
cutlass::DeviceAllocation<int64_t> ldd;
|
|
|
|
cutlass::DeviceAllocation<ElementA> block_A;
|
|
cutlass::DeviceAllocation<ElementB> block_B;
|
|
cutlass::DeviceAllocation<ElementC> block_C;
|
|
cutlass::DeviceAllocation<ElementC> block_D;
|
|
|
|
cutlass::DeviceAllocation<ElementA *> ptr_A;
|
|
cutlass::DeviceAllocation<ElementB *> ptr_B;
|
|
cutlass::DeviceAllocation<ElementC *> ptr_C;
|
|
cutlass::DeviceAllocation<ElementC *> ptr_D;
|
|
|
|
BaseTestbed(
|
|
Options &options_,
|
|
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
|
|
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
|
|
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
|
|
uint32_t seed_ = 3080
|
|
):
|
|
options(options_), init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
|
|
|
|
int problem_count() const {
|
|
return options.problem_count;
|
|
}
|
|
|
|
/// Helper to initialize a tensor view
|
|
template <typename Element>
|
|
void initialize_tensor(
|
|
Element *ptr,
|
|
size_t capacity,
|
|
cutlass::Distribution::Kind dist_kind,
|
|
uint32_t seed) {
|
|
|
|
if (dist_kind == cutlass::Distribution::Uniform) {
|
|
|
|
Element scope_max, scope_min;
|
|
int bits_input = cutlass::sizeof_bits<Element>::value;
|
|
int bits_output = cutlass::sizeof_bits<ElementC>::value;
|
|
|
|
if (bits_input == 1) {
|
|
scope_max = 2;
|
|
scope_min = 0;
|
|
} else if (bits_input <= 8) {
|
|
scope_max = 2;
|
|
scope_min = -2;
|
|
} else if (bits_output == 16) {
|
|
if (cutlass::sizeof_bits<ElementAccumulator>::value <= 16) {
|
|
scope_max = 5;
|
|
scope_min = -5;
|
|
}
|
|
else {
|
|
scope_max = 8;
|
|
scope_min = -8;
|
|
}
|
|
} else {
|
|
scope_max = 8;
|
|
scope_min = -8;
|
|
}
|
|
|
|
cutlass::reference::device::BlockFillRandomUniform(
|
|
ptr, capacity, seed, scope_max, scope_min, 0);
|
|
}
|
|
else if (dist_kind == cutlass::Distribution::Gaussian) {
|
|
|
|
cutlass::reference::device::BlockFillRandomGaussian(
|
|
ptr, capacity, seed, Element(), Element(0.5f));
|
|
}
|
|
else if (dist_kind == cutlass::Distribution::Sequential) {
|
|
|
|
// Fill with increasing elements
|
|
cutlass::reference::device::BlockFillSequential(
|
|
ptr, capacity, Element(1), Element());
|
|
}
|
|
else {
|
|
|
|
// Fill with all 1s
|
|
cutlass::reference::device::BlockFillSequential(
|
|
ptr, capacity, Element(), Element(1));
|
|
}
|
|
}
|
|
|
|
/// Allocates device-side data
|
|
void allocate() {
|
|
int64_t total_elements_A = 0;
|
|
int64_t total_elements_B = 0;
|
|
int64_t total_elements_C = 0;
|
|
int64_t total_elements_D = 0;
|
|
|
|
lda_host.resize(problem_count());
|
|
ldb_host.resize(problem_count());
|
|
ldc_host.resize(problem_count());
|
|
ldd_host.resize(problem_count());
|
|
|
|
for (int32_t i = 0; i < problem_count(); ++i) {
|
|
|
|
auto problem = options.problem_sizes.at(i);
|
|
|
|
lda_host.at(i) = LayoutA::packed({problem.n(), problem.k()}).stride(0);
|
|
ldb_host.at(i) = LayoutB::packed({problem.n(), problem.k()}).stride(0);
|
|
ldc_host.at(i) = LayoutC::packed({problem.n(), problem.n()}).stride(0);
|
|
ldd_host.at(i) = LayoutC::packed({problem.n(), problem.n()}).stride(0);
|
|
|
|
offset_A.push_back(total_elements_A);
|
|
offset_B.push_back(total_elements_B);
|
|
offset_C.push_back(total_elements_C);
|
|
offset_D.push_back(total_elements_D);
|
|
|
|
int64_t elements_A = problem.n() * problem.k();
|
|
int64_t elements_B = problem.n() * problem.k();
|
|
int64_t elements_C = problem.n() * problem.n();
|
|
int64_t elements_D = problem.n() * problem.n();
|
|
|
|
total_elements_A += elements_A;
|
|
total_elements_B += elements_B;
|
|
total_elements_C += elements_C;
|
|
total_elements_D += elements_D;
|
|
}
|
|
|
|
lda.reset(problem_count());
|
|
ldb.reset(problem_count());
|
|
ldc.reset(problem_count());
|
|
ldd.reset(problem_count());
|
|
|
|
block_A.reset(total_elements_A);
|
|
block_B.reset(total_elements_B);
|
|
block_C.reset(total_elements_C);
|
|
block_D.reset(total_elements_D);
|
|
}
|
|
|
|
/// Initializes device-side data
|
|
void initialize() {
|
|
problem_sizes_device.reset(problem_count());
|
|
problem_sizes_device.copy_from_host(options.problem_sizes.data());
|
|
|
|
lda.copy_from_host(lda_host.data());
|
|
ldb.copy_from_host(ldb_host.data());
|
|
ldc.copy_from_host(ldc_host.data());
|
|
ldd.copy_from_host(ldd_host.data());
|
|
|
|
//
|
|
// Assign pointers
|
|
//
|
|
|
|
std::vector<ElementA *> ptr_A_host(problem_count());
|
|
std::vector<ElementB *> ptr_B_host(problem_count());
|
|
std::vector<ElementC *> ptr_C_host(problem_count());
|
|
std::vector<ElementC *> ptr_D_host(problem_count());
|
|
|
|
for (int32_t i = 0; i < problem_count(); ++i) {
|
|
ptr_A_host.at(i) = block_A.get() + offset_A.at(i);
|
|
ptr_B_host.at(i) = block_B.get() + offset_B.at(i);
|
|
ptr_C_host.at(i) = block_C.get() + offset_C.at(i);
|
|
ptr_D_host.at(i) = block_D.get() + offset_D.at(i);
|
|
}
|
|
|
|
ptr_A.reset(problem_count());
|
|
ptr_A.copy_from_host(ptr_A_host.data());
|
|
|
|
ptr_B.reset(problem_count());
|
|
ptr_B.copy_from_host(ptr_B_host.data());
|
|
|
|
ptr_C.reset(problem_count());
|
|
ptr_C.copy_from_host(ptr_C_host.data());
|
|
|
|
ptr_D.reset(problem_count());
|
|
ptr_D.copy_from_host(ptr_D_host.data());
|
|
|
|
//
|
|
// Initialize the problems of the workspace
|
|
//
|
|
|
|
initialize_tensor(block_A.get(), block_A.size(), init_A, seed * 2021);
|
|
initialize_tensor(block_B.get(), block_B.size(), init_B, seed * 2022);
|
|
initialize_tensor(block_C.get(), block_C.size(), init_C, seed * 2023);
|
|
|
|
cutlass::reference::device::BlockFillSequential(
|
|
block_D.get(), block_D.size(), ElementC(), ElementC());
|
|
}
|
|
|
|
/// Verifies the result is a SYR2K
|
|
bool verify() {
|
|
|
|
bool passed = true;
|
|
|
|
for (int32_t i = 0; i < problem_count(); ++i) {
|
|
cutlass::gemm::GemmCoord problem = options.problem_sizes.at(i);
|
|
|
|
LayoutA layout_A(lda_host.at(i));
|
|
LayoutB layout_B(ldb_host.at(i));
|
|
LayoutC layout_C(ldc_host.at(i));
|
|
LayoutC layout_D(ldd_host.at(i));
|
|
|
|
cutlass::HostTensor<ElementA, LayoutA> host_A(
|
|
typename LayoutA::TensorCoord(problem.n(), problem.k()), /*device_backed=*/false);
|
|
cutlass::HostTensor<ElementB, LayoutB> host_B(
|
|
typename LayoutB::TensorCoord(problem.n(), problem.k()), /*device_backed=*/false);
|
|
cutlass::HostTensor<ElementC, LayoutC> host_C(
|
|
typename LayoutC::TensorCoord(problem.n(), problem.n()), /*device_backed=*/false);
|
|
cutlass::HostTensor<ElementC, LayoutC> host_D(
|
|
typename LayoutC::TensorCoord(problem.n(), problem.n()), /*device_backed=*/false);
|
|
|
|
cutlass::device_memory::copy_to_host(host_A.host_data(), block_A.get() + offset_A.at(i), problem.n() * problem.k());
|
|
cutlass::device_memory::copy_to_host(host_B.host_data(), block_B.get() + offset_B.at(i), problem.n() * problem.k());
|
|
cutlass::device_memory::copy_to_host(host_C.host_data(), block_C.get() + offset_C.at(i), problem.n() * problem.n());
|
|
cutlass::reference::host::BlockFillSequential(
|
|
host_D.host_data(), problem.n() * problem.n(), ElementC(), ElementC());
|
|
|
|
MatrixCoord extent_C{problem.n(), problem.n()};
|
|
|
|
// Reference Rank2K
|
|
cutlass::reference::host::Rank2KComplex<
|
|
ElementA, LayoutA,
|
|
ElementB, LayoutB,
|
|
ElementC, LayoutC,
|
|
ElementC, ElementAccumulator
|
|
>(
|
|
problem,
|
|
(double)options.alpha,
|
|
host_A.host_view(),
|
|
Rank2K::kTransformA,
|
|
host_B.host_view(),
|
|
Rank2K::kTransformB,
|
|
(double)options.beta,
|
|
host_C.host_view(),
|
|
host_D.host_view(),
|
|
ElementAccumulator(0),
|
|
Rank2K::kFillModeC,
|
|
Rank2K::kBlasMode
|
|
);
|
|
|
|
// Copy to host memory
|
|
std::vector<ElementC> matrix_D(layout_D.capacity(extent_C));
|
|
cutlass::device_memory::copy_to_host(matrix_D.data(), block_D.get() + offset_D.at(i), matrix_D.size());
|
|
|
|
cutlass::TensorView<ElementC, LayoutC> view_D(matrix_D.data(), layout_D, extent_C);
|
|
cutlass::TensorView<ElementC, LayoutC> view_Ref = host_D.host_view();
|
|
|
|
// Reference check
|
|
passed = cutlass::reference::host::TensorEquals(view_D, view_Ref);
|
|
|
|
if (!passed) {
|
|
std::cerr << "\n***\nError - problem " << i << " failed the QA check\n***\n" << std::endl;
|
|
return passed;
|
|
}
|
|
}
|
|
|
|
return passed;
|
|
}
|
|
};
|
|
|
|
template <typename Rank2K>
|
|
class TestbedConventional : BaseTestbed<Rank2K> {
|
|
public:
|
|
TestbedConventional(
|
|
Options &options_,
|
|
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
|
|
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
|
|
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
|
|
uint32_t seed_ = 3080
|
|
): BaseTestbed<Rank2K>(options_, init_A_, init_B_, init_C_, seed_) {}
|
|
|
|
/// Verbose printing of problem sizes
|
|
void print_problem_sizes() {
|
|
|
|
// Print groups
|
|
std::cout << this->problem_count() << " groups:\n";
|
|
|
|
int32_t idx = 0;
|
|
int64_t total_tiles = 0;
|
|
|
|
for (auto const & problem : this->options.problem_sizes) {
|
|
int tiles =
|
|
((problem.m() + Rank2K::ThreadblockShape::kM - 1) / Rank2K::ThreadblockShape::kM) *
|
|
((problem.n() + Rank2K::ThreadblockShape::kN - 1) / Rank2K::ThreadblockShape::kN);
|
|
|
|
total_tiles += tiles;
|
|
|
|
std::cout << " [" << idx << "]: "
|
|
<< problem.m() << "-by-" << problem.n() << "-by-" << problem.k()
|
|
<< " (" << tiles << " threadblock tiles)" << "\n";
|
|
|
|
++idx;
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
/// Executes a conventional SYR2K kernel.
|
|
Result profile() {
|
|
std::cout << "Conventional Rank2K:\n"
|
|
<< "====================================================" << std::endl;
|
|
|
|
Result result;
|
|
result.passed = false;
|
|
|
|
// Initialize the problem
|
|
this->allocate();
|
|
this->initialize();
|
|
|
|
if (this->options.verbose) {
|
|
print_problem_sizes();
|
|
}
|
|
|
|
//
|
|
// Create CUDA streams to maximize concurrency of SYR2K kernels
|
|
//
|
|
int32_t effective_streams = (this->options.cuda_streams ? this->options.cuda_streams : 1);
|
|
std::vector<cudaStream_t> cuda_streams;
|
|
char const *provider = "CUTLASS";
|
|
|
|
//
|
|
// Warmup run
|
|
//
|
|
|
|
if (this->options.cuda_streams) {
|
|
for (int i = 0; i < this->options.cuda_streams; ++i) {
|
|
cudaStream_t stream;
|
|
|
|
result.error = cudaStreamCreate(&stream);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "Failed to create CUDA stream." << std::endl;
|
|
return result;
|
|
}
|
|
cuda_streams.push_back(stream);
|
|
}
|
|
}
|
|
else {
|
|
cuda_streams.push_back(nullptr);
|
|
}
|
|
|
|
// Use 'D' for the in/out workspace
|
|
this->block_D.copy_from_device(this->block_C.get());
|
|
|
|
for (size_t i = 0; i < this->options.problem_sizes.size(); ++i) {
|
|
cutlass::gemm::GemmCoord const & problem = this->options.problem_sizes[i];
|
|
int32_t batch_count = 1;
|
|
int64_t lda = this->lda_host.at(i);
|
|
int64_t ldb = this->ldb_host.at(i);
|
|
int64_t ldc = this->ldc_host.at(i);
|
|
typename Rank2K::ElementA* ptrA = this->block_A.get() + this->offset_A.at(i);
|
|
typename Rank2K::ElementB* ptrB = this->block_B.get() + this->offset_B.at(i);
|
|
typename Rank2K::ElementC* ptrC = this->block_C.get() + this->offset_C.at(i);
|
|
typename Rank2K::ElementC* ptrD = this->block_D.get() + this->offset_D.at(i);
|
|
|
|
//
|
|
// Initialize the CUTLASS SYR2K operator
|
|
//
|
|
|
|
// Configure the SYR2K arguments
|
|
typename Rank2K::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta);
|
|
|
|
typename Rank2K::Arguments arguments{
|
|
cutlass::gemm::GemmUniversalMode::kGemm,
|
|
problem,
|
|
batch_count,
|
|
epilogue_op,
|
|
(void const *)ptrA,
|
|
(void const *)ptrB,
|
|
(void const *)ptrC,
|
|
(void *)ptrD,
|
|
int64_t(),
|
|
int64_t(),
|
|
int64_t(),
|
|
int64_t(),
|
|
int64_t(lda),
|
|
int64_t(ldb),
|
|
int64_t(ldc),
|
|
int64_t(ldc)
|
|
};
|
|
|
|
Rank2K rank2k_op;
|
|
|
|
cutlass::Status status = rank2k_op.initialize(arguments);
|
|
|
|
if (status != cutlass::Status::kSuccess) {
|
|
std::cerr << "CUTLASS error on line " << __LINE__ << std::endl;
|
|
return result;
|
|
}
|
|
|
|
status = rank2k_op();
|
|
|
|
if (status != cutlass::Status::kSuccess) {
|
|
std::cerr << "CUTLASS error on line " << __LINE__ << std::endl;
|
|
return result;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Wait for completion
|
|
//
|
|
|
|
result.error = cudaDeviceSynchronize();
|
|
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error);
|
|
return result;
|
|
}
|
|
|
|
//
|
|
// Construct events
|
|
//
|
|
|
|
cudaEvent_t events[2];
|
|
|
|
for (auto & event : events) {
|
|
result.error = cudaEventCreate(&event);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Wait for completion
|
|
//
|
|
|
|
result.error = cudaDeviceSynchronize();
|
|
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error);
|
|
return result;
|
|
}
|
|
|
|
// Record an event at the start of a series of SYR2K operations
|
|
result.error = cudaEventRecord(events[0]);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return result;
|
|
}
|
|
|
|
//
|
|
// Run profiling loop
|
|
//
|
|
|
|
size_t last_stream_idx = 0;
|
|
|
|
for (int iter = 0; iter < this->options.iterations; ++iter) {
|
|
for (size_t i = 0; i < this->options.problem_sizes.size(); ++i) {
|
|
cutlass::gemm::GemmCoord const & problem = this->options.problem_sizes[i];
|
|
int32_t batch_count = 1;
|
|
int64_t lda = this->lda_host.at(i);
|
|
int64_t ldb = this->ldb_host.at(i);
|
|
int64_t ldc = this->ldc_host.at(i);
|
|
typename Rank2K::ElementA* ptrA = this->block_A.get() + this->offset_A.at(i);
|
|
typename Rank2K::ElementB* ptrB = this->block_B.get() + this->offset_B.at(i);
|
|
typename Rank2K::ElementC* ptrC = this->block_C.get() + this->offset_C.at(i);
|
|
typename Rank2K::ElementC* ptrD = this->block_D.get() + this->offset_D.at(i);
|
|
|
|
last_stream_idx = (i % effective_streams);
|
|
|
|
//
|
|
// Initialize the CUTLASS SYR2K operator
|
|
//
|
|
|
|
// Configure the SYR2K arguments
|
|
typename Rank2K::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta);
|
|
|
|
typename Rank2K::Arguments arguments{
|
|
cutlass::gemm::GemmUniversalMode::kGemm,
|
|
problem,
|
|
batch_count,
|
|
epilogue_op,
|
|
(void const *)ptrA,
|
|
(void const *)ptrB,
|
|
(void const *)ptrC,
|
|
(void *)ptrD,
|
|
int64_t(),
|
|
int64_t(),
|
|
int64_t(),
|
|
int64_t(),
|
|
int64_t(lda),
|
|
int64_t(ldb),
|
|
int64_t(ldc),
|
|
int64_t(ldc)
|
|
};
|
|
|
|
Rank2K rank2k_op;
|
|
|
|
cutlass::Status status = rank2k_op.initialize(arguments);
|
|
|
|
if (status != cutlass::Status::kSuccess) {
|
|
std::cerr << "CUTLASS error on line " << __LINE__ << std::endl;
|
|
return result;
|
|
}
|
|
|
|
status = rank2k_op(cuda_streams[last_stream_idx]);
|
|
|
|
if (status != cutlass::Status::kSuccess) {
|
|
std::cerr << "CUTLASS error on line " << __LINE__ << std::endl;
|
|
return result;
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Stop profiling loop
|
|
//
|
|
|
|
// Record an event when the SYR2K operations have been launched.
|
|
result.error = cudaEventRecord(events[1]);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return result;
|
|
}
|
|
|
|
//
|
|
// Wait for work to be completed
|
|
//
|
|
|
|
result.error = cudaDeviceSynchronize();
|
|
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error);
|
|
return result;
|
|
}
|
|
|
|
// Measure elapsed runtime
|
|
float runtime_ms = 0;
|
|
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return result;
|
|
}
|
|
|
|
// Compute average runtime and GFLOPs.
|
|
result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
|
|
result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
|
|
|
|
//
|
|
// Cleanup
|
|
//
|
|
|
|
for (auto event : events) {
|
|
(void)cudaEventDestroy(event);
|
|
}
|
|
|
|
for (auto stream : cuda_streams) {
|
|
if (stream) {
|
|
(void)cudaStreamDestroy(stream);
|
|
}
|
|
}
|
|
|
|
std::cout << " " << this->options.problem_sizes.size() << " conventional Rank2Ks launched" << std::endl;
|
|
std::cout << std::endl;
|
|
std::cout << " " << "Conventional Runtime: " << result.runtime_ms << " ms" << std::endl;
|
|
std::cout << " " << "Conventional GFLOPS: " << result.gflops << std::endl;
|
|
|
|
if (this->options.output_file.good()) {
|
|
this->options.output_file << this->options.output_tag << "," << provider << ",conventional,"
|
|
<< this->problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl;
|
|
}
|
|
|
|
result.passed = true;
|
|
return result;
|
|
}
|
|
};
|
|
|
|
template <typename Rank2K_, cutlass::gemm::kernel::GroupScheduleMode GroupScheduleMode_>
|
|
class TestbedGrouped : BaseTestbed<Rank2K_> {
|
|
public:
|
|
TestbedGrouped(
|
|
Options &options_,
|
|
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
|
|
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
|
|
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
|
|
uint32_t seed_ = 3080
|
|
) : BaseTestbed<Rank2K_>(options_, init_A_, init_B_, init_C_, seed_) {}
|
|
|
|
// Redefine Rank2K with different GroupScheduleMode_
|
|
using Rank2Kkernel = typename cutlass::gemm::kernel::DefaultRank2KGrouped<
|
|
typename Rank2K_::ElementA, typename Rank2K_::LayoutA, Rank2K_::kTransformA, Rank2K_::kAlignmentA,
|
|
typename Rank2K_::ElementB, typename Rank2K_::LayoutB, Rank2K_::kTransformB, Rank2K_::kAlignmentB,
|
|
typename Rank2K_::ElementC, typename Rank2K_::LayoutC, Rank2K_::kFillModeC,
|
|
typename Rank2K_::ElementAccumulator,
|
|
typename Rank2K_::OperatorClass,
|
|
typename Rank2K_::ArchTag,
|
|
typename Rank2K_::ThreadblockShape,
|
|
typename Rank2K_::WarpShape,
|
|
typename Rank2K_::InstructionShape,
|
|
typename Rank2K_::EpilogueOutputOp,
|
|
typename Rank2K_::ThreadblockSwizzle,
|
|
Rank2K_::kStages,
|
|
typename Rank2K_::Operator::ArchMmaOperator::Operator,
|
|
Rank2K_::kBlasMode,
|
|
GroupScheduleMode_>::Rank2Kkernel;
|
|
|
|
using Rank2K = cutlass::gemm::device::Rank2KGrouped<Rank2Kkernel>;
|
|
|
|
/// Verbose printing of problem sizes
|
|
void print_problem_sizes() {
|
|
|
|
// Print groups
|
|
std::cout << this->problem_count() << " groups:\n";
|
|
|
|
int32_t idx = 0;
|
|
int64_t total_tiles = 0;
|
|
|
|
for (auto const & problem : this->options.problem_sizes) {
|
|
int tiles = Rank2K::problem_tile_count(problem);
|
|
total_tiles += tiles;
|
|
|
|
std::cout << " [" << idx << "]: "
|
|
<< problem.m() << "-by-" << problem.n() << "-by-" << problem.k()
|
|
<< " (" << tiles << " threadblock tiles)" << "\n";
|
|
|
|
++idx;
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
/// Sort problems in descending order of problem-K dimension
|
|
void sort_problems() {
|
|
Rank2K::sort_problems(this->options.problem_count,
|
|
this->options.problem_sizes.data(),
|
|
this->lda_host.data(),
|
|
this->ldb_host.data(),
|
|
this->ldc_host.data(),
|
|
this->ldd_host.data(),
|
|
this->offset_A.data(),
|
|
this->offset_B.data(),
|
|
this->offset_C.data(),
|
|
this->offset_D.data());
|
|
}
|
|
|
|
/// Executes a grouped kernel and measures runtime.
|
|
Result profile() {
|
|
std::string sched_mode = this->options.scheduler_mode_to_str.find(GroupScheduleMode_)->second;
|
|
std::cout << std::endl;
|
|
std::cout << "Grouped Rank2K (CUTLASS) with mode " << sched_mode << ":\n"
|
|
<< "====================================================" << std::endl;
|
|
|
|
Result result;
|
|
|
|
int threadblock_count = Rank2K::sufficient(this->options.problem_sizes.data(), this->options.problem_count);
|
|
|
|
// Early exit
|
|
if (!threadblock_count) {
|
|
std::cout << "Active CUDA device lacks hardware resources to run CUTLASS Grouped SYR2K kernel." << std::endl;
|
|
return result;
|
|
}
|
|
|
|
result.passed = false;
|
|
|
|
// Initialize the problem
|
|
this->allocate();
|
|
if (this->options.sort_problems) {
|
|
sort_problems();
|
|
}
|
|
this->initialize();
|
|
|
|
if (this->options.verbose) {
|
|
print_problem_sizes();
|
|
}
|
|
|
|
// Configure the Rank2K arguments
|
|
typename Rank2K::EpilogueOutputOp::Params epilogue_op(this->options.alpha, this->options.beta);
|
|
|
|
// Configure Rank2K arguments
|
|
typename Rank2K::Arguments args(
|
|
cutlass::gemm::GemmUniversalMode::kGemm,
|
|
this->problem_sizes_device.get(),
|
|
this->problem_count(),
|
|
threadblock_count,
|
|
epilogue_op,
|
|
this->ptr_A.get(),
|
|
this->ptr_B.get(),
|
|
this->ptr_C.get(),
|
|
this->ptr_D.get(),
|
|
this->lda.get(),
|
|
this->ldb.get(),
|
|
this->ldc.get(),
|
|
this->ldd.get(),
|
|
this->options.problem_sizes.data()
|
|
);
|
|
|
|
// Initialize the Rank2K object
|
|
Rank2K rank2k{};
|
|
size_t workspace_size = rank2k.get_workspace_size(args);
|
|
cutlass::DeviceAllocation<uint8_t> workspace(workspace_size);
|
|
|
|
result.status = rank2k.initialize(args, workspace.get());
|
|
|
|
if (result.status != cutlass::Status::kSuccess) {
|
|
std::cerr << "Failed to initialize CUTLASS Grouped Rank2K kernel." << std::endl;
|
|
return result;
|
|
}
|
|
|
|
// Run the grouped Rank2K object
|
|
result.status = rank2k.run();
|
|
|
|
if (result.status != cutlass::Status::kSuccess) {
|
|
std::cerr << "Failed to run CUTLASS Grouped Rank2K kernel." << std::endl;
|
|
return result;
|
|
}
|
|
|
|
// Wait for completion
|
|
result.error = cudaDeviceSynchronize();
|
|
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "Kernel execution error: " << cudaGetErrorString(result.error);
|
|
return result;
|
|
}
|
|
|
|
//
|
|
// Verify correctness
|
|
//
|
|
result.passed = true;
|
|
if (this->options.reference_check) {
|
|
result.passed = this->verify();
|
|
}
|
|
|
|
//
|
|
// Warm-up run of the grouped Rank2K object
|
|
//
|
|
result.status = rank2k.run();
|
|
|
|
if (result.status != cutlass::Status::kSuccess) {
|
|
std::cerr << "Failed to run CUTLASS Grouped Rank2K kernel." << std::endl;
|
|
return result;
|
|
}
|
|
|
|
//
|
|
// Construct events
|
|
//
|
|
|
|
cudaEvent_t events[2];
|
|
|
|
for (auto & event : events) {
|
|
result.error = cudaEventCreate(&event);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
// Record an event at the start of a series of SYR2K operations
|
|
result.error = cudaEventRecord(events[0]);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return result;
|
|
}
|
|
|
|
//
|
|
// Run profiling loop
|
|
//
|
|
|
|
for (int iter = 0; iter < this->options.iterations; ++iter) {
|
|
rank2k();
|
|
}
|
|
|
|
//
|
|
// Stop profiling loop
|
|
//
|
|
|
|
// Record an event when the Rank2K operations have been launched.
|
|
result.error = cudaEventRecord(events[1]);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return result;
|
|
}
|
|
|
|
// Wait for work on the device to complete.
|
|
result.error = cudaEventSynchronize(events[1]);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return result;
|
|
}
|
|
|
|
// Measure elapsed runtime
|
|
float runtime_ms = 0;
|
|
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
|
|
if (result.error != cudaSuccess) {
|
|
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
|
|
return result;
|
|
}
|
|
|
|
// Compute average runtime and GFLOPs.
|
|
result.runtime_ms = double(runtime_ms) / double(this->options.iterations);
|
|
result.gflops = this->options.gflops(result.runtime_ms / 1000.0);
|
|
|
|
//
|
|
// Cleanup
|
|
//
|
|
|
|
for (auto event : events) {
|
|
(void)cudaEventDestroy(event);
|
|
}
|
|
|
|
// Optionally profile initialization
|
|
if (this->options.profile_initialization) {
|
|
// Warm up
|
|
rank2k.initialize(args, workspace.get());
|
|
|
|
auto start_time = std::chrono::high_resolution_clock::now();
|
|
for (int32_t i = 0; i < this->options.iterations; ++i) {
|
|
rank2k.initialize(args, workspace.get());
|
|
}
|
|
auto end_time = std::chrono::high_resolution_clock::now();
|
|
|
|
std::chrono::duration<double, std::milli> duration = end_time - start_time;
|
|
duration /= double(this->options.iterations);
|
|
result.initialization_time_ms = duration.count();
|
|
}
|
|
|
|
int64_t total_tiles = Rank2K::group_tile_count(args);
|
|
std::cout << " " << total_tiles << " total threadblock tiles." << std::endl;
|
|
|
|
std::cout << std::endl;
|
|
std::cout << " " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl;
|
|
std::cout << " " << "Grouped GFLOPs: " << result.gflops << std::endl;
|
|
if (this->options.profile_initialization) {
|
|
std::cout << " " << "Init Runtime: " << result.initialization_time_ms << " ms" << std::endl;
|
|
}
|
|
|
|
if (this->options.output_file.good()) {
|
|
this->options.output_file << this->options.output_tag << ",CUTLASS,grouped-" << sched_mode << ","
|
|
<< this->problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl;
|
|
}
|
|
|
|
std::cout << "\nPassed\n";
|
|
|
|
return result;
|
|
}
|
|
};
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
int main(int argc, char const **args) {
|
|
|
|
cudaDeviceProp props;
|
|
|
|
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
|
if (error != cudaSuccess) {
|
|
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
|
|
return -1;
|
|
}
|
|
|
|
if (__CUDACC_VER_MAJOR__ < 11 || props.major < 8) {
|
|
|
|
//
|
|
// This example requires an NVIDIA Ampere-architecture GPU.
|
|
//
|
|
|
|
std::cout
|
|
<< "CUTLASS's Grouped Rank2K example requires a GPU of NVIDIA's Ampere Architecture or "
|
|
<< "later (compute capability 80 or greater).\n";
|
|
|
|
return 0;
|
|
}
|
|
|
|
//
|
|
// Parse options
|
|
//
|
|
|
|
Options options;
|
|
|
|
options.parse(argc, args);
|
|
|
|
if (options.help) {
|
|
options.print_usage(std::cout) << std::endl;
|
|
return 0;
|
|
}
|
|
|
|
if (options.error) {
|
|
std::cerr << "Aborting execution." << std::endl;
|
|
return -1;
|
|
}
|
|
|
|
//
|
|
// Define the Grouped and Conventional Rank2K types
|
|
//
|
|
|
|
using ElementA = double;
|
|
using ElementB = double;
|
|
using ElementOutput = double;
|
|
using ElementAccumulator = double;
|
|
const cutlass::FillMode kFillModeC = cutlass::FillMode::kLower;
|
|
const int kAlignmentA = 1;
|
|
const int kAlignmentB = 1;
|
|
const cutlass::ComplexTransform kTransformA = cutlass::ComplexTransform::kNone;
|
|
const cutlass::ComplexTransform kTransformB = cutlass::ComplexTransform::kNone;
|
|
|
|
using LayoutA = cutlass::layout::ColumnMajor;
|
|
using LayoutB = cutlass::layout::ColumnMajor;
|
|
using LayoutC = cutlass::layout::ColumnMajor;
|
|
|
|
using OperatorClass = cutlass::arch::OpClassTensorOp;
|
|
using ArchTag = cutlass::arch::Sm80;
|
|
|
|
using ThreadblockShape = cutlass::gemm::GemmShape<32, 32, 16>;
|
|
using WarpShape = cutlass::gemm::GemmShape<16, 16, 16>;
|
|
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
|
|
|
|
using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombination<
|
|
ElementOutput, 1,
|
|
ElementAccumulator, ElementAccumulator>;
|
|
|
|
// NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels.
|
|
// This parameter is passed in at present to match the APIs of other kernels. The parameter
|
|
// is unused within the kernel.
|
|
using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
|
|
|
|
const int kStages = 4;
|
|
const bool kSplitKSerial = false;
|
|
using Operator = cutlass::arch::OpMultiplyAdd;
|
|
const cutlass::BlasMode kBlasMode = cutlass::BlasMode::kSymmetric;
|
|
|
|
// Define a grouped Rank2K kernel with all template parameters set except
|
|
// for scheduling mode. This will be used as the template for all scheduling
|
|
// modes executed.
|
|
using Rank2Kkernel = typename cutlass::gemm::kernel::DefaultRank2KGrouped<
|
|
ElementA, LayoutA, kTransformA, kAlignmentA,
|
|
ElementB, LayoutB, kTransformB, kAlignmentB,
|
|
ElementOutput, LayoutC, kFillModeC,
|
|
ElementAccumulator,
|
|
OperatorClass,
|
|
ArchTag,
|
|
ThreadblockShape,
|
|
WarpShape,
|
|
InstructionShape,
|
|
EpilogueOutputOp,
|
|
ThreadblockSwizzle,
|
|
kStages,
|
|
Operator,
|
|
kBlasMode>::Rank2Kkernel;
|
|
|
|
using Rank2KGrouped = cutlass::gemm::device::Rank2KGrouped<Rank2Kkernel>;
|
|
|
|
// Rank2k operator
|
|
using Rank2KConventional = cutlass::gemm::device::Rank2K<
|
|
ElementA, LayoutA,
|
|
ElementB, LayoutB,
|
|
ElementOutput, LayoutC, kFillModeC,
|
|
ElementAccumulator,
|
|
OperatorClass,
|
|
ArchTag,
|
|
ThreadblockShape,
|
|
WarpShape,
|
|
InstructionShape,
|
|
EpilogueOutputOp,
|
|
ThreadblockSwizzle,
|
|
kStages,
|
|
kAlignmentA,
|
|
kAlignmentB,
|
|
kSplitKSerial,
|
|
Operator,
|
|
kTransformA,
|
|
kTransformB,
|
|
kBlasMode
|
|
>;
|
|
|
|
//
|
|
// Profile it
|
|
//
|
|
|
|
TestbedConventional<Rank2KConventional> testbed(options);
|
|
|
|
Result result = testbed.profile();
|
|
if (!result.passed) {
|
|
std::cout << "Profiling CUTLASS conventional Rank2K has failed.\n";
|
|
std::cout << "\nFailed\n";
|
|
return -1;
|
|
}
|
|
|
|
using GroupScheduleMode = cutlass::gemm::kernel::GroupScheduleMode;
|
|
for (GroupScheduleMode mode : options.scheduler_modes) {
|
|
Result result;
|
|
switch (mode) {
|
|
case GroupScheduleMode::kDeviceOnly:
|
|
{
|
|
TestbedGrouped<Rank2KGrouped, GroupScheduleMode::kDeviceOnly> runner(options);
|
|
result = runner.profile();
|
|
break;
|
|
}
|
|
case GroupScheduleMode::kHostPrecompute:
|
|
{
|
|
TestbedGrouped<Rank2KGrouped, GroupScheduleMode::kHostPrecompute> runner(options);
|
|
result = runner.profile();
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result.error != cudaSuccess) {
|
|
return 1;
|
|
}
|
|
|
|
// Override verbose flag to avoid printing duplicate information for each scheduling mode
|
|
options.verbose = false;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|