/***************************************************************************************************
 * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief Unit tests for epilogues
*/
|
|
#pragma once

#include <cfenv>
#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>

#include "../../common/cutlass_unit_test.h"

#include "cutlass/aligned_buffer.h"
#include "cutlass/half.h"
#include "cutlass/complex.h"
#include "cutlass/quaternion.h"
#include "cutlass/platform/platform.h"
#include "cutlass/epilogue/thread/linear_combination.h"

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
namespace test {
|
|
namespace kernel {
|
|
|
|
/// Device-side harness for a threadblock-scoped Epilogue.
///
/// Each warp loads its slice of the accumulator tile from the tensor referenced
/// by \p accumulator_ref, then the epilogue combines the accumulators with the
/// source tile C (params_C / ptr_C) using the output functor and writes the
/// result to the output tile D (params_D / ptr_D). \p problem_size bounds the
/// valid output region so predication can be exercised. \p epilogue_count
/// repeats the epilogue invocation (e.g. to amortize setup cost).
template <typename Epilogue>
__global__ void epilogue_threadblock(
    typename Epilogue::OutputTileIterator::Params params_D,   ///< layout parameters of output tensor D
    typename Epilogue::OutputTileIterator::Element *ptr_D,    ///< pointer to output tensor D
    typename Epilogue::OutputTileIterator::Params params_C,   ///< layout parameters of source tensor C
    typename Epilogue::OutputTileIterator::Element *ptr_C,    ///< pointer to source tensor C
    typename Epilogue::OutputOp::Params params_output_op,     ///< parameters of the output functor (e.g. alpha, beta)
    cutlass::MatrixCoord problem_size,                        ///< extent of the valid output region
    cutlass::TensorRef<
      typename Epilogue::WarpMmaOperator::ElementC,
      typename Epilogue::WarpMmaOperator::LayoutC> accumulator_ref,  ///< accumulator tile to load
    int epilogue_count = 1) {                                 ///< number of times to invoke the epilogue

  // Shared memory used internally by the epilogue
  __shared__ typename Epilogue::SharedStorage shared_storage;

  int thread_idx = threadIdx.x;
  int warp_idx = threadIdx.x / 32;   // warp index within the threadblock
  int lane_idx = threadIdx.x % 32;   // lane index within the warp

  //
  // Construct the epilogue
  //

  // Tile iterator writing to the output tile (D)
  typename Epilogue::OutputTileIterator iterator_D(
    params_D,
    ptr_D,
    problem_size,
    thread_idx
  );

  // Tile iterator reading the source tile (C) consumed by the epilogue
  typename Epilogue::OutputTileIterator iterator_C(
    params_C,
    ptr_C,
    problem_size,
    thread_idx
  );

  // Epilogue operator
  Epilogue epilogue(
    shared_storage,
    thread_idx,
    warp_idx,
    lane_idx);

  //
  // Initialize the accumulators
  //

  // Map the linear warp index onto the (m, n) warp arrangement; warp_m varies
  // fastest within the threadblock tile.
  int warp_mn = warp_idx % (Epilogue::WarpCount::kM * Epilogue::WarpCount::kN);
  int warp_m = warp_mn % Epilogue::WarpCount::kM;
  int warp_n = warp_mn / Epilogue::WarpCount::kM;

  // Offset the accumulator view to this warp's tile
  accumulator_ref.add_coord_offset({
    warp_m * Epilogue::WarpMmaOperator::Shape::kM,
    warp_n * Epilogue::WarpMmaOperator::Shape::kN});

  // Warp-level iterator used to load this warp's accumulator fragment
  typename Epilogue::WarpMmaOperator::IteratorC accumulator_iterator(accumulator_ref, lane_idx);

  typename Epilogue::AccumulatorTile accumulators;

  accumulators.clear();
  accumulator_iterator.load(accumulators);

#if 0
  // For debugging, enable this block of code to fill each accumulator element with its
  // source thread ID.
  CUTLASS_PRAGMA_UNROLL
  for (size_t i = 0; i < accumulators.size(); ++i) {
    typename Epilogue::WarpMmaOperator::ElementC x(threadIdx.x);
    accumulators[i] = x;
  }

  __syncthreads();

#endif

  //
  // Perform the epilogue operation
  //

  typename Epilogue::OutputOp output_op(params_output_op);

  // Place the epilogue in a loop
  for (int iter = 0; iter < epilogue_count; ++iter) {
    epilogue(output_op, iterator_D, accumulators, iterator_C);
  }
}
|
|
|
|
} // namespace kernel
|
|
} // namespace test
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
template <
|
|
typename Epilogue_
|
|
>
|
|
class EpilogueTestbed {
|
|
public:
|
|
|
|
using Epilogue = Epilogue_;
|
|
using ElementAccumulator = typename Epilogue::ElementAccumulator;
|
|
using ElementCompute = typename Epilogue::OutputOp::ElementCompute;
|
|
using ElementOutput = typename Epilogue::ElementOutput;
|
|
using OutputOpParams = typename Epilogue::OutputOp::Params;
|
|
|
|
public:
|
|
|
|
//
|
|
// Data members
|
|
//
|
|
|
|
cutlass::MatrixCoord quantized_size;
|
|
cutlass::HostTensor<ElementAccumulator, cutlass::layout::RowMajor> accumulator_tensor;
|
|
cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> source_tensor;
|
|
cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> output_tensor;
|
|
|
|
public:
|
|
|
|
//
|
|
// Methods
|
|
//
|
|
|
|
EpilogueTestbed():
|
|
quantized_size(Epilogue::Shape::kM, Epilogue::Shape::kN),
|
|
accumulator_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
|
|
source_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
|
|
output_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}) {
|
|
|
|
//
|
|
// Initialize problem space
|
|
//
|
|
|
|
uint64_t seed = 2019;
|
|
|
|
cutlass::reference::host::TensorFillRandomUniform(
|
|
accumulator_tensor.host_view(),
|
|
seed,
|
|
2,
|
|
-2,
|
|
0);
|
|
|
|
cutlass::reference::host::TensorFillRandomUniform(
|
|
source_tensor.host_view(),
|
|
seed + 2018,
|
|
2,
|
|
-2,
|
|
0);
|
|
}
|
|
|
|
bool run_all() {
|
|
|
|
double alpha_values[] = {1, 0, 2.25};
|
|
double beta_values[] = {0, 1, -1.25};
|
|
|
|
// Test runtime explodes if we tried to test every case exhaustively. This tests the full
|
|
// output tile and several smaller sizes to stress predication.
|
|
for (int m_idx = 0; m_idx < 3; ++m_idx) {
|
|
for (int n_idx = 0; n_idx < 3; ++n_idx) {
|
|
|
|
int m = quantized_size.row() - m_idx * 3;
|
|
int n = quantized_size.column() - n_idx * Epilogue::kElementsPerAccess;
|
|
|
|
for (double const &alpha : alpha_values) {
|
|
for (double const &beta : beta_values) {
|
|
|
|
bool passed = run({m, n}, {cutlass::from_real<ElementCompute>(alpha), cutlass::from_real<ElementCompute>(beta)});
|
|
|
|
if (!passed) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/// Runs the test
|
|
bool run(
|
|
cutlass::MatrixCoord problem_size,
|
|
OutputOpParams output_params) {
|
|
|
|
//
|
|
// Initialize problem space
|
|
//
|
|
|
|
ElementOutput default_output = ElementOutput(-127);
|
|
cutlass::reference::host::TensorFill(output_tensor.host_view(), default_output);
|
|
|
|
accumulator_tensor.sync_device();
|
|
output_tensor.sync_device();
|
|
source_tensor.sync_device();
|
|
|
|
//
|
|
// Initialize epilogue parameters
|
|
//
|
|
|
|
typename Epilogue::OutputTileIterator::Params params_D(output_tensor.device_ref().layout());
|
|
typename Epilogue::OutputTileIterator::Params params_C(source_tensor.device_ref().layout());
|
|
|
|
//
|
|
// Launch kernel
|
|
//
|
|
|
|
dim3 grid(1, 1);
|
|
dim3 block(Epilogue::WarpCount::kCount * 32, 1);
|
|
|
|
test::kernel::epilogue_threadblock<Epilogue><<< grid, block >>>(
|
|
params_D,
|
|
output_tensor.device_data(),
|
|
params_C,
|
|
source_tensor.device_data(),
|
|
output_params,
|
|
problem_size,
|
|
accumulator_tensor.device_view());
|
|
|
|
cudaError_t result = cudaDeviceSynchronize();
|
|
|
|
if (result != cudaSuccess) {
|
|
std::cerr << "Kernel error: " << cudaGetErrorString(result) << std::endl;
|
|
return false;
|
|
}
|
|
|
|
//
|
|
// Verify results
|
|
//
|
|
output_tensor.sync_host();
|
|
|
|
int errors = 0;
|
|
int const kMaxErrors = 5;
|
|
|
|
for (int r = 0; errors < kMaxErrors && r < quantized_size.row(); ++r) {
|
|
for (int c = 0; errors < kMaxErrors && c < quantized_size.column(); ++c) {
|
|
|
|
cutlass::MatrixCoord coord{r, c};
|
|
ElementOutput got = output_tensor.at(coord);
|
|
|
|
ElementOutput expected;
|
|
if (coord.row() < problem_size.row() && coord.column() < problem_size.column()) {
|
|
ElementCompute intermediate =
|
|
output_params.alpha * ElementCompute(accumulator_tensor.at(coord)) +
|
|
output_params.beta * ElementCompute(source_tensor.at(coord));
|
|
|
|
if ((cutlass::platform::is_same<ElementOutput, cutlass::int4b_t>::value
|
|
|| cutlass::platform::is_same<ElementOutput, cutlass::uint4b_t>::value
|
|
|| std::numeric_limits<ElementOutput>::is_integer)
|
|
&& !std::numeric_limits<ElementCompute>::is_integer) {
|
|
std::fesetround(FE_TONEAREST);
|
|
expected = ElementOutput(std::nearbyint(float(cutlass::real(intermediate))));
|
|
} else {
|
|
expected = ElementOutput(intermediate);
|
|
}
|
|
} else {
|
|
expected = default_output;
|
|
}
|
|
|
|
if (expected != got) {
|
|
|
|
using OutputIO = cutlass::ScalarIO<ElementOutput>;
|
|
|
|
EXPECT_TRUE(false)
|
|
<< "-------\n"
|
|
<< "Error - output element (" << coord << ") - expected: "
|
|
<< OutputIO(expected)
|
|
<< ", got: " << OutputIO(got)
|
|
<< ", accum: " << (accumulator_tensor.at(coord))
|
|
<< ", source: " << OutputIO(source_tensor.at(coord))
|
|
<< ", alpha: " << (output_params.alpha)
|
|
<< ", beta: " << (output_params.beta) << "\n";
|
|
|
|
++errors;
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Report results on error
|
|
//
|
|
|
|
if (errors) {
|
|
std::stringstream ss;
|
|
ss
|
|
<< "output_tensor_op_" << Epilogue::Shape::kM << "x" << Epilogue::Shape::kN << "_"
|
|
<< Epilogue::WarpTileIterator::WarpShape::kM << "x"
|
|
<< Epilogue::WarpTileIterator::WarpShape::kN
|
|
<< "_slice_" << Epilogue::WarpCount::kK << ".csv";
|
|
|
|
std::ofstream output_file(ss.str());
|
|
output_file << output_tensor.host_view();
|
|
}
|
|
|
|
return !errors;
|
|
}
|
|
};
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|