// cutlass/test/unit/conv/device_3x/testbed_conv.hpp
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Implicit GEMM convolution testbed for the CUTLASS 3.x API
*/
#pragma once
#include "cutlass/cutlass.h"
#include "../../common/cutlass_unit_test.h"
#include "cute/tensor.hpp"
#include "cutlass/kernel_hardware_info.hpp"
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/convnd_problem_shape.hpp"
#include "thrust/universal_vector.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/reference/host/conv.hpp"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/device/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "conv_problem_sizes.hpp"
#include "../cache_testbed_output.h"
#include <iostream>
#include "cute/layout.hpp"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace test::conv::device {
/////////////////////////////////////////////////////////////////////////////////////////////////
// Initializes a flat device buffer
template <typename Element>
static void
initialize_values(
thrust::universal_vector<Element>& dst_ptr,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (cutlass::Distribution::Uniform == dist_kind) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
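// Choose a smaller random value range for narrower element types so accumulation
// in the host reference check stays exact and avoids overflow.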
if (bits <= 8) {
scope = 2;
}
else if (bits == 16) {
scope = 4;
}
else {
scope = 8;
}
cutlass::reference::host::BlockFillRandomUniform(
dst_ptr.data().get(), dst_ptr.size(), seed, scope, -scope, 0);
}
else if (cutlass::Distribution::Identity == dist_kind) {
cutlass::reference::host::BlockFillRandomUniform(
dst_ptr.data().get(), dst_ptr.size(), seed, 0, 0, 0);
}
else if (cutlass::Distribution::Gaussian == dist_kind) {
cutlass::reference::host::BlockFillRandomGaussian(dst_ptr.data().get(), dst_ptr.size(), seed, 0, 0.5);
}
else if (cutlass::Distribution::Sequential == dist_kind) {
cutlass::reference::host::BlockFillSequential(dst_ptr.data().get(), dst_ptr.size());
}
else {
std::cerr << "Invalid distribution kind!\n.";
exit(1);
}
}
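// Example usage (illustrative sketch; the element type, buffer size, and seed are arbitrary):
//
//   thrust::universal_vector<cutlass::half_t> buf(256);
//   initialize_values(buf, cutlass::Distribution::Uniform, /*seed=*/2023);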
/////////////////////////////////////////////////////////////////////////////////////////////////
template <class Conv>
struct ConvTestbed {
// Kernel data types
using ElementA = typename Conv::ConvKernel::ElementA;
using ElementB = typename Conv::ConvKernel::ElementB;
using ElementC = cute::conditional_t<cute::is_void_v<typename Conv::ConvKernel::ElementC>,
typename Conv::ConvKernel::ElementD, typename Conv::ConvKernel::ElementC>;
using ElementD = typename Conv::ConvKernel::ElementD;
using ElementAccumulator = typename Conv::ConvKernel::ElementAccumulator;
//
// FusionOperation derived types/queries
//
using FusionOp = typename Conv::EpilogueOutputOp;
// fusion types are potentially void if the fusion is not supported
// helper so we don't try to construct HostTensor with void type
template <typename T, typename U = uint8_t>
using non_void_t = cute::conditional_t<cute::is_void_v<T>, U, T>;
using ElementScalar = typename FusionOp::ElementScalar;
using ElementCompute = typename FusionOp::ElementCompute;
using BiasType = typename cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithBias<FusionOp>::type;
using ElementBias = non_void_t<BiasType>;
using ActivationType = non_void_t<typename cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithActivation<FusionOp>::type,
cutlass::epilogue::thread::Identity<ElementCompute>>;
static constexpr bool IsActivationEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithActivation<FusionOp>::value;
using ActivationFunctor = cute::conditional_t<IsActivationEnabled, ActivationType, cutlass::epilogue::thread::Identity<ElementCompute>>;
static constexpr bool IsBiasEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithBias<FusionOp>::value &&
!cute::is_same_v<BiasType, void>;
using StrideC = typename Conv::ConvKernel::StrideC;
using StrideD = typename Conv::ConvKernel::StrideD;
using ThreadEpilogueOp = typename Conv::ConvKernel::CollectiveEpilogue::ThreadEpilogueOp;
static constexpr cutlass::conv::Operator ConvOp = Conv::DispatchPolicy::ConvOp;
static constexpr int NumSpatialDimensions = Conv::NumSpatialDimensions;
using ProblemShape = cutlass::conv::ConvProblemShape<ConvOp, NumSpatialDimensions>;
using Schedule = typename Conv::DispatchPolicy::Schedule;
/// Initialization
cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform;
cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform;
cutlass::Distribution::Kind init_C = cutlass::Distribution::Uniform;
cutlass::Distribution::Kind init_bias = cutlass::Distribution::Uniform;
uint64_t seed = 6090;
float epsilon = 0.0f;
int split_p_slices = 1;
thrust::universal_vector<ElementA> tensor_A;
thrust::universal_vector<ElementB> tensor_B;
thrust::universal_vector<ElementC> tensor_C;
thrust::universal_vector<ElementD> tensor_D_computed;
thrust::universal_vector<ElementD> tensor_D_reference;
thrust::universal_vector<ElementBias> tensor_bias;
thrust::universal_vector<ElementScalar> tensor_alpha;
thrust::universal_vector<ElementScalar> tensor_beta;
void initialize(ProblemShape const& problem_shape, uint64_t seed = 6090) {
tensor_A.resize(sizeof(ElementA) * problem_shape.size_A());
tensor_B.resize(sizeof(ElementB) * problem_shape.size_B());
tensor_C.resize(sizeof(ElementC) * problem_shape.size_C());
tensor_D_computed.resize(sizeof(ElementD) * problem_shape.size_C());
tensor_D_reference.resize(sizeof(ElementD) * problem_shape.size_C());
tensor_bias.resize(sizeof(ElementBias) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
initialize_values(tensor_A, init_A, seed);
initialize_values(tensor_B, init_B, seed * 11);
initialize_values(tensor_C, init_C, seed * 17);
initialize_values(tensor_bias, init_bias, seed * 19);
}
// Determine SMEM requirements and waive if not satisfied
bool sufficient() const {
int device_idx;
cudaError_t result = cudaGetDevice(&device_idx);
if (result != cudaSuccess) {
throw std::runtime_error("cudaGetDevice() API call failed.");
}
int max_smem_size;
result = cudaDeviceGetAttribute(&max_smem_size, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_idx);
if (result != cudaSuccess) {
throw std::runtime_error("cudaDeviceGetAttribute() failed");
}
return max_smem_size >= Conv::ConvKernel::SharedStorageSize;
}
/// Executes one test
bool run(
ProblemShape const& problem_shape,
ElementScalar alpha = ElementScalar(1),
ElementScalar beta = ElementScalar(0)
) {
// Waive test if insufficient CUDA device
if (!sufficient()) {
if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
std::cerr << "Test waived due to insufficient CUDA device.\n";
}
return true;
}
initialize(problem_shape);
cutlass::KernelHardwareInfo hw_info;
cudaGetDevice(&hw_info.device_id);
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
// configure the operator
Conv conv_op;
auto stride_C = StrideC{};
auto stride_D = StrideD{};
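// Wgrad writes the filter gradient, which is always packed, so its output strides are
// derived as packed strides; fprop and dgrad outputs may carry non-packed strides instead.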
if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
stride_C = cutlass::make_cute_packed_stride(
StrideC{}, problem_shape.shape_C, problem_shape.stride_C, ConvOp);
stride_D = cutlass::make_cute_packed_stride(
StrideD{}, problem_shape.shape_C, problem_shape.stride_C, ConvOp);
}
// Fprop and dgrad kernels need to support non-packed output strides, so copy them from the problem shape.
else {
cute::for_each(cute::make_seq<cute::rank<0>(StrideC{})>{}, [&](auto i) {
cute::get<0, i>(stride_C) = problem_shape.stride_C[ProblemShape::RankT-2-i];
});
cute::for_each(cute::make_seq<cute::rank<0>(StrideD{})>{}, [&](auto i) {
cute::get<0, i>(stride_D) = problem_shape.stride_C[ProblemShape::RankT-2-i];
});
}
typename Conv::ConvKernel::TileScheduler::Arguments scheduler_args{};
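// Arguments follow the 3.x convention: mainloop arguments (problem shape plus A/B pointers),
// epilogue arguments (C/D pointers and strides; fusion parameters are patched in afterwards),
// hardware info, and tile-scheduler arguments.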
auto args = typename Conv::Arguments {
{
problem_shape,
tensor_A.data().get(),
tensor_B.data().get(),
}, // MainloopArguments
{
{},
tensor_C.data().get(),
stride_C,
tensor_D_computed.data().get(),
stride_D,
}, // EpilogueArguments
hw_info,
scheduler_args
};
auto &fusion_args = args.epilogue.thread;
fusion_args.alpha = alpha;
fusion_args.beta = beta;
if constexpr (IsBiasEnabled) {
fusion_args.bias_ptr = tensor_bias.data().get();
}
// Clamp bound
if constexpr (cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::Clamp<ElementCompute>>) {
fusion_args.activation.lower_bound = CUTLASS_STL_NAMESPACE::numeric_limits<ElementCompute>::lowest();
fusion_args.activation.upper_bound = CUTLASS_STL_NAMESPACE::numeric_limits<ElementCompute>::max();
}
// Scale
if constexpr (cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::ScaledGELU_taylor<ElementCompute>> ||
cute::is_same_v<ActivationFunctor, cutlass::epilogue::thread::ScaledGELU<ElementCompute>>) {
fusion_args.activation.scale = ElementCompute{1};
}
cutlass::Status status = conv_op.can_implement(args);
EXPECT_EQ(status, cutlass::Status::kSuccess);
if (status != cutlass::Status::kSuccess) {
std::cerr << "can_implement failed for the given problem_shape:\n";
print(problem_shape);
return false;
}
// find workspace requirement for parallel split-k reduction
size_t workspace_size = Conv::get_workspace_size(args);
thrust::universal_vector<uint8_t> workspace(workspace_size);
status = conv_op.initialize(args, workspace.data().get());
if (status != cutlass::Status::kSuccess) {
cudaError_t error = cudaGetLastError();
std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n";
return true;
}
// run the convolution operator
status = conv_op();
EXPECT_TRUE(status == cutlass::Status::kSuccess);
if (status != cutlass::Status::kSuccess) {
return false;
}
bool passed = false;
cudaError_t result = cudaDeviceSynchronize();
EXPECT_EQ(result, cudaSuccess) << " Kernel execution error: "
<< cudaGetErrorString(result);
// Create cute::Tensors using the logical rank-3 MNK multi-mode shapes the mainloop gives us
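// The problem shape stores each tensor's modes outermost-first (e.g. (N, ..., C)); reversing
// them puts the contiguous mode first, which is the ordering the host reference expects.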
auto shape_mA = cute::reverse(problem_shape.shape_A);
auto shape_mB = cute::reverse(problem_shape.shape_B);
auto shape_mC = cute::reverse(problem_shape.shape_C);
auto shape_mBias = cute::make_shape(cute::size(cute::get<0>(problem_shape.get_shape_B())));
auto stride_mA = cute::reverse(problem_shape.stride_A);
auto stride_mB = cute::reverse(problem_shape.stride_B);
auto stride_mC = cute::reverse(problem_shape.stride_C);
auto mA = make_tensor(tensor_A.data().get(), make_layout(shape_mA, stride_mA));
auto mB = make_tensor(tensor_B.data().get(), make_layout(shape_mB, stride_mB));
auto mC = make_tensor(tensor_C.data().get(), make_layout(shape_mC, stride_mC));
auto mD_ref = make_tensor(tensor_D_reference.data().get(), make_layout(shape_mC, stride_mC));
auto mD_computed = make_tensor(tensor_D_computed.data().get(), make_layout(shape_mC, stride_mC));
auto mBias = make_tensor(tensor_bias.data().get(), make_layout(shape_mBias));
auto mAlpha = make_tensor(tensor_alpha.data().get(), make_layout(shape_mBias));
auto mBeta = make_tensor(tensor_beta.data().get(), make_layout(shape_mBias));
cutlass::reference::host::ConvEpilogueFusionParams<
ElementAccumulator,
ElementScalar,
ElementCompute,
ElementC,
ElementD,
decltype(mAlpha),
decltype(mBeta),
decltype(mBias),
ActivationFunctor>
epilogue_fusion_params{};
epilogue_fusion_params.alpha = alpha;
epilogue_fusion_params.beta = beta;
if constexpr (IsBiasEnabled) {
epilogue_fusion_params.tensor_bias = mBias;
}
auto padding = cute::reverse(problem_shape.lower_padding);
auto tstride = cute::reverse(problem_shape.traversal_stride);
auto dilation = cute::reverse(problem_shape.dilation);
cutlass::reference::host::ConvReferenceImpl<
ConvOp,
NumSpatialDimensions,
decltype(mA),
decltype(mB),
decltype(mC),
decltype(mD_ref),
decltype(padding),
decltype(tstride),
decltype(dilation),
decltype(epilogue_fusion_params)>
reference_impl(mA, mB, mC, mD_ref, padding, tstride, dilation, epilogue_fusion_params);
//
// Reference check - support caching results
//
CachedTestKey cached_test_key = CreateCachedConvNd3xTestKey<
ProblemShape,
ElementA,
ElementB,
ElementC,
ElementD
>(
ConvOp,
problem_shape,
alpha,
beta,
tensor_A,
tensor_B,
tensor_C
);
//
// Look for the cached key
//
bool cached_result_loaded = false;
CachedTestResult cached_test_result;
std::string convnd_result_cache_name =
std::string("cached_results_") + CUTLASS_TARGET_NAME + ".txt";
#if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
CachedTestResultListing cached_results(convnd_result_cache_name);
auto cached = cached_results.find(cached_test_key);
cached_result_loaded = cached.first;
if (cached_result_loaded) {
cached_test_result = cached.second;
}
#endif
if (!cached_result_loaded) {
// Compute reference
reference_impl.compute_reference();
#if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
cached_test_result.D = TensorHash(tensor_D_reference);
CachedTestResultListing cached_results(convnd_result_cache_name);
cached_results.append(cached_test_key, cached_test_result);
cached_results.write(convnd_result_cache_name);
#endif
} // if (!cached_result_loaded)
#if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
uint32_t tensor_D_computed_hash = TensorHash(tensor_D_computed);
passed = (tensor_D_computed_hash == cached_test_result.D);
// If hash fails, double check against reference implementation.
if(!passed) {
std::cerr << "Hash-based comparison unsuccessful for key:" << "\n" << cached_test_key
<< ", comparing with reference implementation now.\n";
if (cached_result_loaded) {
// Compute reference
reference_impl.compute_reference();
}
// Validate kernel against reference
passed = compare_reference(mD_ref, mD_computed, mA, mB, mAlpha, mBeta, mBias, this->epsilon);
}
#else
// Validate kernel against reference
passed = compare_reference(mD_ref, mD_computed, mA, mB, mAlpha, mBeta, mBias, this->epsilon);
#endif
EXPECT_TRUE(passed);
return passed;
}
template<
class Engine, class Layout,
class EngineA, class LayoutA,
class EngineB, class LayoutB,
class EngineAlpha, class LayoutAlpha,
class EngineBeta, class LayoutBeta,
class EngineBias, class LayoutBias>
static constexpr bool
compare_reference(
cute::Tensor<Engine, Layout> const& reference,
cute::Tensor<Engine, Layout> const& computed,
cute::Tensor<EngineA, LayoutA> const& A,
cute::Tensor<EngineB, LayoutB> const& B,
cute::Tensor<EngineAlpha, LayoutAlpha> const& tensor_alpha,
cute::Tensor<EngineBeta, LayoutBeta> const& tensor_beta,
cute::Tensor<EngineBias, LayoutBias> const& tensor_bias,
float epsilon = 0.0f) {
if (size(reference) != size(computed)) {
return false;
}
bool passed = true;
if (epsilon == 0.0f) {
// fast refcheck w/o epsilon
for (size_t i = 0; i < size_t(size(reference)); ++i) {
if (reference(i) != computed(i)) {
passed = false;
break;
}
}
} else {
// refcheck with epsilon
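// An element passes if either its absolute or its relative error is within epsilon;
// the small additive constant below guards against division by zero.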
for (size_t i = 0; i < size_t(size(reference)); ++i) {
auto ref = static_cast<float>(reference(i));
auto act = static_cast<float>(computed(i));
auto abs_error = std::abs(act - ref);
auto rel_error = abs_error / (std::max(std::abs(act), std::abs(ref)) + 0.00001f);
if (std::isnan(abs_error) || std::isnan(rel_error) ||
std::min(abs_error, rel_error) > epsilon) {
passed = false;
break;
}
}
}
#if CUTLASS_DEBUG_TRACE_LEVEL > 1
if (not passed) {
cute::print("Reference:");
cute::print_tensor(reference);
cute::print("\nComputed:");
cute::print_tensor(computed);
cute::print("\n");
for (size_t i = 0; i < size_t(size(A)); ++i) {
printf("[%zu]: A = %f\n", i, float(A(i)));
}
for (size_t i = 0; i < size_t(size(B)); ++i) {
printf("[%zu]: B = %f\n", i, float(B(i)));
}
if constexpr (IsBiasEnabled) {
for (size_t i = 0; i < size_t(size(tensor_bias)); ++i) {
printf("[%zu]: bias = %f\n", i, float(tensor_bias(i)));
}
}
for (size_t i = 0; i < size_t(size(reference)); ++i) {
printf("[%zu]: ref = %f, computed = %f\n", i, float(reference(i)), float(computed(i)));
}
}
#endif
return passed;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename Conv>
bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f) {
using ElementScalar = typename Conv::EpilogueOutputOp::ElementScalar;
bool passed = true;
ConvTestbed<Conv> testbed;
testbed.epsilon = epsilon;
auto problem_vector = get_conv_problem_vector<
Conv::NumSpatialDimensions, Conv::DispatchPolicy::ConvOp>();
for (auto conv_problem : problem_vector) {
#if CUTLASS_DEBUG_TRACE_LEVEL > 0
print(conv_problem);
#endif
passed = testbed.run(
conv_problem,
cutlass::from_real<ElementScalar>(alpha),
cutlass::from_real<ElementScalar>(beta));
if (!passed) {
printf("Failed test for "); print(conv_problem);
return false;
}
}
return passed;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace test::conv::device
/////////////////////////////////////////////////////////////////////////////////////////////////
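// Example usage (illustrative sketch): a unit test typically builds a device-level convolution
// operator elsewhere (e.g. via the CollectiveBuilder-based kernel definitions in the sibling
// SM90 test translation units; the names below are placeholders) and dispatches it through
// TestAllConv:
//
//   TEST(SM90_device_conv_fprop_example, 64x64x64) {
//     using Conv = /* cutlass::conv::device::ConvUniversalAdapter<SomeConvKernel> */;
//     EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(/*alpha=*/1.0, /*beta=*/0.5));
//   }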