CUTLASS 2.5

This commit is contained in:
Andrew Kerr 2021-02-26 09:58:26 -05:00
parent ccb697bac7
commit 0e13748649
771 changed files with 15476 additions and 1717 deletions

View File

@ -1,6 +1,19 @@
# NVIDIA CUTLASS Changelog
# CUTLASS 2.x
## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26)
* Tensor reductions
* User-supplied reduction operations across one or more dimensions of tensors with affine layouts
* Optimizations for vectorized memory accesses
* Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31)
* Fused inlined operations on Convolution input
* Vector broadcast and transformation on Convolution input
* Optimizations for 3-D convolution
* Tile iterators using precomputed delta table for three spatial dimensions
* Performance parity with 2-D convolution implementation
## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19)
* Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs
* Operators: forward (Fprop), backward data gradient (Dgrad), and backward weight gradient (Wgrad) convolution
@ -126,7 +139,7 @@
## Copyright
Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
```
Redistribution and use in source and binary forms, with or without modification, are permitted

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@ -32,7 +32,7 @@ endif()
message(STATUS "CMake Version: ${CMAKE_VERSION}")
project(CUTLASS VERSION 2.4.0 LANGUAGES CXX)
project(CUTLASS VERSION 2.5.0 LANGUAGES CXX)
include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
find_package(Doxygen QUIET)
@ -67,6 +67,8 @@ else()
set(CUTLASS_ENABLE_TOOLS_INIT ON)
endif()
set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.")
set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable CUTLASS Examples")
set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools")
set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Library")
@ -114,10 +116,6 @@ if (POLICY CMP0076)
cmake_policy(SET CMP0076 NEW)
endif()
if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 )
message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!")
endif()
include(GNUInstallDirs)
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
@ -257,6 +255,17 @@ if (NOT CMAKE_BUILD_TYPE MATCHES "Release")
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -lineinfo)
endif()
#Report CUDA build flags
if (CUDA_COMPILER MATCHES "[Cc]lang")
if(CUTLASS_CUDA_CLANG_FLAGS)
message(STATUS "Using CLANG flags: ${CUTLASS_CUDA_CLANG_FLAGS}")
endif()
else()
if(CUTLASS_CUDA_NVCC_FLAGS)
message(STATUS "Using NVCC flags: ${CUTLASS_CUDA_NVCC_FLAGS}")
endif()
endif()
if(CUDA_COMPILER MATCHES "[Cc]lang")
if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" )
message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" )
@ -318,20 +327,35 @@ function(cutlass_apply_cuda_gencode_flags TARGET)
endfunction()
# Cache the flags so they are available when the function below is called anywhere globally.
set(__CUTLASS_CUDA_FLAGS ${CUTLASS_CUDA_FLAGS} CACHE INTERNAL "")
set(__CUTLASS_CUDA_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} CACHE INTERNAL "")
set(__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__CUTLASS_CUDA_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS ${CUTLASS_CUDA_CLANG_FLAGS} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS_RELEASE ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__CUTLASS_CUDA_CLANG_FLAGS_DEBUG ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "")
set(__CUTLASS_CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG} CACHE INTERNAL "")
function(cutlass_apply_standard_compile_options TARGET)
if(CUDA_COMPILER MATCHES "[Cc]lang")
set(CUDA_COMPILE_LANGUAGE CXX)
set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_CLANG_FLAGS})
set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG})
set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_CLANG_FLAGS})
set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_CLANG_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_CLANG_FLAGS_DEBUG})
else()
set(CUDA_COMPILE_LANGUAGE CUDA)
set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_NVCC_FLAGS})
set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG})
set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_NVCC_FLAGS})
set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_NVCC_FLAGS_RELEASE})
set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO})
set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_NVCC_FLAGS_DEBUG})
endif()
target_compile_options(
@ -464,20 +488,6 @@ endif()
################################################################################
include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake)
if (CUTLASS_ENABLE_CUBLAS)
target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1)
endif()
include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake)
if (CUTLASS_ENABLE_CUDNN)
target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1)
endif()
################################################################################
include(CTest)
enable_testing()
if (NOT TARGET test_all)
@ -497,6 +507,22 @@ install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR})
install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_LIBDIR})
install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest)
################################################################################
include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake)
if (CUTLASS_ENABLE_CUBLAS)
target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1)
endif()
include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake)
if (CUTLASS_ENABLE_CUDNN)
target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1)
endif()
################################################################################
set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake)
set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "")

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@ -204,7 +204,7 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
# paths by default, so we add it explicitly here.
function(cutlass_correct_source_file_language_property)
if(CUDA_COMPILER MATCHES "clang")
if(CUDA_COMPILER MATCHES "[Cc]lang")
foreach(File ${ARGN})
if(File MATCHES ".*\.cu$")
set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)

View File

@ -1,8 +1,8 @@
![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
# CUTLASS 2.4
# CUTLASS 2.5
_CUTLASS 2.4 - November 2020_
_CUTLASS 2.5 - February 2021_
CUTLASS is a collection of CUDA C++ template abstractions for implementing
high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
@ -34,12 +34,18 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
See the [functionality listing](/media/docs/functionality.md) for the list of operations
supported at each level of the execution model hierarchy.
# What's New in CUTLASS 2.5
CUTLASS 2.5 is a minor update to CUTLASS adding:
- Tensor reductions (a conceptual sketch follows this list)
- Fused inlined operations on Convolution input
- Optimizations for 3-D convolution
- See the [CHANGELOG](CHANGELOG.md) for more details.
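The tensor-reduction item above is easiest to picture with a short host-side sketch. This is a conceptual illustration only, not the CUTLASS API: it reduces one dimension of a rank-2 tensor with an affine (strided) layout using a user-supplied binary operation, and all names in it are invented for the sketch.
```
#include <cstdint>
#include <vector>

// Conceptual sketch only (not the CUTLASS API): reduce a rank-2 tensor with an
// affine layout (explicit strides) over its second dimension using a
// user-supplied binary operation.
template <typename T, typename BinaryOp>
std::vector<T> reduce_dim1(const T* data, int64_t extent0, int64_t extent1,
                           int64_t stride0, int64_t stride1,
                           T identity, BinaryOp op) {
  std::vector<T> out(extent0, identity);
  for (int64_t i = 0; i < extent0; ++i) {
    for (int64_t j = 0; j < extent1; ++j) {
      // Affine addressing: offset = i * stride0 + j * stride1
      out[i] = op(out[i], data[i * stride0 + j * stride1]);
    }
  }
  return out;
}

// Example usage: sum-reduce the columns of a 4x3 row-major tensor.
// auto row_sums = reduce_dim1(ptr, 4, 3, /*stride0=*/3, /*stride1=*/1, 0.0f,
//                             [](float a, float b) { return a + b; });
```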
# What's New in CUTLASS 2.4
CUTLASS 2.4 is a significant update to CUTLASS adding:
- 1-D, 2-D, and 3-D convolution targeting Tensor and CUDA cores for NVIDIA Ampere, Turing, and Volta GPU architectures
- CUTLASS profiler support for convolution
- [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation
- See the [CHANGELOG](CHANGELOG.md) for more details.
# What's New in CUTLASS 2.3
@ -47,7 +53,6 @@ CUTLASS 2.3 is a minor update to CUTLASS adding:
- GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs
- Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores
- Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit)
- See the [CHANGELOG](CHANGELOG.md) for more details.
# What's New in CUTLASS 2.2
@ -508,7 +513,7 @@ The official list of CUTLASS developers and contributors is available here: [CON
# Copyright
Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
```
Redistribution and use in source and binary forms, with or without modification, are permitted

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -284,8 +284,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check whether the problem size is supported
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
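CUTLASS_CHECK is a helper macro defined in the example sources (not shown in this hunk). A minimal sketch of such a helper, assuming only cutlass::Status and cutlassGetStatusString() from cutlass/cutlass.h:
```
#include <cstdlib>
#include <iostream>
#include "cutlass/cutlass.h"

// Minimal sketch of an error-checking helper in the spirit of the examples'
// CUTLASS_CHECK: abort with a readable message if a CUTLASS call fails.
#define CUTLASS_CHECK(status)                                                   \
  {                                                                             \
    cutlass::Status error = (status);                                           \
    if (error != cutlass::Status::kSuccess) {                                   \
      std::cerr << "CUTLASS error: " << cutlass::cutlassGetStatusString(error)  \
                << " at " << __FILE__ << ":" << __LINE__ << std::endl;          \
      std::exit(EXIT_FAILURE);                                                  \
    }                                                                           \
  }
```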

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -266,8 +266,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check whether the problem size is supported
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -485,6 +485,7 @@ Result profile_convolution(Options const &options) {
// Split K dimension into 1 partitions
int split_k_slices = 1;
// Construct Conv2dProblemSize with user defined output size
cutlass::conv::Conv2dProblemSize problem_size(
options.input_size,
options.filter_size,
@ -495,6 +496,8 @@ Result profile_convolution(Options const &options) {
mode,
split_k_slices);
// Construct ImplicitGemm::Argument structure with conv2d
// problem size, data pointers, and epilogue values
typename ImplicitGemm::Arguments arguments{
problem_size,
tensor_a.device_ref(),
@ -515,6 +518,9 @@ Result profile_convolution(Options const &options) {
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
result.status = implicit_gemm_op.can_implement(arguments);
CUTLASS_CHECK(result.status);
result.status = implicit_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(result.status);
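For orientation, the surrounding steps of this example follow the usual device-side flow: query the workspace, check the problem, initialize, then launch. A condensed sketch that reuses the ImplicitGemm type, arguments, and result struct already defined in this example (so it is not self-contained on its own):
```
// Condensed sketch of the launch sequence (types come from this example).
size_t workspace_size = ImplicitGemm::get_workspace_size(arguments);
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

ImplicitGemm implicit_gemm_op;

result.status = implicit_gemm_op.can_implement(arguments);   // is the problem supported?
CUTLASS_CHECK(result.status);

result.status = implicit_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(result.status);

result.status = implicit_gemm_op();                          // launch the kernel
CUTLASS_CHECK(result.status);
```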

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -48,11 +48,19 @@ using ElementInputA = cutlass::half_t; // <- data type of elements
using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B
using ElementOutput = float; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Column Major for
// Matrix A, Row Major for Matrix B and Row Major for Matrix C
// The code section below describes matrix layout of input and output matrices.
// Column Major for Matrix A, B and C.
//
// Note this example only works for ColumnMajor output because
// 1) only a row-major epilogue is available.
// 2) if the output is column major, we swap A and B so that the row-major
//    epilogue can still be used.
// 3) the Mx1 bias vector becomes a 1xM row vector after the swap/transpose.
// 4) the existing OutputIterator can then load the 1xM bias vector.
using LayoutInputA = cutlass::layout::ColumnMajor;
using LayoutInputB = cutlass::layout::ColumnMajor;
using LayoutOutput = cutlass::layout::RowMajor;
using LayoutOutput = cutlass::layout::ColumnMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;
@ -73,17 +81,18 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw
// Define the epilogue operation as LinearCombinationRelu. This is approximately equal to
//
// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij )
// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + c_ij )
//
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput, // <- data type of output matrix
128 / cutlass::sizeof_bits<ElementOutput>::value, // <- this is the number of elements per
// vectorized memory access. For half
// precision, it's 8 elements. This becomes
// the vector width of math instructions in
// epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
ElementOutput, // <- data type of output matrix
128 / cutlass::sizeof_bits<ElementOutput>::value, // <- this is the number of elements per
// vectorized memory access. For half
// precision, it's 8 elements. This becomes
// the vector width of math instructions in
// epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue, // <- data type for alpha in linear combination function
cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // <- alpha x C + bias
// Number of pipelines you want to use
constexpr int NumStages = 2;
@ -160,9 +169,8 @@ int run() {
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
// Initialize alpha for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split K dimension into 1 partitions
int split_k_slices = 1;
@ -178,7 +186,7 @@ int run() {
// to project away the N dimension by setting the stride to zero.
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
{alpha}, // <- alpha
split_k_slices}; // <- k-dimension split factor
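The zero-stride projection mentioned in the comment above can be made explicit. A minimal sketch, assuming the tensor_c_bias allocation and the ElementOutput/LayoutOutput aliases from this example:
```
// Sketch: a TensorRef whose layout stride is 0 makes every column alias the
// first one, so the Mx1 bias vector is read as if it were an MxN matrix C
// without ever materializing the broadcast.
cutlass::TensorRef<ElementOutput, LayoutOutput> bias_ref(
    tensor_c_bias.device_data(),   // pointer to the Mx1 bias column
    LayoutOutput(0));              // leading stride 0 projects away the N dimension
```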
// Using the arguments, query for extra workspace required for matrix multiplication computation
@ -190,8 +198,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check whether the problem size is supported
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
@ -233,7 +245,7 @@ int run() {
for (int j = 0; j < problem_size.n(); ++j) {
tensor_ref_d.at({i, j}) = std::max(
ElementOutput(0),
ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0}))
ElementOutput(tensor_ref_d.at({i, j}) + tensor_c_bias.at({i, 0}))
);
}
}

View File

@ -0,0 +1,45 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
13_fused_two_gemms
fused_gemm.cu
)
cutlass_example_add_executable(
13_fused_two_convs
fused_conv2d.cu
)
target_include_directories(
13_fused_two_gemms
PRIVATE
.
)
target_include_directories(
13_fused_two_convs
PRIVATE
.
)

View File

@ -0,0 +1,76 @@
# Introduction
This example shows fusing two back-to-back GEMMs/Convolutions into one kernel.
<p align="center"><img src=/media/images/13_example_fusion.png></p>
When running two unfused GEMM/Conv operations, each operation loads one input
activation matrix and one weight (filter) matrix from memory, and then stores
the resulting activation matrix back to memory.
When the two GEMM/Conv operations are fused together, the mainloops of the two
GEMMs/Convs run back to back in a single kernel. The output accumulator of the
1st GEMM/Conv will be stored in the register file and reused as the activation
input of the 2nd GEMM/Conv. This saves a round trip to memory for the activation
matrix.
This example computes the following:
- 1st GEMM/Conv: D0 = relu(alpha0 .\* A0 \*\* B0)
- 2nd GEMM/Conv: D1 = relu(alpha1 .\* D0 \*\* B1 + beta1 .\* C1)
In the above equation, the operator \*\* denotes either a matrix multiplication or a convolution. A small host-side reference sketch of the GEMM case follows.
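Below is a host-side reference sketch of the fused computation for the GEMM case; the matrix sizes, names, and row-major layout are assumptions made only for this illustration.
```
#include <algorithm>
#include <vector>

// Host reference for the fused computation (GEMM case, row-major matrices):
//   D0 = relu(alpha0 * A0 * B0)
//   D1 = relu(alpha1 * D0 * B1 + beta1 * C1)
std::vector<float> b2b_gemm_reference(
    const std::vector<float>& A0, const std::vector<float>& B0,
    const std::vector<float>& B1, const std::vector<float>& C1,
    int M, int K0, int N0, int N1, float alpha0, float alpha1, float beta1) {

  std::vector<float> D0(M * N0), D1(M * N1);

  // 1st GEMM: D0 = relu(alpha0 * A0 * B0)
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N0; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K0; ++k) acc += A0[i * K0 + k] * B0[k * N0 + j];
      D0[i * N0 + j] = std::max(0.f, alpha0 * acc);
    }

  // 2nd GEMM: D1 = relu(alpha1 * D0 * B1 + beta1 * C1); D0 is the activation input
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N1; ++j) {
      float acc = 0.f;
      for (int k = 0; k < N0; ++k) acc += D0[i * N0 + k] * B1[k * N1 + j];
      D1[i * N1 + j] = std::max(0.f, alpha1 * acc + beta1 * C1[i * N1 + j]);
    }

  return D1;
}
```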
# Implementation Details
In order to run two GEMMs/Convs in a single kernel, the example requires that the same number of
threadblocks be used across the two GEMMs/Convs. This also ensures the same threadblock tile M
across the two GEMMs/Convs.
In order to reuse the output accumulator (stored in the register file) of the 1st GEMM as the
input activation of the 2nd, the example enforces the following two constraints (a compile-time
sketch follows this section):
- thread_block_tile_N = problem_N
<p align="center"><img src=/media/images/13_example_block_resident_fusion.png></p>
This constraint ensures that each threadblock loads the entire weight/filter matrix in
addition to its own input activation tile. Therefore the input activation tile of the
2nd GEMM/Conv only depends on the output activation tile of the 1st GEMM/Conv, and the
operation can be fully block-resident.
- warp_tile_N = thread_block_tile_N
<p align="center"><img src=/media/images/13_example_rf_resident_fusion.png></p>
This constraint ensures that each warp loads the entire weight/filter kBlock in
addition to its own input activation tile. Therefore the input activation warp tile of the
2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
register file, and the operation can be fully register-file-resident.
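The two constraints above can also be written down as compile-time checks. A sketch, assuming illustrative tile shapes and a problem N of 64 (the shapes used later in this example are analogous):
```
#include "cutlass/gemm/gemm.h"   // cutlass::gemm::GemmShape

// Illustrative tile shapes (GemmShape<M, N, K>); the problem N below is an assumption.
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape1        = cutlass::gemm::GemmShape<32, 64, 32>;
constexpr int kProblemN = 64;    // N of the 2nd GEMM / output channels of the 2nd Conv

// Constraint 1: thread_block_tile_N == problem_N (block-resident fusion)
static_assert(ThreadblockShape1::kN == kProblemN,
              "threadblock tile N must equal the problem N");

// Constraint 2: warp_tile_N == thread_block_tile_N (register-file-resident fusion)
static_assert(WarpShape1::kN == ThreadblockShape1::kN,
              "warp tile N must equal the threadblock tile N");
```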
# Copyright
Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
```
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or other materials
provided with the distribution.
* Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
to endorse or promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

View File

@ -0,0 +1,368 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
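For orientation, a short sketch of how these two problem sizes map onto implicit-GEMM dimensions, assuming the cutlass::conv::implicit_gemm_problem_size() helper from the conv problem-size headers:
```
// Sketch: the implicit-GEMM view of the two Conv2d problem sizes above.
// For Fprop: GemmM = N*P*Q, GemmN = K, GemmK = R*S*C.
cutlass::gemm::GemmCoord gemm0 = cutlass::conv::implicit_gemm_problem_size(
    cutlass::conv::Operator::kFprop, conv2d_f16_sm75_problem_size_0);
// gemm0 ~ {128*56*56, 64, 3*3*64}

cutlass::gemm::GemmCoord gemm1 = cutlass::conv::implicit_gemm_problem_size(
    cutlass::conv::Operator::kFprop, conv2d_f16_sm75_problem_size_1);
// gemm1 ~ {128*56*56, 64, 1*1*64}; GemmN of the 1st conv (64) equals GemmK of the
// 2nd (1*1*64) because the output of the 1st conv feeds the 2nd, which is what
// makes the register-file-resident fusion possible.
```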
void run_nonfused_conv2d_fprop_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
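The example's real entry point is defined elsewhere and is not shown in this commit hunk; purely as an illustration, a driver might call the functions above like this:
```
// Illustrative driver only; the example's actual main() lives elsewhere.
int main() {
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
  run_nonfused_conv2d_fprop_f16_sm75();
  run_fused_conv2d_fprop_f16_sm75();
  run_nonfused_conv2d_fprop_optimized_f16_sm75();
  run_fused_conv2d_fprop_optimized_f16_sm75();
#else
  std::cout << "This test requires SM75 Tensor Core support.\n";
#endif
  return 0;
}
```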

View File

@ -0,0 +1,363 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide GEMM interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
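The two runners above only define and time the kernels; this hunk does not show the example's entry point. As a hedged illustration, a driver along these lines could exercise the fused path (the main() below is hypothetical and not taken from this commit; the matching non-fused runner defined earlier in the file would be called the same way):

// Hypothetical driver, for illustration only.
int main() {
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
  // Prints timing and a Pass/Fail verdict against the device reference.
  run_fused_conv2d_fprop_optimized_f16_sm80();
#else
  std::cout << "This example requires CUTLASS_ARCH_MMA_SM80_SUPPORTED.\n";
#endif
  return 0;
}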

View File

@ -0,0 +1,367 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide back-to-back Conv2d Fprop interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
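// Sanity check (general convolution arithmetic, not taken from CUTLASS source):
// the output extents above follow
//   P = (H + 2 * pad_h - dilation_h * (R - 1) - 1) / stride_h + 1
//   Q = (W + 2 * pad_w - dilation_w * (S - 1) - 1) / stride_w + 1
// For the 3x3 problem: (56 + 2 - 2 - 1) / 1 + 1 = 56, and for the 1x1
// problem: (56 + 0 - 0 - 1) / 1 + 1 = 56, matching the NPQK sizes given.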
void run_nonfused_conv2d_fprop_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
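Both SM75 INT8 paths above use 32-element interleaved layouts (TensorNCxHWx<32> for activations and output, TensorCxRSKx<32> for filters), which is why the channel counts in both problem sizes are multiples of 32. As a hedged sketch of what that interleaving means for addressing — this helper assumes the common NC/32HW32 convention and is illustrative, not the CUTLASS layout implementation:

#include <cstdint>

// Linear offset of element (n, c, h, w) in an NC/32HW32-style tensor:
// channels are split into groups of 32, and the 32-wide group is the
// fastest-varying dimension.
inline int64_t nc32hw32_offset(int n, int c, int h, int w,
                               int C, int H, int W) {
  int const kInterleave = 32;
  int c_outer = c / kInterleave;  // which channel group
  int c_inner = c % kInterleave;  // position inside the group
  return (((int64_t(n) * (C / kInterleave) + c_outer) * H + h) * int64_t(W) + w)
             * kInterleave + c_inner;
}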

View File

@ -0,0 +1,368 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide back-to-back Conv2d Fprop interface
*/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 3, 3, 64}, // filter size (KRSC)
{1, 1, 1, 1}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
{128, 56, 56, 64}, // input size (NHWC)
{64, 1, 1, 64}, // filter size (KRSC)
{0, 0, 0, 0}, // padding (pad_h, _, pad_w, _)
{1, 1}, // stride (stride_h, stride_w)
{1, 1}, // dilation (dilation_h, dilation_w)
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
using ElementC = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
ElementCompute alpha0 = ElementCompute(1);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(1);
ElementCompute beta1 = ElementCompute(0);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>;
using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
ElementA, cutlass::layout::TensorNCxHWx<32>,
ElementB, cutlass::layout::TensorCxRSKx<32>,
ElementC, cutlass::layout::TensorNCxHWx<32>,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
cutlass::arch::OpMultiplyAddSaturate,
cutlass::conv::IteratorAlgorithm::kOptimized
>::Kernel;
using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -0,0 +1,628 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Back-to-back implicit GEMM convolution testbed
*/
#pragma once
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/reduction/device/reduce_split_k.h"
#include "cutlass/reduction/thread/reduction_operators.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/device/convolution.h"
#include "cutlass/util/reference/device/tensor_relu.h"
#include "cutlass/core_io.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
#define CHECK_GT(val1, val2) \
if((val1) <= (val2)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
#define CHECK_TRUE(val) \
if(!(val)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
template <typename Conv2d0_, typename Conv2d1_>
class B2bNonFusedConv2dRun {
public:
using Conv2d0 = Conv2d0_;
using Conv2d1 = Conv2d1_;
using ElementAccumulator = typename Conv2d0::ElementAccumulator;
using ElementCompute = typename Conv2d0::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator;
static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator,
"Fused convolution operators must be the same");
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
uint64_t seed;
cutlass::HostTensor<typename Conv2d0::ElementA, typename Conv2d0::LayoutA> tensor_A0;
cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_C0;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_computed;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_C1;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_reference;
public:
B2bNonFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
// no initialization is performed for other distribution kinds
}
}
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
tensor_D0_computed.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1);
// configure the operator
Conv2d0 conv2d_op_0;
Conv2d1 conv2d_op_1;
typename Conv2d0::Arguments conv2d_args_0(
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_D0_computed.device_ref(),
{alpha0, beta0},
split_k_mode
);
typename Conv2d1::Arguments conv2d_args_1(
problem_size_1,
tensor_D0_computed.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_computed.device_ref(),
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0);
CUTLASS_CHECK(status);
status = conv2d_op_1.initialize(conv2d_args_1);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = conv2d_op_0();
CUTLASS_CHECK(status);
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
//
// Run Conv2d
//
cudaEvent_t start, stop1, stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop1);
cudaEventCreate(&stop2);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop2);
cudaDeviceSynchronize();
float conv2d0Time, conv2d1Time, totalTime;
cudaEventElapsedTime(&conv2d0Time, start, stop1);
cudaEventElapsedTime(&conv2d1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
std::cout << "total time " << totalTime / (float)runs << " ms\n";
tensor_D0_computed.sync_host();
tensor_D1_computed.sync_host();
bool passed = false;
cutlass::reference::device::Conv2d<
typename Conv2d0::ElementA,
typename Conv2d0::LayoutA,
typename Conv2d0::ElementB,
typename Conv2d0::LayoutB,
typename Conv2d0::ElementC,
typename Conv2d0::LayoutC,
ElementCompute,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
beta0);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename Conv2d1::ElementA,
typename Conv2d1::LayoutA,
typename Conv2d1::ElementB,
typename Conv2d1::LayoutB,
typename Conv2d1::ElementC,
typename Conv2d1::LayoutC,
ElementCompute,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_nonfused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n"
<< "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
template <typename B2bConv2d_>
class B2bFusedConv2dRun {
public:
using B2bConv2d = B2bConv2d_;
using ElementAccumulator = typename B2bConv2d::ElementAccumulator;
using ElementCompute = typename B2bConv2d::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator;
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
uint64_t seed;
cutlass::HostTensor<typename B2bConv2d::ElementA, typename B2bConv2d::LayoutA> tensor_A0;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C0;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C1;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_reference;
public:
B2bFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
// no initialization is performed for other distribution kinds
}
}
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_C0.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_C1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1);
// configure the operator
B2bConv2d b2b_conv2d_op;
typename B2bConv2d::Arguments b2b_conv2d_args(
problem_size_0,
problem_size_1,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_computed.device_ref(),
{alpha0, beta0},
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
//
// Run the Conv2d
//
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop);
cudaDeviceSynchronize();
float conv2dTime;
cudaEventElapsedTime(&conv2dTime, start, stop);
std::cout << "time " << conv2dTime / (float)runs << " ms\n";
tensor_D1_computed.sync_host();
bool passed = false;
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
beta0);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
ElementAccumulator
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_fused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
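The runners in this header report raw milliseconds averaged over the runs parameter. A small helper along the following lines could convert that into throughput; it is a sketch under the usual convention of 2 * N * P * Q * K * R * S * C floating-point operations per Fprop and is not part of this header:

// Illustrative helper: convert an averaged Fprop time (ms) into GFLOP/s.
inline double fprop_gflops(double avg_ms,
                           long long n, long long p, long long q,
                           long long k, long long r, long long s, long long c) {
  double flops = 2.0 * double(n) * double(p) * double(q) *
                 double(k) * double(r) * double(s) * double(c);
  return flops / (avg_ms * 1.0e-3) / 1.0e9;  // ms -> s, then to giga
}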

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -43,14 +43,15 @@
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_f16() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
@ -110,7 +111,7 @@ void run_nonfused_gemm_f16() {
B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool pass = nonFusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
@ -123,8 +124,6 @@ void run_fused_gemm_f16() {
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
@ -178,7 +177,7 @@ void run_fused_gemm_f16() {
B2bFusedGemmRun<B2bGemm> fusedGemm;
std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else

View File

@ -0,0 +1,189 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
ElementCompute beta1 = ElementCompute(1);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using Gemm0 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
WarpShape0,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
using Gemm1 = cutlass::gemm::device::Gemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape1,
WarpShape1,
InstructionShape,
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
bool pass = nonFusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
void run_fused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
using ElementCompute = cutlass::half_t;
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
ElementCompute beta1 = ElementCompute(1);
using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
using EpilogueOutputOp0 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
>;
using EpilogueOutputOp1 =
cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
>;
using B2bGemm = cutlass::gemm::device::B2bGemm<
cutlass::half_t,
cutlass::layout::RowMajor,
cutlass::half_t,
cutlass::layout::ColumnMajor,
ElementOutput,
cutlass::layout::RowMajor,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape0,
ThreadblockShape1,
WarpShape0,
WarpShape1,
InstructionShape,
EpilogueOutputOp0,
EpilogueOutputOp1,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
>;
B2bFusedGemmRun<B2bGemm> fusedGemm;
std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else
std::cout << "Fail\n";
}
////////////////////////////////////////////////////////////////////////////////
#endif //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -121,7 +121,9 @@ struct B2bNonFusedGemmRun
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true) {
bool relu = true,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
@ -222,6 +224,14 @@ struct B2bNonFusedGemmRun
status = gemm_op_1.initialize(arguments_1);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
status = gemm_op_1();
CUTLASS_CHECK(status);
}
//
// Run the GEMM
//
@ -233,13 +243,13 @@ struct B2bNonFusedGemmRun
cudaEventRecord(start);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = gemm_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = gemm_op_1();
@ -252,9 +262,9 @@ struct B2bNonFusedGemmRun
cudaEventElapsedTime(&gemm0Time, start, stop1);
cudaEventElapsedTime(&gemm1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n";
std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n";
std::cout << "total time " << totalTime / 100.0 << " ms\n";
std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
std::cout << "total time " << totalTime / (float)runs << " ms\n";
tensor_D0.sync_host();
tensor_D1.sync_host();
@ -415,7 +425,9 @@ struct B2bFusedGemmRun
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true) {
bool relu = true,
int warm_ups = 1,
int runs = 100) {
//
// Allocate the GEMM workspace
@ -433,10 +445,6 @@ struct B2bFusedGemmRun
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());
// cutlass::HostTensor<
// typename B2bGemm::ElementC,
// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
cutlass::HostTensor<
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn());
@ -503,6 +511,11 @@ struct B2bFusedGemmRun
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
}
//
// Run the GEMM
//
@ -513,7 +526,7 @@ struct B2bFusedGemmRun
cudaEventRecord(start);
for(int i = 0; i < 100; i++) {
for(int i = 0; i < runs; i++) {
status = b2b_gemm_op();
CUTLASS_CHECK(status);
@ -523,9 +536,8 @@ struct B2bFusedGemmRun
cudaDeviceSynchronize();
float gemmTime;
cudaEventElapsedTime(&gemmTime, start, stop);
std::cout << "time " << gemmTime / 100.0 << " ms\n";
std::cout << "time " << gemmTime / (float)runs << " ms\n";
//tensor_D0.sync_host();
tensor_D1.sync_host();
//
@ -593,7 +605,6 @@ struct B2bFusedGemmRun
<< "A0 =\n" << tensor_A0.host_view()
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
// << "\nD0 =\n" << tensor_D0.host_view()
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()
<< "\n\nReference =\n" << reference_D1.host_view()

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -43,14 +43,15 @@
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_s8() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
@ -110,7 +111,7 @@ void run_nonfused_gemm_s8() {
B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool pass = nonFusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
@ -123,8 +124,6 @@ void run_fused_gemm_s8() {
using ElementAccumulator = int32_t;
using ElementCompute = float;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
@ -178,7 +177,7 @@ void run_fused_gemm_s8() {
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -43,14 +43,15 @@
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_s8_sm80() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
using ElementCompute = float;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
@ -86,8 +87,7 @@ void run_nonfused_gemm_s8_sm80() {
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
true
cutlass::arch::OpMultiplyAddSaturate
>;
using Gemm1 = cutlass::gemm::device::Gemm<
int8_t,
@ -113,14 +113,13 @@ void run_nonfused_gemm_s8_sm80() {
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
true
cutlass::arch::OpMultiplyAddSaturate
>;
B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool pass = nonFusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(pass)
std::cout << "Pass\n";
else
@ -133,8 +132,6 @@ void run_fused_gemm_s8_sm80() {
using ElementAccumulator = int32_t;
using ElementCompute = float;
cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64);
ElementCompute alpha0 = ElementCompute(2);
ElementCompute beta0 = ElementCompute(0);
ElementCompute alpha1 = ElementCompute(2);
@ -193,7 +190,7 @@ void run_fused_gemm_s8_sm80() {
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1);
bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
if(passed)
std::cout << "Pass\n";
else

View File

@ -0,0 +1,661 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Implicit GEMM testbed
*/
#pragma once
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/reduction/device/reduce_split_k.h"
#include "cutlass/reduction/thread/reduction_operators.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/device/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/device/convolution.h"
#include "cutlass/util/reference/device/tensor_relu.h"
#include "cutlass/core_io.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
#define CHECK_GT(val1, val2) \
if((val1) <= (val2)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
#define CHECK_TRUE(val) \
if(!(val)) \
std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
template <typename Conv2d0_, typename Conv2d1_, int InterleavedK>
class B2bInterleavedNonFusedConv2dRun {
public:
using Conv2d0 = Conv2d0_;
using Conv2d1 = Conv2d1_;
using ElementAccumulator = typename Conv2d0::ElementAccumulator;
using ElementCompute = typename Conv2d0::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator;
static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator,
"Fused convolution operators must be the same");
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
uint64_t seed;
cutlass::HostTensor<typename Conv2d0::ElementA, typename Conv2d0::LayoutA> tensor_A0;
cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0;
cutlass::HostTensor<typename Conv2d0::ElementB, typename Conv2d0::LayoutB> tensor_B0_reordered;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_C0;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_computed;
cutlass::HostTensor<typename Conv2d0::ElementC, typename Conv2d0::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1;
cutlass::HostTensor<typename Conv2d1::ElementB, typename Conv2d1::LayoutB> tensor_B1_reordered;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_C1;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename Conv2d1::ElementC, typename Conv2d1::LayoutC> tensor_D1_reference;
public:
B2bInterleavedNonFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
}
}
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
//Reorder B0 and B1
cutlass::reorder_convK<InterleavedK, InterleavedK>(
tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0));
cutlass::reorder_convK<InterleavedK, InterleavedK>(
tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1));
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_B0_reordered.sync_device();
tensor_C0.sync_device();
tensor_D0_computed.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_B1_reordered.sync_device();
tensor_C1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1);
// configure the operator
Conv2d0 conv2d_op_0;
Conv2d1 conv2d_op_1;
typename Conv2d0::Arguments conv2d_args_0(
problem_size_0,
tensor_A0.device_ref(),
tensor_B0_reordered.device_ref(),
tensor_C0.device_ref(),
tensor_D0_computed.device_ref(),
{alpha0, beta0},
split_k_mode
);
typename Conv2d1::Arguments conv2d_args_1(
problem_size_1,
tensor_D0_computed.device_ref(),
tensor_B1_reordered.device_ref(),
tensor_C1.device_ref(),
tensor_D1_computed.device_ref(),
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0);
CUTLASS_CHECK(status);
status = conv2d_op_1.initialize(conv2d_args_1);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = conv2d_op_0();
CUTLASS_CHECK(status);
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
//
// Run Conv2d
//
cudaEvent_t start, stop1, stop2;
cudaEventCreate(&start);
cudaEventCreate(&stop1);
cudaEventCreate(&stop2);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_0();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop1);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = conv2d_op_1();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop2);
cudaDeviceSynchronize();
float conv2d0Time, conv2d1Time, totalTime;
cudaEventElapsedTime(&conv2d0Time, start, stop1);
cudaEventElapsedTime(&conv2d1Time, stop1, stop2);
cudaEventElapsedTime(&totalTime, start, stop2);
std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
std::cout << "total time " << totalTime / (float)runs << " ms\n";
tensor_D0_computed.sync_host();
tensor_D1_computed.sync_host();
bool passed = false;
cutlass::reference::device::Conv2d<
typename Conv2d0::ElementA,
typename Conv2d0::LayoutA,
typename Conv2d0::ElementB,
typename Conv2d0::LayoutB,
typename Conv2d0::ElementC,
typename Conv2d0::LayoutC,
ElementCompute,
ElementAccumulator,
cutlass::NumericConverterClamp<typename Conv2d0::ElementC, ElementCompute>
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
beta0);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename Conv2d1::ElementA,
typename Conv2d1::LayoutA,
typename Conv2d1::ElementB,
typename Conv2d1::LayoutB,
typename Conv2d1::ElementC,
typename Conv2d1::LayoutC,
ElementCompute,
ElementAccumulator,
cutlass::NumericConverterClamp<typename Conv2d1::ElementC, ElementCompute>
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_interleaved_nonfused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n"
<< "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
template <typename B2bConv2d_, int InterleavedK>
class B2bInterleavedFusedConv2dRun {
public:
using B2bConv2d = B2bConv2d_;
using ElementAccumulator = typename B2bConv2d::ElementAccumulator;
using ElementCompute = typename B2bConv2d::ElementCompute;
static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator;
public:
/// Initialization
cutlass::Distribution::Kind init_A;
cutlass::Distribution::Kind init_B;
cutlass::Distribution::Kind init_C;
uint64_t seed;
cutlass::HostTensor<typename B2bConv2d::ElementA, typename B2bConv2d::LayoutA> tensor_A0;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B0_reordered;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C0;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D0_reference;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1;
cutlass::HostTensor<typename B2bConv2d::ElementB, typename B2bConv2d::LayoutB> tensor_B1_reordered;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_C1;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_computed;
cutlass::HostTensor<typename B2bConv2d::ElementC, typename B2bConv2d::LayoutC> tensor_D1_reference;
public:
B2bInterleavedFusedConv2dRun(
cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
uint64_t seed_ = 2080
):
init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {
}
/// Helper to initialize a tensor view
template <typename Element, typename Layout>
void initialize_tensor(
cutlass::TensorView<Element, Layout> view,
cutlass::Distribution::Kind dist_kind,
uint64_t seed) {
if (dist_kind == cutlass::Distribution::Uniform) {
int scope;
int bits = cutlass::sizeof_bits<Element>::value;
if (bits <= 16) {
scope = 2;
}
else {
scope = 8;
}
cutlass::reference::host::TensorFillRandomUniform(
view, seed, scope, -scope, 0);
}
else if (dist_kind == cutlass::Distribution::Identity) {
cutlass::reference::host::TensorFillIdentity(view);
}
else if (dist_kind == cutlass::Distribution::Gaussian) {
cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
}
else if (dist_kind == cutlass::Distribution::Sequential) {
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
}
}
void initialize(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) {
tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0));
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0));
tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0));
tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1));
tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1));
initialize_tensor(tensor_A0.host_view(), init_A, seed);
initialize_tensor(tensor_B0.host_view(), init_B, seed * 17);
initialize_tensor(tensor_C0.host_view(), init_C, seed * 39);
initialize_tensor(tensor_B1.host_view(), init_B, seed * 18);
initialize_tensor(tensor_C1.host_view(), init_C, seed * 40);
//Reorder B0 and B1
cutlass::reorder_convK<16, InterleavedK>(
tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0));
cutlass::reorder_convK<InterleavedK, InterleavedK>(
tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1));
tensor_A0.sync_device();
tensor_B0.sync_device();
tensor_B0_reordered.sync_device();
tensor_C0.sync_device();
tensor_D0_reference.sync_device();
tensor_B1.sync_device();
tensor_B1_reordered.sync_device();
tensor_C1.sync_device();
tensor_D1_computed.sync_device();
tensor_D1_reference.sync_device();
}
/// Executes one test
bool run(
cutlass::conv::Conv2dProblemSize const &problem_size_0,
cutlass::conv::Conv2dProblemSize const &problem_size_1,
cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial,
ElementCompute alpha0 = ElementCompute(1),
ElementCompute beta0 = ElementCompute(0),
ElementCompute alpha1 = ElementCompute(1),
ElementCompute beta1 = ElementCompute(0),
bool relu = true,
int warm_ups = 1,
int runs = 100) {
initialize(problem_size_0, problem_size_1);
// configure the operator
B2bConv2d b2b_conv2d_op;
typename B2bConv2d::Arguments b2b_conv2d_args(
problem_size_0,
problem_size_1,
tensor_A0.device_ref(),
tensor_B0_reordered.device_ref(),
tensor_C0.device_ref(),
tensor_B1_reordered.device_ref(),
tensor_C1.device_ref(),
tensor_D1_computed.device_ref(),
{alpha0, beta0},
{alpha1, beta1},
split_k_mode
);
cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args);
CUTLASS_CHECK(status);
for(int i = 0; i < warm_ups; i++) {
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
//
// Run the Conv2d
//
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for(int i = 0; i < runs; i++) {
// run conv2d operator
status = b2b_conv2d_op();
CUTLASS_CHECK(status);
}
cudaEventRecord(stop);
cudaDeviceSynchronize();
float conv2dTime;
cudaEventElapsedTime(&conv2dTime, start, stop);
std::cout << "time " << conv2dTime / (float)runs << " ms\n";
tensor_D1_computed.sync_host();
bool passed = false;
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
ElementAccumulator,
cutlass::NumericConverterClamp<typename B2bConv2d::ElementC, ElementCompute>
>(
kConvolutionalOperator,
problem_size_0,
tensor_A0.device_ref(),
tensor_B0.device_ref(),
tensor_C0.device_ref(),
tensor_D0_reference.device_ref(),
alpha0,
beta0);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view());
}
cutlass::reference::device::Conv2d<
typename B2bConv2d::ElementA,
typename B2bConv2d::LayoutA,
typename B2bConv2d::ElementB,
typename B2bConv2d::LayoutB,
typename B2bConv2d::ElementC,
typename B2bConv2d::LayoutC,
ElementCompute,
ElementAccumulator,
cutlass::NumericConverterClamp<typename B2bConv2d::ElementC, ElementCompute>
>(
kConvolutionalOperator,
problem_size_1,
tensor_D0_reference.device_ref(),
tensor_B1.device_ref(),
tensor_C1.device_ref(),
tensor_D1_reference.device_ref(),
alpha1,
beta1);
if(relu) {
cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view());
}
cudaError_t result = cudaDeviceSynchronize();
CHECK_TRUE(result == cudaSuccess);
// sync host (copy device data to host) for dumping error output in case of mismatches
tensor_D0_reference.sync_host();
tensor_D1_reference.sync_host();
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0);
CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0);
passed = cutlass::reference::host::TensorEquals(
tensor_D1_computed.host_view(),
tensor_D1_reference.host_view());
CHECK_TRUE(passed);
if (!passed) {
std::stringstream fname;
fname << "error_B2bImplicitGemm_device_interleaved_fused.txt";
std::cerr << "Dumping results in " << fname.str() << "\n";
std::ofstream results(fname.str());
results << problem_size_0 << std::endl;
results << problem_size_1 << std::endl;
results
<< "\nA0:\n" << tensor_A0.host_view() << "\n"
<< "\nB0:\n" << tensor_B0.host_view() << "\n"
<< "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n"
<< "\nC0:\n" << tensor_C0.host_view() << "\n"
<< "\nB1:\n" << tensor_B1.host_view() << "\n"
<< "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n"
<< "\nC1:\n" << tensor_C1.host_view() << "\n"
<< "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n"
<< "\nD1 computed:\n" << tensor_D1_computed.host_view();
}
return passed;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
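As a rough usage sketch (not part of this header), the non-fused runner above could be driven as follows. Conv2d0 and Conv2d1 stand for device-level interleaved convolution types that would normally come from one of the b2b_conv2d_fprop_*.h example headers, and the problem sizes are illustrative only.
// Hypothetical driver for B2bInterleavedNonFusedConv2dRun; Conv2d0/Conv2d1 are
// assumed to be CUTLASS device-level convolutions using NC/32HW32 layouts.
template <typename Conv2d0, typename Conv2d1>
bool run_b2b_interleaved_conv2d_example() {
  cutlass::conv::Conv2dProblemSize problem_size_0(
    {32, 56, 56, 64},    // input (N, H, W, C)
    {64, 3, 3, 64},      // filter (K, R, S, C)
    {1, 1, 1, 1},        // padding
    {1, 1},              // stride
    {1, 1});             // dilation
  cutlass::conv::Conv2dProblemSize problem_size_1(
    {32, 56, 56, 64},
    {128, 1, 1, 64},
    {0, 0, 0, 0},
    {1, 1},
    {1, 1});
  B2bInterleavedNonFusedConv2dRun<Conv2d0, Conv2d1, 32> runner;
  // Defaults: serial split-K, alpha = 1, beta = 0, ReLU enabled, 1 warm-up, 100 timed runs.
  bool passed = runner.run(problem_size_0, problem_size_1);
  std::cout << (passed ? "Pass" : "Fail") << "\n";
  return passed;
}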

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -243,6 +243,7 @@ struct B2bInterleavedNonFusedGemmRun
status = gemm_op_1();
CUTLASS_CHECK(status);
}
//
// Run the GEMM
//
@ -455,10 +456,6 @@ struct B2bInterleavedFusedGemmRun
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());
// cutlass::HostTensor<
// typename B2bGemm::ElementC,
// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());
cutlass::HostTensor<
typename B2bGemm::ElementC,
typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn());
@ -507,7 +504,6 @@ struct B2bInterleavedFusedGemmRun
tensor_B0.sync_device();
tensor_B0_reordered.sync_device();
tensor_C0.sync_device();
//tensor_D0.sync_device();
tensor_B1.sync_device();
tensor_B1_reordered.sync_device();
tensor_C1.sync_device();
@ -566,7 +562,6 @@ struct B2bInterleavedFusedGemmRun
cudaEventElapsedTime(&gemmTime, start, stop);
std::cout << "time " << gemmTime / (float)runs << " ms\n";
//tensor_D0.sync_host();
tensor_D1.sync_host();
//
@ -635,7 +630,6 @@ struct B2bInterleavedFusedGemmRun
<< "\nB0 =\n" << tensor_B0.host_view()
<< "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
<< "\nC0 =\n" << tensor_C0.host_view()
// << "\nD0 =\n" << tensor_D0.host_view()
<< "\nB1 =\n" << tensor_B1.host_view()
<< "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
<< "\nC1 =\n" << tensor_C1.host_view()

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -0,0 +1,274 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Template for device-level Implicit GEMM
*/
#pragma once
#include <limits>
#include "cutlass/cutlass.h"
#include "cutlass/device_kernel.h"
#include "cutlass/conv/convolution.h"
#include "kernel/b2b_implicit_gemm_convolution.h"
#include "kernel/default_b2b_conv2d_fprop.h"
namespace cutlass {
namespace conv {
namespace device {
template<typename B2bImplicitGemmKernel_>
class B2bImplicitGemmConvolution {
public:
using B2bImplicitGemmKernel = B2bImplicitGemmKernel_;
using ElementA = typename B2bImplicitGemmKernel::ElementA;
using LayoutA = typename B2bImplicitGemmKernel::LayoutA;
using ElementB = typename B2bImplicitGemmKernel::ElementB;
using LayoutB = typename B2bImplicitGemmKernel::LayoutB;
using ElementC = typename B2bImplicitGemmKernel::ElementC;
using LayoutC = typename B2bImplicitGemmKernel::LayoutC;
using ElementAccumulator = typename B2bImplicitGemmKernel::ElementAccumulator;
using ElementCompute = typename B2bImplicitGemmKernel::ElementCompute;
using OperatorClass = typename B2bImplicitGemmKernel::OperatorClass;
using ArchTag = typename B2bImplicitGemmKernel::ArchTag;
using ThreadblockShape0 = typename B2bImplicitGemmKernel::ThreadblockShape0;
using ThreadblockShape1 = typename B2bImplicitGemmKernel::ThreadblockShape1;
using WarpShape0 = typename B2bImplicitGemmKernel::WarpShape0;
using WarpShape1 = typename B2bImplicitGemmKernel::WarpShape1;
using InstructionShape = typename B2bImplicitGemmKernel::InstructionShape;
using ThreadblockSwizzle = typename B2bImplicitGemmKernel::ThreadblockSwizzle;
using EpilogueOutputOp0 = typename B2bImplicitGemmKernel::EpilogueOutputOp0;
using EpilogueOutputOp1 = typename B2bImplicitGemmKernel::EpilogueOutputOp1;
static int const kStages = B2bImplicitGemmKernel::kStages;
static int const kConvDim = B2bImplicitGemmKernel::kConvDim;
using WarpMmaOperator0 = typename B2bImplicitGemmKernel::WarpMmaOperator0;
using WarpMmaOperator1 = typename B2bImplicitGemmKernel::WarpMmaOperator1;
using ArchMmaOperator = typename B2bImplicitGemmKernel::ArchMmaOperator;
using MathOperator = typename B2bImplicitGemmKernel::MathOperator;
static cutlass::conv::Operator const kConvolutionalOperator = B2bImplicitGemmKernel::kConvolutionalOperator;
static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = B2bImplicitGemmKernel::kIteratorAlgorithm;
static int const kWarpCount =
(ThreadblockShape0::kM / WarpShape0::kM) *
(ThreadblockShape0::kN / WarpShape0::kN);
/// Argument structure
using Arguments = typename B2bImplicitGemmKernel::Arguments;
private:
/// Kernel parameters object
typename B2bImplicitGemmKernel::Params params_;
public:
/// Constructs Implicit GEMM
B2bImplicitGemmConvolution() { }
/// Determines whether the Implicit GEMM can execute the given problem.
static Status can_implement(Arguments const &args) {
// dispatch to iterators
Status status = B2bImplicitGemmKernel::B2bMma::IteratorA0::can_implement(args.problem_size_0);
if (Status::kSuccess != status) {
return status;
}
status = B2bImplicitGemmKernel::B2bMma::IteratorB0::can_implement(args.problem_size_0);
if (Status::kSuccess != status) {
return status;
}
status = B2bImplicitGemmKernel::B2bMma::IteratorB1::can_implement(args.problem_size_1);
if (Status::kSuccess != status) {
return status;
}
// Determine grid shape
ThreadblockSwizzle threadblock_swizzle;
dim3 grid = threadblock_swizzle.get_grid_shape(
threadblock_swizzle.get_tiled_shape(
cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0),
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices));
if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
grid.z <= std::numeric_limits<uint16_t>::max())) {
return Status::kErrorInvalidProblem;
}
return Status::kSuccess;
}
/// Gets the workspace size
static size_t get_workspace_size(Arguments const &args) {
size_t workspace_bytes = 0;
// Determine grid shape
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0),
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices);
if(args.split_k_mode == SplitKMode::kParallel) {
// Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace.
// The user needs to call a reduction operator to obtain the final output tensor
workspace_bytes =
sizeof(ElementAccumulator) *
size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size_0)) *
size_t(grid_tiled_shape.k());
}
else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size_0.split_k_slices > 1) {
// Split-K serial: The user workspace is used to store the semaphore and to serialize writing the
// final reduced output to the user's output tensor
workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n());
}
return workspace_bytes;
}
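// Example, with illustrative numbers only: for kParallel split-K, a 4-byte
// accumulator, an output tensor of n*p*q*k = 32*56*56*64 = 6,422,528 elements,
// and grid_tiled_shape.k() == 2, the expression above yields
// 4 * 6,422,528 * 2 bytes, roughly 51 MB of workspace. For kSerial split-K with
// split_k_slices > 1, only one int per (m, n) output tile is needed to hold the
// semaphores that serialize the final write.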
/// Initializes GEMM state from arguments.
Status initialize(
Arguments const &args,
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
if (args.problem_size_0.split_k_slices > 1) {
if (!workspace) {
return Status::kErrorWorkspaceNull;
}
cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream);
if (status != cudaSuccess) {
return Status::kErrorInternal;
}
}
// initialize the params structure from the arguments
params_ = typename B2bImplicitGemmKernel::Params(
args,
static_cast<int *>(workspace)
);
int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage));
if (smem_size >= (48 << 10)) {
cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel<B2bImplicitGemmKernel>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size);
if (result != cudaSuccess) {
return Status::kErrorInternal;
}
result = cudaFuncSetAttribute(
cutlass::Kernel<B2bImplicitGemmKernel>,
cudaFuncAttributePreferredSharedMemoryCarveout, 100);
if (result != cudaSuccess) {
return Status::kErrorInternal;
}
}
return Status::kSuccess;
}
/// Initializes GEMM state from arguments.
Status update(Arguments const &args, void *workspace = nullptr) {
// update the params structure from the arguments
params_.ptr_A0 = args.ref_A0.data();
params_.ptr_B0 = args.ref_B0.data();
params_.ptr_C0 = args.ref_C0.data();
params_.ptr_B1 = args.ref_B1.data();
params_.ptr_C1 = args.ref_C1.data();
params_.ptr_D1 = args.ref_D1.data();
params_.output_op_0 = args.output_op_0;
params_.output_op_1 = args.output_op_1;
params_.semaphore = static_cast<int *>(workspace);
return Status::kSuccess;
}
/// Runs the kernel using initialized state.
Status run(cudaStream_t stream = nullptr) {
ThreadblockSwizzle threadblock_swizzle;
dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
dim3 block(32 * kWarpCount, 1, 1);
int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage));
cutlass::Kernel<B2bImplicitGemmKernel><<<grid, block, smem_size, stream>>>(params_);
cudaError_t result = cudaGetLastError();
return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
}
/// Runs the kernel using initialized state.
Status operator()(cudaStream_t stream = nullptr) {
return run(stream);
}
/// Runs the kernel using initialized state.
Status operator()(
Arguments const &args,
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
Status status = initialize(args, workspace);
if (status == Status::kSuccess) {
status = run(stream);
}
return status;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
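A rough usage sketch for the device-level wrapper above (the function below is illustrative and not part of this header; the kernel type would normally be produced by the DefaultB2bConv2dFprop helper included from kernel/default_b2b_conv2d_fprop.h):
// Hedged sketch: query, initialize, and launch a B2bImplicitGemmConvolution.
template <typename B2bImplicitGemmKernel>
cutlass::Status run_b2b_convolution(
  typename cutlass::conv::device::B2bImplicitGemmConvolution<B2bImplicitGemmKernel>::Arguments const &args) {
  using B2bConv = cutlass::conv::device::B2bImplicitGemmConvolution<B2bImplicitGemmKernel>;
  cutlass::Status status = B2bConv::can_implement(args);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }
  // Split-K modes may need a device workspace (partial outputs or semaphores).
  void *workspace = nullptr;
  size_t workspace_bytes = B2bConv::get_workspace_size(args);
  if (workspace_bytes) {
    cudaMalloc(&workspace, workspace_bytes);
  }
  B2bConv conv_op;
  status = conv_op.initialize(args, workspace);
  if (status == cutlass::Status::kSuccess) {
    status = conv_op();      // launches the fused kernel on the default stream
  }
  if (workspace) {
    cudaFree(workspace);
  }
  return status;
}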

View File

@ -0,0 +1,102 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h"
#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h"
#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h"
#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h"
int run() {
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
<< std::endl;
// Returning zero so this test passes on older architectures. Its actions are a no-op.
return 0;
}
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
std::cout << "Running on SM80" << std::endl;
run_nonfused_conv2d_fprop_optimized_f16_sm80();
run_fused_conv2d_fprop_optimized_f16_sm80();
run_nonfused_conv2d_fprop_optimized_s8_sm80();
run_fused_conv2d_fprop_optimized_s8_sm80();
#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
std::cout << "Running on SM75" << std::endl;
run_nonfused_conv2d_fprop_optimized_f16_sm75();
run_fused_conv2d_fprop_optimized_f16_sm75();
run_nonfused_conv2d_fprop_optimized_s8_sm75();
run_fused_conv2d_fprop_optimized_s8_sm75();
#endif
return 0;
}
int main() {
bool notSupported = false;
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
<< std::endl;
notSupported = true;
}
if (notSupported) {
// Returning zero so this test passes on older Toolkits. Its actions are a no-op.
return 0;
}
return run();
}

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -22,43 +22,22 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*
This example shows fusing two GEMM mainloops into one kernel. The first GEMM computes relu(alpha*A*B) and
the second GEMM computes relu(alpha*A*B+beta*C). The performance measuring environment compares against
two unfused GEMM operations, demonstrating a speedup of the fused kernel on the
NVIDIA Turing GPU architecture.
Problem size:
GEMM1 (M,N,K): 128*1600, 64, 576
GEMM2 (M,N,K): 128*1600, 128, 64
Note that GEMM1_N = GEMM2_K
The example requires the number of threadblocks be the same across 2 GEMMs and
thread_block_tile_N = problem_N so the data required by each layer is threadblock-resident. It
also requires warp_tile_N = thread_block_tile_N so the data required by each warp is
register-file-resident.
Performance:
- fp16 on Tesla T4 @ 1590MHz (non-fused vs. fused): 1.39011 ms vs. 1.26035 ms
- int8 on Tesla T4 @ 1590MHz (non-fused vs. fused): 0.751759 ms vs. 0.62971 ms
- fp16 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.721144 ms vs. 0.629864 ms
- int8 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.379049 ms vs. 0.324764 ms
- int8 on GA100 @ 1200MHz (non-fused vs. fused): 0.153795 ms vs. 0.129874 ms
*/
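The comment block above spells out when the back-to-back fusion is legal: the second GEMM's K must equal the first GEMM's N, each threadblock tile's N must cover the whole problem N so intermediate activations stay threadblock-resident, and the warp tile N of the second GEMM must equal its threadblock tile N so they stay register-file-resident. A hedged sketch of checking the tile-residency constraints (the shape types are placeholders in the style of cutlass::gemm::GemmShape, not names defined in this example):
// Returns true if the B2B tile-residency constraints described above hold.
template <typename ThreadblockShape0, typename ThreadblockShape1, typename WarpShape1>
bool b2b_fusion_constraints_ok(
  cutlass::gemm::GemmCoord const &problem_size_0,
  cutlass::gemm::GemmCoord const &problem_size_1) {
  // GEMM0's output feeds GEMM1: N of GEMM0 must equal K of GEMM1.
  bool chained = (problem_size_0.n() == problem_size_1.k());
  // Threadblock tile N covers the whole problem N for each layer.
  bool threadblock_resident = (ThreadblockShape0::kN == problem_size_0.n()) &&
                              (ThreadblockShape1::kN == problem_size_1.n());
  // Warp tile N equals threadblock tile N for the second GEMM.
  bool rf_resident = (WarpShape1::kN == ThreadblockShape1::kN);
  return chained && threadblock_resident && rf_resident;
}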
#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h"
#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h"
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h"
int run() {
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
std::cout << "Running on SM80" << std::endl;
run_nonfused_gemm_f16_sm80();
run_fused_gemm_f16_sm80();
run_nonfused_gemm_s8_sm80();
run_fused_gemm_s8_sm80();
#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
std::cout << "Running on SM75" << std::endl;
run_nonfused_gemm_f16();
run_fused_gemm_f16();
run_nonfused_gemm_s8();
@ -74,9 +53,9 @@ int main() {
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples.
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
@ -90,7 +69,7 @@ int main() {
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
<< std::endl;
notSupported = true;

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -0,0 +1,475 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a pipelined Implicit GEMM kernel.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/array.h"
#include "cutlass/numeric_types.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/semaphore.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/conv/conv3d_problem_size.h"
#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate
typename Epilogue_, ///! Epilogue
typename ThreadblockSwizzle_, ///! Threadblock swizzling function
conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad)
typename ConvProblemSize_ = Conv2dProblemSize ///! Convolutional operator on 2D or 3D problem
>
struct B2bImplicitGemmConvolution {
using B2bMma = B2bMma_;
using Epilogue = Epilogue_;
using EpilogueOutputOp0 = typename B2bMma::OutputOp;
using EpilogueOutputOp1 = typename Epilogue::OutputOp;
using ThreadblockSwizzle = ThreadblockSwizzle_;
static Operator const kConvolutionalOperator = ConvOperator;
using ElementA = typename B2bMma::IteratorA0::Element;
using LayoutA = typename B2bMma::IteratorA0::Layout;
using ElementB = typename B2bMma::IteratorB0::Element;
using LayoutB = typename B2bMma::IteratorB0::Layout;
using ElementC = typename EpilogueOutputOp1::ElementOutput;
/// Set output tensor C layout
using LayoutC = LayoutA;
using ElementAccumulator = typename EpilogueOutputOp0::ElementAccumulator;
using ElementCompute = typename EpilogueOutputOp0::ElementCompute;
using WarpMmaOperator0 = typename B2bMma::Policy0::Operator;
using WarpMmaOperator1 = typename B2bMma::Policy1::Operator;
using ArchMmaOperator = typename WarpMmaOperator0::ArchMmaOperator;
using MathOperator = typename ArchMmaOperator::Operator;
using OperatorClass = typename WarpMmaOperator0::OperatorClass;
using ArchTag = typename WarpMmaOperator0::ArchTag;
using ThreadblockShape0 = typename B2bMma::Shape0;
using ThreadblockShape1 = typename B2bMma::Shape1;
using WarpShape0 = typename WarpMmaOperator0::Shape;
using WarpShape1 = typename WarpMmaOperator1::Shape;
using InstructionShape = typename ArchMmaOperator::Shape;
static int const kStages = B2bMma::kStages;
static IteratorAlgorithm const kIteratorAlgorithm = B2bMma::IteratorA0::kIteratorAlgorithm;
/// Warp count (concept: GemmShape)
using WarpCount0 = typename B2bMma::WarpCount0;
static int const kThreadCount = 32 * WarpCount0::kCount;
using TensorRefA0 = typename B2bMma::IteratorA0::TensorRef;
using TensorRefB0 = typename B2bMma::IteratorB0::TensorRef;
using TensorRefB1 = typename B2bMma::IteratorB1::TensorRef;
using TensorRefC = cutlass::TensorRef<ElementC, LayoutC>;
/// Check that the iterator A and B convolution dimensions are the same and
// set device::B2bImplicitGemmConvolution::kConvDim
static_assert(B2bMma::IteratorA0::kConvDim == B2bMma::IteratorB0::kConvDim,
"Convolution on different dimensions is not supported");
static int const kConvDim = B2bMma::IteratorA0::kConvDim;
/// Conv dimension and problem size structure (Conv2d or Conv3d)
using ConvProblemSize = ConvProblemSize_;
/// Wgrad C stride idx for implicit gemm algorithm
// Conv2d row-major matrix C (KxRSC)
// Conv3d row-major matrix C (KxTRSC)
static int const kWgradCStrideIdx =
cutlass::platform::is_same<LayoutC, cutlass::layout::TensorNHWC>::value ? 2 : 3;
/// This chooses the appropriate stride element of the C tensor.
static int const kTensorCStrideIdx =
(kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
//
//
//
using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
LayoutC,
typename Epilogue::OutputTileIterator::Layout,
TensorRefC,
ConvOperator,
ConvProblemSize
>;
/// Argument structure
struct Arguments {
//
// Data members
//
ConvProblemSize problem_size_0;
ConvProblemSize problem_size_1;
TensorRefA0 ref_A0;
TensorRefB0 ref_B0;
TensorRefC ref_C0;
TensorRefB1 ref_B1;
TensorRefC ref_C1;
TensorRefC ref_D1;
typename EpilogueOutputOp0::Params output_op_0;
typename EpilogueOutputOp1::Params output_op_1;
SplitKMode split_k_mode;
//
// Methods
//
/// Default ctor
CUTLASS_HOST_DEVICE
Arguments() { }
CUTLASS_HOST_DEVICE
Arguments(
ConvProblemSize const & problem_size_0,
ConvProblemSize const & problem_size_1
):
problem_size_0(problem_size_0),
problem_size_1(problem_size_1) { }
CUTLASS_HOST_DEVICE
Arguments(
ConvProblemSize const & problem_size_0,
ConvProblemSize const & problem_size_1,
TensorRefA0 const & ref_A0,
TensorRefB0 const & ref_B0,
TensorRefC const & ref_C0,
TensorRefB1 const & ref_B1,
TensorRefC const & ref_C1,
TensorRefC const & ref_D1,
typename EpilogueOutputOp0::Params const & output_op_0,
typename EpilogueOutputOp1::Params const & output_op_1,
SplitKMode const & split_k_mode = SplitKMode::kSerial
):
problem_size_0(problem_size_0),
problem_size_1(problem_size_1),
ref_A0(ref_A0),
ref_B0(ref_B0),
ref_C0(ref_C0),
ref_B1(ref_B1),
ref_C1(ref_C1),
ref_D1(ref_D1),
output_op_0(output_op_0),
output_op_1(output_op_1),
split_k_mode(split_k_mode)
{
}
};
/// Parameters structure
struct Params {
ConvProblemSize problem_size_0;
ConvProblemSize problem_size_1;
cutlass::gemm::GemmCoord grid_tiled_shape;
gemm::GemmCoord implicit_gemm_problem_size_0;
gemm::GemmCoord implicit_gemm_problem_size_1;
int gemm_k_iterations_0;
int gemm_k_iterations_1;
typename B2bMma::IteratorA0::Params iterator_A0;
typename B2bMma::IteratorA0::Element const *ptr_A0;
typename B2bMma::IteratorB0::Params iterator_B0;
typename B2bMma::IteratorB0::Element const *ptr_B0;
typename Epilogue::OutputTileIterator::Params iterator_C0;
typename Epilogue::OutputTileIterator::Element *ptr_C0;
typename B2bMma::IteratorB1::Params iterator_B1;
typename B2bMma::IteratorB1::Element const *ptr_B1;
typename Epilogue::OutputTileIterator::Params iterator_C1;
typename Epilogue::OutputTileIterator::Element *ptr_C1;
typename Epilogue::OutputTileIterator::Params iterator_D1;
typename Epilogue::OutputTileIterator::Element *ptr_D1;
typename EpilogueOutputOp0::Params output_op_0;
typename EpilogueOutputOp1::Params output_op_1;
int *semaphore;
SplitKMode split_k_mode;
//
// Methods
//
CUTLASS_HOST_DEVICE
Params(): gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
///
CUTLASS_HOST_DEVICE
Params(
Arguments const &args,
int *semaphore = nullptr
):
problem_size_0(args.problem_size_0),
problem_size_1(args.problem_size_1),
implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)),
implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)),
grid_tiled_shape(grid_tiled_shape),
iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())),
ptr_A0(args.ref_A0.data()),
iterator_B0(args.problem_size_0, args.ref_B0.layout()),
ptr_B0(args.ref_B0.data()),
iterator_C0(ConvOutputIteratorParameter::layout(args.ref_C0)),
ptr_C0(args.ref_C0.data()),
iterator_B1(args.problem_size_1, args.ref_B1.layout()),
ptr_B1(args.ref_B1.data()),
iterator_C1(ConvOutputIteratorParameter::layout(args.ref_C1)),
ptr_C1(args.ref_C1.data()),
iterator_D1(ConvOutputIteratorParameter::layout(args.ref_D1)),
ptr_D1(args.ref_D1.data()),
output_op_0(args.output_op_0),
output_op_1(args.output_op_1),
semaphore(semaphore),
split_k_mode(args.split_k_mode)
{
gemm_k_iterations_0 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape0::kK, args.problem_size_0);
gemm_k_iterations_1 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape1::kK, args.problem_size_1);
ThreadblockSwizzle threadblock_swizzle;
grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
implicit_gemm_problem_size_0,
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices);
}
};
/// Shared memory storage structure
union SharedStorage {
typename B2bMma::B2bMmaSharedStorage main_loop;
typename Epilogue::SharedStorage epilogue;
};
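// Note: the B2bMma mainloop and the epilogue are never live at the same time,
// so the union above lets the two stages reuse one shared-memory allocation
// instead of reserving space for both.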
//
// Methods
//
CUTLASS_HOST_DEVICE
B2bImplicitGemmConvolution() { }
/// Executes one ImplicitGEMM
CUTLASS_DEVICE
void operator()(Params const &params, SharedStorage &shared_storage) {
// Compute threadblock location
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
return;
}
// Compute position within threadblock
int thread_idx = threadIdx.x;
// Construct iterators to A and B operands
typename B2bMma::IteratorA0 iterator_A0(
params.iterator_A0,
params.problem_size_0,
params.ptr_A0,
thread_idx,
MatrixCoord(
threadblock_tile_idx.m() * B2bMma::Shape0::kM,
threadblock_tile_idx.k() * B2bMma::Shape0::kK
)
);
typename B2bMma::IteratorB0 iterator_B0(
params.iterator_B0,
params.problem_size_0,
params.ptr_B0,
thread_idx,
MatrixCoord(
threadblock_tile_idx.k() * B2bMma::Shape0::kK,
threadblock_tile_idx.n() * B2bMma::Shape0::kN
)
);
typename B2bMma::IteratorB1 iterator_B1(
params.iterator_B1,
params.problem_size_1,
params.ptr_B1,
thread_idx,
MatrixCoord(
threadblock_tile_idx.k() * B2bMma::Shape1::kK,
threadblock_tile_idx.n() * B2bMma::Shape1::kN
)
);
// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
int lane_idx = threadIdx.x % 32;
//
// Main loop
//
EpilogueOutputOp0 output_op_0(params.output_op_0);
// Construct thread-scoped matrix multiply
B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
typename B2bMma::FragmentC0 src_accum;
typename B2bMma::FragmentC1 accumulators;
src_accum.clear();
accumulators.clear();
// Compute threadblock-scoped matrix multiply-add
b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1, src_accum, output_op_0);
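// Note: the B2bMma mainloop fuses both GEMMs. output_op_0 is applied to the first
// GEMM's accumulators, which are consumed directly from registers as the A operand
// of the second GEMM, so only 'accumulators' (the second GEMM's result) reaches the
// epilogue below.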
//
// Epilogue
//
EpilogueOutputOp1 output_op_1(params.output_op_1);
// Construct the semaphore.
int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
Semaphore semaphore(params.semaphore + block_idx, thread_idx);
// Compute logical position within grid
threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
// If performing a reduction via split-K, fetch the initial synchronization
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
// Fetch the synchronization lock initially but do not block.
semaphore.fetch();
// Indicate which position in a serial reduction the output operator is currently updating
output_op_1.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
}
MatrixCoord threadblock_offset(
threadblock_tile_idx.m() * B2bMma::Shape1::kM,
threadblock_tile_idx.n() * B2bMma::Shape1::kN
);
// Tile iterator writing to destination tensor
typename Epilogue::OutputTileIterator iterator_D1(
params.iterator_D1,
params.ptr_D1,
ConvOutputIteratorParameter::extent(params.problem_size_1),
thread_idx,
threadblock_offset
);
// Tile iterator reading from source accumulator tensor
typename Epilogue::OutputTileIterator iterator_C1(
params.iterator_C1,
params.ptr_C1,
ConvOutputIteratorParameter::extent(params.problem_size_1),
thread_idx,
threadblock_offset
);
// Construct the epilogue
Epilogue epilogue(
shared_storage.epilogue,
thread_idx,
warp_idx,
lane_idx);
// Wait on the semaphore - this latency may have been covered by iterator construction
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
// For subsequent threadblocks, the source matrix is held in the 'D' tensor.
if (threadblock_tile_idx.k()) {
iterator_C1 = iterator_D1;
}
semaphore.wait(threadblock_tile_idx.k());
__threadfence();
}
// Each split-k-slice writes to a unique tensor location
else if (params.split_k_mode == SplitKMode::kParallel) {
iterator_D1.add_pointer_offset(threadblock_tile_idx.k() *
cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size_1));
}
// Run efficient epilogue
epilogue(output_op_1, iterator_D1, accumulators, iterator_C1);
//
// Release the semaphore
//
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
int lock = 0;
if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
// The final threadblock resets the semaphore for subsequent grids.
lock = 0;
}
else {
// Otherwise, the semaphore is incremented
lock = threadblock_tile_idx.k() + 1;
}
semaphore.release(lock);
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
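
The serial split-K path above (fetch, wait, accumulate into D, release) can be hard to see through the iterator plumbing. Below is a minimal, hypothetical CUDA sketch of the same handshake; it is not the cutlass::Semaphore implementation, and the kernel name, the one-float "tile", and the partials array are illustrative assumptions only.
```
// Minimal sketch of the serial split-K handshake (NOT cutlass::Semaphore).
// Each k-partition of a tile waits for the per-tile counter to reach its index,
// folds its partial result in, then publishes the next index; the last partition
// resets the counter so the workspace is ready for a subsequent launch.
#include <cuda_runtime.h>

__global__ void serial_splitk_sketch(float *workspace, int *counters,
                                     const float *partials, int k_partitions) {
  int tile = blockIdx.x;   // one output "tile" (a single float here) per block.x
  int k    = blockIdx.y;   // split-K partition index handled by this block

  if (threadIdx.x == 0) {
    // semaphore.wait(k): spin until the previous partition releases this tile.
    while (*(volatile int *)&counters[tile] != k) { }

    // k == 0 overwrites, later partitions accumulate (cf. set_k_partition above).
    float partial = partials[tile * k_partitions + k];
    workspace[tile] = (k == 0) ? partial : workspace[tile] + partial;

    __threadfence();  // make the update visible before releasing the lock

    // semaphore.release(): the last partition resets the counter to zero.
    atomicExch(&counters[tile], (k + 1 == k_partitions) ? 0 : k + 1);
  }
}
```
Launching this as serial_splitk_sketch<<<dim3(num_tiles, k_partitions), 32>>>(...) with zero-initialized counters reproduces the ordering, with the caveats that the real kernel accumulates whole output tiles through the epilogue rather than a single float and relies on all partitions of a tile being able to make forward progress.
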

File diff suppressed because it is too large

View File

@ -1,29 +1,28 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
*modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
*notice, this list of conditions and the following disclaimer in the
*documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its
*contributors may be used to endorse or promote products derived from this
*software without specific prior written permission.
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
*DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
*OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING
*NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
*EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
@ -118,6 +117,75 @@ template <
>
struct DefaultB2bGemm;
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Ampere Architecture
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of A matrix in units of elements
int kAlignmentB,
/// Element type for C and D matrix operands
typename ElementC,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Warp-level tile size (concept: GemmShape)
typename InstructionShape,
/// Epilogue output operator
typename EpilogueOutputOp0,
/// Epilogue output operator
typename EpilogueOutputOp1,
/// Threadblock-level swizzling operator
typename ThreadblockSwizzle,
/// Number of stages used in the pipelined mainloop
int Stages,
/// If true, kernel is configured to support serial reduction in the
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator>
struct DefaultB2bGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
arch::Sm80, ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1, InstructionShape,
EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, SplitKSerial,
Operator> {
/// Define the threadblock-scoped matrix multiply-accumulate
using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp0>::ThreadblockB2bMma;
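// Number of warp-level partitions along K in the second GEMM's threadblock tile;
// the epilogue reduces partial accumulators across these partitions.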
static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
/// Define the epilogue
using Epilogue =
typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
EpilogueOutputOp1::kCount>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
};
////////////////////////////////////////////////////////////////////////////////
/// Partial specialization for Turing Architecture

View File

@ -0,0 +1,757 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
*/
#pragma once
#include "cutlass/aligned_buffer.h"
#include "cutlass/arch/memory.h"
#include "cutlass/array.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/cache_operation.h"
#include "cutlass/gemm/threadblock/mma_base.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Threadblock-scoped structure computing two back-to-back implicit GEMM convolutions
/// with a multistage (cp.async) software pipeline.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Cache operation for operand A
cutlass::arch::CacheOperation::Kind CacheOpA0,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB0,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile
// (concept::MmaTensorOpFragmentIterator)
typename FragmentIteratorA1_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator |
// MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Cache operation for operand B
cutlass::arch::CacheOperation::Kind CacheOpB1,
/// Output operator for 1st Gemm (concept: epilogue::thread::LinearCombinationClamp, etc...)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Number of stages
int Stages,
/// Used for partial specialization
typename Enable = bool>
class B2bImplicitGemmMultistage :
public gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
public:
///< Base class
using Base = gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape0 = Shape0_;
///< Iterates over tiles of A operand in global memory
using IteratorA0 = IteratorA0_;
///< Iterates over tiles of B operand in global memory
using IteratorB0 = IteratorB0_;
///< Policy describing tuning details
using Policy0 = Policy0_;
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape1 = Shape1_;
///< Iterates over the intermediate accumulator tile to form the A operand of the second GEMM
using FragmentIteratorA1 = FragmentIteratorA1_;
///< Iterates over tiles of B operand in global memory
using IteratorB1 = IteratorB1_;
///< Policy describing tuning details
using Policy1 = Policy1_;
using SmemIteratorB1 = SmemIteratorB1_;
///< Epilogue after 1st Gemm
using OutputOp = OutputOp_;
static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
//
// Dependent types
//
using ElementC = typename Policy0::Operator::ElementC;
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Internal structure exposed for introspection.
struct Detail {
static_assert(Base::kWarpGemmIterations0 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
static_assert(Base::kWarpGemmIterations1 > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
/// Number of cp.async instructions to load one stage of operand A
static int const AsyncCopyIterationsPerStageA0 =
IteratorA0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const AsyncCopyIterationsPerStageB0 =
IteratorB0::ThreadMap::Iterations::kCount;
/// Number of cp.async instructions to load one stage of operand B
static int const AsyncCopyIterationsPerStageB1 =
IteratorB1::ThreadMap::Iterations::kCount;
/// Number of stages
static int const kStages = Stages;
/// Number of cp.async instructions to load one group of operand A
static int const kAccessesPerGroupA0 =
(AsyncCopyIterationsPerStageA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B
static int const kAccessesPerGroupB0 =
(AsyncCopyIterationsPerStageB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
/// Number of cp.async instructions to load one group of operand B
static int const kAccessesPerGroupB1 =
(AsyncCopyIterationsPerStageB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
};
private:
using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
/// Warp Fragment of operand A1 loaded from accumulator tile
using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment;
using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
private:
//
// Data members
//
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bImplicitGemmMultistage(
///< Shared storage needed for internal use by threadblock-scoped GEMM
typename Base::B2bMmaSharedStorage &shared_storage,
///< ID within the threadblock
int thread_idx,
///< ID of warp
int warp_idx,
///< ID of each thread within a warp
int lane_idx
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx),
smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx)
{
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset(
{warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k});
this->warp_tile_iterator_B0_.add_tile_offset(
{Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n});
this->warp_tile_iterator_B1_.add_tile_offset(
{Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n});
}
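/// Issues one group of cp.async copies for operands A0 and B0 of the first GEMM and
/// advances the corresponding global- and shared-memory write iterators.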
CUTLASS_DEVICE
void copy_tiles_and_advance_0(
IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
int group_start_A0 = 0, int group_start_B0 = 0) {
iterator_A0.set_iteration_index(group_start_A0);
this->smem_iterator_A0_.set_iteration_index(group_start_A0);
// Async Copy for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
if (group_start_A0 + j < Detail::AsyncCopyIterationsPerStageA0) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
++this->smem_iterator_A0_;
}
}
iterator_B0.set_iteration_index(group_start_B0);
this->smem_iterator_B0_.set_iteration_index(group_start_B0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
if (group_start_B0 + j < Detail::AsyncCopyIterationsPerStageB0) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
++this->smem_iterator_B0_;
}
}
}
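/// Issues one group of cp.async copies for operand B1 of the second GEMM and
/// advances its global- and shared-memory write iterators.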
CUTLASS_DEVICE
void copy_tiles_and_advance_1(
IteratorB1 &iterator_B1,
int group_start_B1 = 0) {
iterator_B1.set_iteration_index(group_start_B1);
this->smem_iterator_B1_.set_iteration_index(group_start_B1);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
if (group_start_B1 + j < Detail::AsyncCopyIterationsPerStageB1) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
++this->smem_iterator_B1_;
}
}
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
///< problem size of GEMM
int gemm_k_iterations_0,
///< destination accumulator tile
FragmentC1 &accum,
///< iterator over A operand in global memory
IteratorA0 iterator_A0,
///< iterator over B operand in global memory
IteratorB0 iterator_B0,
///< iterator over B operand in global memory
IteratorB1 iterator_B1,
///< initial value of accumulator
FragmentC0 const &src_accum,
///< epilogue operation after 1st Gemm
OutputOp output_op_0,
///< Imaginary strides used for planar-complex only - ignored here
int64_t imag_stride_A = 0,
int64_t imag_stride_B = 0) {
//
// Prologue
//
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_0) {
iterator_A0.set_iteration_index(0);
this->smem_iterator_A0_.set_iteration_index(0);
// Async Copy for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA0; ++j) {
typename IteratorA0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA0::AccessType *>(
this->smem_iterator_A0_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorA0::Element>::value *
IteratorA0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
++this->smem_iterator_A0_;
}
iterator_B0.set_iteration_index(0);
this->smem_iterator_B0_.set_iteration_index(0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB0; ++j) {
typename IteratorB0::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB0::AccessType *>(
this->smem_iterator_B0_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorB0::Element>::value *
IteratorB0::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
dst_ptr, iterator_B0.get(), iterator_B0.valid());
++iterator_B0;
++this->smem_iterator_B0_;
}
// Move to the next stage
iterator_A0.advance();
iterator_B0.advance();
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
}
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
// Waits until kStages-2 stages have committed.
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
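// At this point the oldest stage is resident in shared memory and can be consumed
// while the remaining prologue copies are still in flight.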
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
Operator0 warp_mma0;
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
// Start issuing the first group of the next stage outside of the mainloop
copy_tiles_and_advance_0(iterator_A0, iterator_B0);
int smem_write_stage_idx = Base::kStages - 1;
int smem_read_stage_idx = 0;
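// smem_write_stage_idx / smem_read_stage_idx track which slot of the kStages-deep
// shared-memory circular buffer cp.async is filling and which slot the warp-level
// MMA is draining; both wrap around below.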
warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
//
// Mainloop
//
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to k offset if
// this is the last group as the case may be.
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k > 0)
warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
warp_loaded_frag_A0[warp_mma_k % 2],
warp_loaded_frag_B0[warp_mma_k % 2]);
// Issue global->shared copies for the next stage
int group_start_iteration_A0, group_start_iteration_B0;
if (warp_mma_k + 1 == Base::kWarpGemmIterations0) {
group_start_iteration_A0 = 0;
group_start_iteration_B0 = 0;
} else {
group_start_iteration_A0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
group_start_iteration_B0 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
}
copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
group_start_iteration_B0);
warp_mma0(
accum0,
warp_transformed_frag_A0[warp_mma_k % 2],
warp_transformed_frag_B0[warp_mma_k % 2],
accum0
);
if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages of cp.async have committed
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_A0.advance();
iterator_B0.advance();
this->smem_iterator_A0_.add_tile_offset({0, 1});
this->smem_iterator_B0_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK *
Base::kWarpGemmIterations0,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
--gemm_k_iterations_0;
}
}
}
// Insert fence and wait for all outstanding cp.async operations to commit.
cutlass::arch::cp_async_fence();
cutlass::arch::cp_async_wait<0>();
__syncthreads();
// 2nd Implicit Gemm
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
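// The second GEMM's A operand never touches global or shared memory: it is read
// directly from the first GEMM's accumulator registers, with output_op_0 applied
// as each fragment is loaded.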
//
// Prologue
//
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
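// The K extent of the second GEMM equals the N extent of the first GEMM's
// threadblock tile, so the iteration count comes from the accumulator fragment
// iterator rather than from the problem size.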
// Issue several complete stages
CUTLASS_PRAGMA_UNROLL
for (int stage = 0; stage < Base::kStages - 1;
++stage, --gemm_k_iterations_1) {
iterator_B1.set_iteration_index(0);
this->smem_iterator_B1_.set_iteration_index(0);
// Async Copy for operand B
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB1; ++j) {
typename IteratorB1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorB1::AccessType *>(
this->smem_iterator_B1_.get());
int const kSrcBytes =
sizeof_bits<typename IteratorB1::Element>::value *
IteratorB1::ThreadMap::kElementsPerAccess / 8;
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
dst_ptr, iterator_B1.get(), iterator_B1.valid());
++iterator_B1;
++this->smem_iterator_B1_;
}
// Move to the next stage
iterator_B1.advance();
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
}
// Waits until kStages-2 stages have committed.
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math
// instructions
WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
Operator1 warp_mma1;
this->warp_tile_iterator_B1_.set_kgroup_index(0);
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
// Start issuing the first group of the next stage outside of the mainloop
copy_tiles_and_advance_1(iterator_B1);
smem_write_stage_idx = Base::kStages - 1;
smem_read_stage_idx = 0;
warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
//
// Mainloop
//
CUTLASS_GEMM_LOOP
for (gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1 - (Base::kStages - 1);
gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
//
// Loop over GEMM K dimension
//
// Computes a warp-level GEMM on data held in shared memory
// Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to k offset if
// this is the last group as the case may be.
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
if (warp_mma_k > 0)
warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
warp_loaded_frag_A1[warp_mma_k % 2],
warp_loaded_frag_B1[warp_mma_k % 2]);
// Issue global->shared copies for the next stage
int group_start_iteration_B1;
if (warp_mma_k + 1 == Base::kWarpGemmIterations1) {
group_start_iteration_B1 = 0;
} else {
group_start_iteration_B1 =
(warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
}
copy_tiles_and_advance_1(iterator_B1,
group_start_iteration_B1);
warp_mma1(
accum,
warp_transformed_frag_A1[warp_mma_k % 2],
warp_transformed_frag_B1[warp_mma_k % 2],
accum
);
if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
// Inserts a fence to group cp.async instructions into stages.
cutlass::arch::cp_async_fence();
// Waits until kStages-2 stages of cp.async have committed
arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
// Move to the next stage
iterator_B1.advance();
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Add negative offsets to return iterators to the 'start' of the
// circular buffer in shared memory
if (smem_write_stage_idx == (Base::kStages - 1)) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx = 0;
} else {
++smem_write_stage_idx;
}
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
smem_read_stage_idx = 0;
} else {
++smem_read_stage_idx;
}
}
}
}
// Insert fence and wait for all outstanding cp.async operations to commit.
cutlass::arch::cp_async_fence();
cutlass::arch::cp_async_wait<0>();
__syncthreads();
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
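
The prologue/mainloop shape above — commit a stage with cp_async_fence(), consume only after cp_async_wait<kStages - 2>(), and rotate through a kStages-deep shared-memory circular buffer — is the standard cp.async software pipeline. The sketch below shows the same skeleton with the CUDA <cuda_pipeline.h> primitives, which play the same roles here; the tile size, kernel name, and the simple reduction standing in for the warp-level MMA are assumptions for illustration only.
```
// Minimal sketch of a cp.async multistage pipeline. __pipeline_commit() plays the
// role of cutlass::arch::cp_async_fence(), and __pipeline_wait_prior(kStages - 2)
// the role of cp_async_wait<kStages - 2>().
#include <cuda_pipeline.h>

constexpr int kStages    = 3;    // depth of the shared-memory circular buffer
constexpr int kTileElems = 128;  // elements per stage; launch with blockDim.x == kTileElems

__global__ void multistage_pipeline_sketch(const float *gmem, float *out, int num_tiles) {
  // gmem is assumed to hold num_tiles * kTileElems elements; launch one block.
  __shared__ float smem[kStages][kTileElems];
  int t = threadIdx.x;

  // Prologue: issue kStages - 1 stages of asynchronous global->shared copies.
  for (int s = 0; s < kStages - 1; ++s) {
    __pipeline_memcpy_async(&smem[s][t], &gmem[s * kTileElems + t], sizeof(float));
    __pipeline_commit();  // close this stage's group of async copies
  }

  float acc = 0.f;
  int write_stage = kStages - 1;
  int read_stage  = 0;

  for (int tile = 0; tile < num_tiles; ++tile) {
    // Wait until all but the (kStages - 2) most recent stages have landed,
    // then make the resident stage visible to every thread.
    __pipeline_wait_prior(kStages - 2);
    __syncthreads();

    // Overlap math on the resident stage with the copy of a future tile.
    int next = tile + kStages - 1;
    if (next < num_tiles) {
      __pipeline_memcpy_async(&smem[write_stage][t],
                              &gmem[next * kTileElems + t], sizeof(float));
    }
    __pipeline_commit();

    acc += smem[read_stage][t];  // stand-in for the warp-level MMA on this stage

    // Advance both indices around the circular buffer.
    write_stage = (write_stage + 1) % kStages;
    read_stage  = (read_stage + 1) % kStages;
    __syncthreads();  // no thread may still be reading the stage overwritten next
  }

  out[t] = acc;
}
```
The kernel above follows this skeleton, except that copy_tiles_and_advance_0/1 spread each stage's copies across the warp-level MMA iterations (kAccessesPerGroup* per group) instead of issuing them all at once.
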

View File

@ -0,0 +1,483 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Template for a double-buffered threadblock-scoped GEMM kernel.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/aligned_buffer.h"
#include "cutlass/numeric_conversion.h"
#include "cutlass/numeric_types.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
#include "threadblock/b2b_mma_base.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Threadblock-scoped structure computing two back-to-back implicit GEMM convolutions with a double-buffered (two-stage) pipeline.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape0_,
/// Iterates over tiles of A operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorA0_,
/// Iterates over tiles of A operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorA0_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB0_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB0_,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape1_,
/// Iterates over the intermediate accumulator tile
// (concept::MmaTensorOpFragmentIterator)
typename FragmentIteratorA1_,
/// Iterates over tiles of B operand in global memory
// (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
typename IteratorB1_,
/// Iterates over tiles of B operand in shared memory
/// (concept: WriteableTileIterator | RandomAccessTileIterator)
typename SmemIteratorB1_,
/// Data type of accumulator matrix
typename ElementC_,
/// Data type of accumulator matrix
typename LayoutC_,
/// Output operator for 1st Gemm (concept: epilogue::thread::LinearCombinationClamp, etc...)
typename OutputOp_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy0_,
/// Policy describing tuning details (concept: MmaPolicy)
typename Policy1_,
/// Transformation applied to A operand
typename TransformA0_ = NumericArrayConverter<
typename SmemIteratorA0_::Element,
typename IteratorA0_::Element,
IteratorA0_::Fragment::kElements>,
///
/// Transformation applied to B0 operand
typename TransformB0_ = NumericArrayConverter<
typename SmemIteratorB0_::Element,
typename IteratorB0_::Element,
IteratorB0_::Fragment::kElements>,
///
/// Transformation applied to B1 operand
typename TransformB1_ = NumericArrayConverter<
typename SmemIteratorB1_::Element,
typename IteratorB1_::Element,
IteratorB1_::Fragment::kElements>,
/// Used for partial specialization
typename Enable = bool
>
class B2bImplicitGemmPipelined : public gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
public:
///< Base class
using Base = gemm::threadblock::B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2>;
using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory
using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory
using Policy0 = Policy0_; ///< Policy0 describing tuning details
using SmemIteratorA0 = SmemIteratorA0_;
using SmemIteratorB0 = SmemIteratorB0_;
using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<>
using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over the intermediate accumulator tile to form the A operand of the second GEMM
using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory
using Policy1 = Policy1_; ///< Policy1 describing tuning details
using SmemIteratorB1 = SmemIteratorB1_;
using ElementC = ElementC_; ///< Data type of accumulator matrix
using LayoutC = LayoutC_; ///< Layout of accumulator matrix
using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm
using TransformA0 = TransformA0_;
using TransformB0 = TransformB0_;
using TransformB1 = TransformB1_;
//
// Dependent types
//
/// Fragment of operand A loaded from global memory
using FragmentA0 = typename IteratorA0::Fragment;
/// Fragment of operand B loaded from global memory
using FragmentB0 = typename IteratorB0::Fragment;
/// Fragment of accumulator tile
using FragmentC0 = typename Policy0::Operator::FragmentC;
/// Warp-level Mma
using Operator0 = typename Policy0::Operator;
/// Fragment of operand B loaded from global memory
using FragmentB1 = typename IteratorB1::Fragment;
/// Fragment of accumulator tile
using FragmentC1 = typename Policy1::Operator::FragmentC;
/// Warp-level Mma
using Operator1 = typename Policy1::Operator;
/// Obtain the arch tag from the warp-level operator
using ArchTag = typename Policy0::Operator::ArchTag;
/// Complex transform on A0 operand
static ComplexTransform const kTransformA0 = Operator0::kTransformA;
/// Complex transform on B0 operand
static ComplexTransform const kTransformB0 = Operator0::kTransformB;
/// Complex transform on B1 operand
static ComplexTransform const kTransformB1 = Operator1::kTransformB;
// statically assert kStages for MmaPipelined is two (double-buffered pipeline)
static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
private:
using WarpFragmentA0 = typename Operator0::FragmentA;
using WarpFragmentB0 = typename Operator0::FragmentB;
/// Warp Fragment of operand A1 loaded from accumulator tile
using WarpFragmentA1 = typename FragmentIteratorA1::Fragment;
using WarpFragmentB1 = typename Operator1::FragmentB;
protected:
/// Iterator to write threadblock-scoped tile of A operand to shared memory
SmemIteratorA0 smem_iterator_A_;
/// Iterator to write threadblock-scoped tile of B0 operand to shared memory
SmemIteratorB0 smem_iterator_B0_;
/// Iterator to write threadblock-scoped tile of B1 operand to shared memory
SmemIteratorB1 smem_iterator_B1_;
public:
/// Construct from tensor references
CUTLASS_DEVICE
B2bImplicitGemmPipelined(
typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
int thread_idx, ///< ID within the threadblock
int warp_idx, ///< ID of warp
int lane_idx ///< ID of each thread within a warp
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),
smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx),
smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) {
// Compute warp location within threadblock tile by mapping the warp_id to
// three coordinates:
// _m: the warp's position within the threadblock along the M dimension
// _n: the warp's position within the threadblock along the N dimension
// _k: the warp's position within the threadblock along the K dimension
int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;
int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;
//These may change across different GEMM layers
int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k;
int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k;
// Add per-warp offsets in units of warp-level tiles
this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k_0});
this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n});
this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n});
}
/// Perform a threadblock-scoped matrix multiply-accumulate
CUTLASS_DEVICE
void operator()(
int gemm_k_iterations_0, ///< number of iterations of the mainloop
FragmentC1 &accum, ///< destination accumulator tile
IteratorA0 iterator_A, ///< iterator over A operand in global memory
IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory
IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory
FragmentC0 const &src_accum, ///< source accumulator tile
OutputOp output_op_0, ///< epilogue operation after 1st Gemm
TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment
TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment
TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment
//
// Prologue
//
// Perform accumulation in the 'd' output operand
FragmentC0 accum0 = src_accum;
FragmentA0 tb_frag_A;
FragmentB0 tb_frag_B0;
tb_frag_A.clear();
tb_frag_B0.clear();
// The last kblock is loaded in the prolog
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA0 warp_frag_A0[2];
WarpFragmentB0 warp_frag_B0[2];
this->warp_tile_iterator_A0_.set_kgroup_index(0);
this->warp_tile_iterator_B0_.set_kgroup_index(0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
Operator0 warp_mma0;
int smem_write_stage_idx = 1;
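// Double-buffer bookkeeping: with kStages fixed at 2, the shared-memory write stage
// simply toggles between the two slots (the ^= 1 below).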
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_GEMM_LOOP
for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
// as the case may be.
if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
// Write fragments to shared memory
this->smem_iterator_A_.store(transform_A0(tb_frag_A));
this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
__syncthreads();
++this->smem_iterator_A_;
++this->smem_iterator_B0_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_A0_.add_tile_offset(
{0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
this->warp_tile_iterator_B0_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
0});
}
smem_write_stage_idx ^= 1;
}
this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A0_;
++this->warp_tile_iterator_B0_;
if (warp_mma_k == 0) {
iterator_A.load(tb_frag_A);
iterator_B0.load(tb_frag_B0);
++iterator_A;
++iterator_B0;
}
warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
warp_frag_B0[warp_mma_k % 2], accum0);
}
}
// 2nd Implicit Gemm
/// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
//
// Prologue
//
FragmentB1 tb_frag_B1;
tb_frag_B1.clear();
// The last kblock is loaded in the prolog
iterator_B1.load(tb_frag_B1);
++iterator_B1;
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
++this->smem_iterator_B1_;
__syncthreads();
// Pair of fragments used to overlap shared memory loads and math instructions
WarpFragmentA1 warp_frag_A1[2];
WarpFragmentB1 warp_frag_B1[2];
this->warp_tile_iterator_B1_.set_kgroup_index(0);
warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0);
this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
Operator1 warp_mma1;
smem_write_stage_idx = 1;
int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
// Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
// shared memory loads (which have the tightest latency requirement).
//
// Mainloop
//
// Note: The main loop does not support Base::kWarpGemmIterations == 2.
CUTLASS_PRAGMA_UNROLL
for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
//
// Loop over GEMM K dimension
//
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
// Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
// as the case may be.
if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
__syncthreads();
++this->smem_iterator_B1_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1,
0});
}
smem_write_stage_idx ^= 1;
}
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
++warp_tile_iterator_A1_;
++this->warp_tile_iterator_B1_;
if (warp_mma_k == 0) {
iterator_B1.load(tb_frag_B1);
++iterator_B1;
}
warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
warp_frag_B1[warp_mma_k % 2], accum);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -635,40 +635,9 @@ public:
++stage, --gemm_k_iterations_1) {
if (gemm_k_iterations_1 == 0) {
// iterator_A1.clear_mask();
iterator_B1.clear_mask();
}
#if 0
iterator_A1.set_iteration_index(0);
this->smem_iterator_A1_.set_iteration_index(0);
// LDGSTS for operand A
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < Detail::TBLDGSTSIterationsA1; ++j) {
typename IteratorA1::AccessType *dst_ptr =
reinterpret_cast<typename IteratorA1::AccessType *>(
this->smem_iterator_A1_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorA1::kAccessesPerVector; ++v) {
int const kSrcBytes =
sizeof_bits<typename IteratorA1::Element>::value *
IteratorA1::ThreadMap::kElementsPerAccess /
IteratorA1::kAccessesPerVector / 8;
int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
++iterator_A0;
}
++this->smem_iterator_A0_;
}
#endif
iterator_B1.set_iteration_index(0);
this->smem_iterator_B1_.set_iteration_index(0);
@ -696,19 +665,14 @@ public:
}
// Move to the next stage
//iterator_A1.add_tile_offset({0, 1});
iterator_B1.add_tile_offset({1, 0});
//this->smem_iterator_A1_.add_tile_offset({0, 1});
this->smem_iterator_B1_.add_tile_offset({1, 0});
// Defines the boundary of a stage of cp.async.
cutlass::arch::cp_async_fence();
}
// Perform accumulation in the 'd' output operand
// FragmentC0 accum0 = src_accum;
// DEPBAR+SYNC
cutlass::arch::cp_async_wait<Base::kStages - 2>();
__syncthreads();
@ -722,7 +686,6 @@ public:
Operator1 warp_mma1;
// this->warp_tile_iterator_A1_.set_kgroup_index(0);
this->warp_tile_iterator_B1_.set_kgroup_index(0);
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0);
@ -732,7 +695,6 @@ public:
++this->warp_tile_iterator_B1_;
if (gemm_k_iterations_1 == 0) {
// iterator_A1.clear_mask();
iterator_B1.clear_mask();
}
@ -762,7 +724,6 @@ public:
// Load warp-level tiles from shared memory, wrapping to k offset if
// this is the last group as the case may be.
// this->warp_tile_iterator_A1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
@ -777,6 +738,7 @@ public:
warp_loaded_frag_A1[warp_mma_k % 2],
warp_loaded_frag_B1[warp_mma_k % 2]);
warp_mma1(
accum,
warp_transformed_frag_A1[warp_mma_k % 2],
@ -823,7 +785,7 @@ public:
if (smem_read_stage_idx == (Base::kStages - 1)) {
this->warp_tile_iterator_B1_.add_tile_offset(
{-Base::kStages * Policy0::kPartitionsK *
{-Base::kStages * Policy1::kPartitionsK *
Base::kWarpGemmIterations1,
0});
smem_read_stage_idx = 0;
@ -831,7 +793,6 @@ public:
++smem_read_stage_idx;
}
// --gemm_k_iterations_1;
if (gemm_k_iterations_1 == 1) {
iterator_B1.clear_mask();
}

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -454,11 +454,11 @@ public:
this->smem_iterator_B1_.store(tb_frag_B1);
__syncthreads();
++smem_iterator_B1_;
++this->smem_iterator_B1_;
// Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
if (smem_write_stage_idx == 1) {
smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
}
else {
this->warp_tile_iterator_B1_.add_tile_offset(

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -93,7 +93,7 @@ template <
struct DefaultB2bMma;
////////////////////////////////////////////////////////////////////////////////
/// Specialization for row-major output
/// Specialization for row-major output with 2-stage pipeline
template <
/// Element type for A matrix operand
typename ElementA,
@ -110,8 +110,6 @@ template <
/// Element type for internal accumulation
typename ElementAccumulator,
/// Tag indicating architecture to tune for
typename OperatorClass,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
@ -129,7 +127,7 @@ template <
typename EpilogueOutputOp>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator, layout::RowMajor,
OperatorClass, ArchTag,
arch::OpClassTensorOp, ArchTag,
ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1,
InstructionShape, 2, Operator, EpilogueOutputOp, false> {
@ -137,11 +135,11 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
OperatorClass, 2, Operator>;
arch::OpClassTensorOp, 2, Operator>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
OperatorClass, 2, Operator>;
arch::OpClassTensorOp, 2, Operator>;
// Define iterators over tiles from the A operand
using IteratorA0 =
@ -162,7 +160,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>;
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;
// Define iterators over tiles from the B operand
using IteratorB1 =
@ -181,9 +179,120 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
};
////////////////////////////////////////////////////////////////////////////////
/// Specialization for row-major output for multi-stage
template <
/// Element type for A matrix operand
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape0,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape1,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape0,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape1,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Number of stages used in the multistage mainloop
int Stages,
/// Operation performed by GEMM
typename Operator,
/// Epilogue output operator
typename EpilogueOutputOp>
struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
kAlignmentB, ElementAccumulator, layout::RowMajor,
arch::OpClassTensorOp, ArchTag,
ThreadblockShape0, ThreadblockShape1,
WarpShape0, WarpShape1,
InstructionShape, Stages, Operator, EpilogueOutputOp, false> {
static cutlass::arch::CacheOperation::Kind const CacheOpA =
((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
static cutlass::arch::CacheOperation::Kind const CacheOpB =
((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
// Define the MmaCore components
using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, Operator, false, CacheOpA, CacheOpB>;
using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
Stages, Operator, false, CacheOpA, CacheOpB>;
// Define iterators over tiles from the A operand
using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
using AccessTypeA0 = cutlass::Array<ElementA, kAlignmentA>;
using IteratorA0 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>;
// Define iterators over tiles from the B operand
using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
using AccessTypeB0 = cutlass::Array<ElementB, kAlignmentB>;
using IteratorB0 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>;
// Use fragment iterator for A operand
using AccumulatorLayout = cutlass::layout::ColumnMajor;
using FragmentIteratorA1 =
cutlass::gemm::warp::MmaTensorOpFragmentIterator<
cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
using AccessTypeB1 = cutlass::Array<ElementB, kAlignmentB>;
using IteratorB1 =
cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB1>;
// Define the threadblock-scoped pipelined matrix multiply
using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage<
typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
MmaCore0::kCacheOpA,
IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB,
typename MmaCore1::Shape, FragmentIteratorA1,
IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
ElementAccumulator, layout::RowMajor,
EpilogueOutputOp,
typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>;
};
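// For example, with cutlass::half_t operands and kAlignmentA == 8:
//   sizeof_bits<ElementA>::value * kAlignmentA == 16 * 8 == 128 bits per access,
//   so CacheOpA resolves to CacheOperation::Global (the 16-byte cp.async.cg path);
//   narrower accesses (e.g. kAlignmentA == 4 -> 64 bits) fall back to CacheOperation::Always.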
////////////////////////////////////////////////////////////////////////////////
/// Specialization for column-major-interleaved output
/// Specialization for column-major-interleaved output with 2-stage pipeline
template <
/// Element type for A matrix operand
typename ElementA,
@ -258,7 +367,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout,
InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>;
InstructionShape, EpilogueOutputOp>;
// Define iterators over tiles from the B operand
using IteratorB1 =
@ -281,7 +390,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
////////////////////////////////////////////////////////////////////////////////
/// Specialization for column-major-interleaved output
/// Specialization for column-major-interleaved output with multi-stage
template <
/// Element type for A matrix operand
typename ElementA,
@ -360,7 +469,7 @@ struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
MmaCore1::Shape::kK, //kBlocksColumn
ElementAccumulator, ElementA, AccumulatorLayout,
InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>;
InstructionShape, EpilogueOutputOp>;
// Define iterators over tiles from the B operand
using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -191,8 +191,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
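For reference, the host-side sequence after this change, assembled into one minimal sketch (Gemm, arguments, and workspace are as defined earlier in the example):

  Gemm gemm_op;

  // Reject unsupported problem sizes or alignments before binding the workspace.
  cutlass::Status status = gemm_op.can_implement(arguments);
  CUTLASS_CHECK(status);

  // Initialize the kernel with arguments and the workspace pointer, then launch it.
  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  status = gemm_op();
  CUTLASS_CHECK(status);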
@ -258,7 +262,7 @@ int main() {
}
if (!((props.major * 10 + props.minor) >= 80)) {
std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80."
std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80."
<< std::endl;
notSupported = true;
}

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -223,8 +223,12 @@ int run() {
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@ -22,7 +22,7 @@
cutlass_example_add_executable(
22_ampere_tensorop_conv2dfprop
16_ampere_tensorop_conv2dfprop
ampere_tensorop_conv2dfprop.cu
)

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -318,7 +318,7 @@ struct Options {
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "22_ampere_tensorop_conv2dfprop example\n\n"
out << "16_ampere_tensorop_conv2dfprop example\n\n"
<< " This example uses Ampere's Tensor Core operators on F16 data types to compute\n"
<< " forward convolution on tensors of layout NHWC.\n\n"
<< "Options:\n\n"
@ -340,8 +340,8 @@ struct Options {
<< " --tag <string> String to replicate across the first column in the results table\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
<< "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";
<< "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
<< "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";
return out;
}
@ -474,8 +474,8 @@ Result profile_convolution(Options const &options) {
// Split K dimension into 1 partitions
int split_k_slices = 1;
typename ImplicitGemm::Arguments arguments{
{
// Construct Conv2dProblemSize with user defined output size
cutlass::conv::Conv2dProblemSize problem_size(
options.input_size,
options.filter_size,
options.padding,
@ -483,15 +483,18 @@ Result profile_convolution(Options const &options) {
options.dilation,
options.output_size(),
mode,
split_k_slices
},
split_k_slices
);
// Construct ImplicitGemm::Argument structure with conv2d
// problem size, data pointers, and epilogue values
typename ImplicitGemm::Arguments arguments{
problem_size,
tensor_a.device_ref(),
tensor_b.device_ref(),
tensor_c.device_ref(),
tensor_c.device_ref(),
{options.alpha, options.beta},
};
//
@ -505,6 +508,9 @@ Result profile_convolution(Options const &options) {
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
result.status = implicit_gemm_op.can_implement(arguments);
CUTLASS_CHECK(result.status);
result.status = implicit_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(result.status);
@ -522,15 +528,6 @@ Result profile_convolution(Options const &options) {
if (options.reference_check) {
std::cout << "Verification on host...\n";
cutlass::conv::Conv2dProblemSize problem_size(
options.input_size,
options.filter_size,
options.padding,
options.conv_stride,
options.dilation,
mode
);
// Compute with reference implementation
cutlass::reference::host::Conv2dFprop<
ElementInputA,
@ -576,7 +573,7 @@ Result profile_convolution(Options const &options) {
std::stringstream ss;
ss << "22_ampere_workspace_conv2dfprop_"
ss << "16_ampere_workspace_conv2dfprop_"
<< options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c()
<< "_"
<< options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c()
@ -667,7 +664,7 @@ int main(int argc, char const **args) {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@ -20,14 +20,9 @@
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
13_fused_two_gemms
fused_gemm.cu
)
target_include_directories(
13_fused_two_gemms
PRIVATE
.
17_fprop_per_channel_bias
fprop_per_channel_bias.cu
)

View File

@ -0,0 +1,300 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
The convolution version of 12_gemm_bias_relu. Similarly, we put the bias vector in Operand C and
the rest is the same as a normal convolution.
*/
#include <iostream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/device/convolution.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
// The code section below describes the data types of the input tensors, the output tensor, and
// the computation between elements
using ElementAccumulator = float; // Data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation
using ElementInputA = cutlass::half_t; // Data type of elements in input tensor
using ElementInputB = cutlass::half_t; // Data type of elements in input tensor
using ElementOutput = float; // Data type of elements in output tensor
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm80;
// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; // Threadblock tile shape
// This code section describes tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; // Warp tile shape
// This code section describes the size of MMA op
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
// Number of pipeline stages to use
constexpr int NumStages = 4;
// This code section describes whether the selected iterator algorithm is Analytic or Optimized
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
ElementOutput, // Data type of output matrix.
128 / cutlass::sizeof_bits<ElementOutput>::value, // The number of elements per vectorized
// memory access. This becomes the vector width of
// math instructions in the epilogue too.
ElementAccumulator, // Data type of accumulator
ElementComputeEpilogue, // Data type for alpha in linear combination
cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // alpha * accumulator + per-channel bias (C); beta scaling disabled
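// Per output element, this epilogue therefore computes (matching the host-side verification at
// the end of run()):
//   D[n, p, q, k] = max(0, alpha * Acc[n, p, q, k] + bias[k])
// where Acc is the implicit-GEMM accumulator and bias is the 1x1x1xK tensor passed as operand C.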
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementInputA, LayoutInputA,
ElementInputB, LayoutInputB,
ElementOutput, LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ThreadblockShape,
WarpShape,
InstructionShape,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
cutlass::arch::OpMultiplyAdd,
IteratorAlgorithm
>::Kernel;
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////
int run() {
// Construct Conv2dProblemSize with user defined output size
cutlass::conv::Conv2dProblemSize problem_size(
{1, 7, 7, 512}, // activation
{512, 3, 3, 512}, // filter
{1, 1, 1, 1}, // padding
{1, 1}, // striding
{1, 1}, // dilation
cutlass::conv::Mode::kCrossCorrelation, // mode (convolution or cross-correlation)
1 // split-k slices
);
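// With these sizes, the 3x3 filter with padding 1 and stride 1 preserves the spatial extent:
//   P = (H + 2 * pad - R) / stride + 1 = (7 + 2 - 3) / 1 + 1 = 7, and likewise Q = 7,
// so problem_size.output_extent() is 1 x 7 x 7 x 512 (N x P x Q x K).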
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.activation_extent());
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.filter_extent());
// Create tensor C with dimensions 1x1x1xk which is the bias vector
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c_bias({1, 1, 1, problem_size.K});
// Create tensor D used to store output from CUTLASS kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(problem_size.output_extent());
// Create tensor D used to store the output from the reference kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(problem_size.output_extent());
// Fill input and output matrices on host using CUTLASS helper functions
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(4),
ElementInputA(-4),
0); // <- Fill tensor A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(4),
ElementInputB(-4),
0); // <- Fill tensor B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_c_bias.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
cutlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c_bias.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
// Create a structure of implicit GEMM convolution kernel arguments. This is later passed to
// launch the instantiated CUTLASS kernel
typename ImplicitGemm::Arguments arguments{
problem_size,
tensor_a.device_ref(), // <- reference to tensor A on device
tensor_b.device_ref(), // <- reference to tensor B on device
// tensor C is treated as the bias vector. We can enable the CONV
// to project away the N, H, W dimensions by setting the stride to zero.
{tensor_c_bias.device_data(), LayoutOutput::Stride(0)},
tensor_d.device_ref(), // <- reference to tensor D on device
{alpha} };
// Instantiate CUTLASS kernel depending on templates
ImplicitGemm implicit_gemm_op;
// Using the arguments, query for extra workspace required for the implicit GEMM convolution
size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check the problem size is supported or not
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = implicit_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
status = implicit_gemm_op();
CUTLASS_CHECK(status);
//
// Create instantiation for device reference conv kernel
//
// Launch device reference kernel to compute alpha * conv(A, B); bias and ReLU are applied on the host below
cutlass::reference::device::Conv2d<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementAccumulator,
cutlass::NumericConverter<ElementOutput, ElementComputeEpilogue>>
(
cutlass::conv::Operator::kFprop,
problem_size,
tensor_a.device_ref(),
tensor_b.device_ref(),
tensor_c_bias.device_ref(),
tensor_ref_d.device_ref(),
alpha, 0
);
// Wait for kernels to finish
cudaDeviceSynchronize();
// Copy output data from CUTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Compute bias + relu in host code
for (int n = 0; n < problem_size.N; ++n) {
for (int p = 0; p < problem_size.P; ++p) {
for (int q = 0; q < problem_size.Q; ++q) {
for (int k = 0; k < problem_size.K; ++k) {
tensor_ref_d.at({n, p, q, k}) =
std::max(ElementOutput(0),
ElementOutput(tensor_ref_d.at({n, p, q, k}) +
tensor_c_bias.at({0, 0, 0, k})));
}
}
}
}
// Check if output from CUTLASS kernel and reference kernel are equal or not
std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(),
tensor_ref_d.host_view())
? "Passed"
: "Failed")
<< std::endl;
CUTLASS_CHECK(status);
return 0;
}
int main(int argc, char const **args) {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
<< std::endl;
notSupported = true;
}
if (notSupported) {
return 0;
}
return run();
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,4 +1,4 @@
# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
@ -78,10 +78,11 @@ foreach(EXAMPLE
10_planar_complex
11_planar_complex_array
12_gemm_bias_relu
13_fused_two_gemms
13_two_tensor_op_fusion
14_ampere_tf32_tensorop_gemm
15_ampere_sparse_tensorop_gemm
22_ampere_tensorop_conv2dfprop
16_ampere_tensorop_conv2dfprop
17_fprop_per_channel_bias
)
add_subdirectory(${EXAMPLE})

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -187,10 +187,10 @@ struct global_load<AccessType,
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
/// Fragment type to store loaded data
/// Fragment type to store data
typename AccessType,
/// The bytes of loading
int LoadBytes
/// The bytes of storing
int StoreBytes
>
struct global_store;
@ -294,7 +294,6 @@ struct global_store<AccessType, 1> {
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace arch
} // namespace cutlass

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -73,7 +73,7 @@ inline __device__ void ldsm(Array<unsigned, MatrixCount> & D, void const* ptr);
#endif
*/
#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
#if (! defined (__clang__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
extern "C" {
//
// This NVVM intrinsic is subject to change in future versions of CUDA.
@ -91,7 +91,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to
// the previous internal intrinsics if they are available.
#if (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11)
#if (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11)
//
// This NVVM intrinsic converts an address in shared memory to a plain
// unsigned integer. This is necessary to pass to shared memory instructions
@ -104,7 +104,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
/// CUTLASS helper to get SMEM pointer
return static_cast<unsigned>(__cvta_generic_to_shared(ptr));
#elif (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
#elif (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
return __nvvm_get_smem_pointer(ptr);
@ -120,7 +120,10 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) {
#else
return 0;
CUTLASS_UNUSED(ptr);
CUTLASS_NOT_IMPLEMENTED();
return 0;
#endif
}
@ -146,7 +149,9 @@ inline __device__ void ldsm<layout::RowMajor, 1>(
#else
assert(0);
CUTLASS_UNUSED(D);
CUTLASS_UNUSED(ptr);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
@ -168,7 +173,9 @@ inline __device__ void ldsm<layout::RowMajor, 2>(
#else
assert(0);
CUTLASS_UNUSED(D);
CUTLASS_UNUSED(ptr);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
@ -190,7 +197,9 @@ inline __device__ void ldsm<layout::RowMajor, 4>(
#else
assert(0);
CUTLASS_UNUSED(D);
CUTLASS_UNUSED(ptr);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
@ -216,7 +225,9 @@ inline __device__ void ldsm<layout::ColumnMajor, 1>(
#else
assert(0);
CUTLASS_UNUSED(D);
CUTLASS_UNUSED(ptr);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
@ -238,7 +249,9 @@ inline __device__ void ldsm<layout::ColumnMajor, 2>(
#else
assert(0);
CUTLASS_UNUSED(D);
CUTLASS_UNUSED(ptr);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
@ -260,7 +273,9 @@ inline __device__ void ldsm<layout::ColumnMajor, 4>(
#else
assert(0);
CUTLASS_UNUSED(D);
CUTLASS_UNUSED(ptr);
CUTLASS_NOT_IMPLEMENTED();
#endif
}

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -74,15 +74,16 @@ template <
/// Size of the access in bytes
int SizeInBytes>
struct cp_async<SizeInBytes, CacheOperation::Always> {
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
/// Copy
CUTLASS_DEVICE
cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
#if CUDA_CP_ASYNC_ACTIVATED
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
asm volatile(
@ -108,15 +109,16 @@ template <
/// Size of the access in bytes
int SizeInBytes>
struct cp_async_zfill<SizeInBytes, CacheOperation::Always> {
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
/// Copy with zero fill
CUTLASS_DEVICE
cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
#if CUDA_CP_ASYNC_ACTIVATED
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
int src_in_bytes = (pred_guard ? SizeInBytes : 0);
@ -146,16 +148,13 @@ template <
/// Size of the access in bytes
int SizeInBytes>
struct cp_async<SizeInBytes, CacheOperation::Global> {
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
/// Copy
CUTLASS_DEVICE
cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
#if CUDA_CP_ASYNC_ACTIVATED
static_assert(SizeInBytes == 16,
static_assert(SizeInBytes == 16,
"cp.async only supports CacheOperation::Global when access size is 16B.");
unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
@ -183,16 +182,13 @@ template <
/// Size of the access in bytes
int SizeInBytes>
struct cp_async_zfill<SizeInBytes, CacheOperation::Global> {
// Make sure the size is supported.
static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16),
"Size is not supported");
/// Copy with zero fill
CUTLASS_DEVICE
cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
#if CUDA_CP_ASYNC_ACTIVATED
static_assert(SizeInBytes == 16,
static_assert(SizeInBytes == 16,
"cp.async only supports CacheOperation::Global when access size is 16B.");
unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);

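The relocated static_asserts encode the supported access sizes: 4, 8, or 16 bytes for CacheOperation::Always, and 16 bytes only for CacheOperation::Global (cp.async.cg). A hedged sketch of how device code issues these copies using the constructors shown above (smem_ptr, gmem_ptr, and guard are placeholder names):

  // Each constructor call emits one cp.async copy from global to shared memory,
  // predicated by 'guard'.
  cutlass::arch::cp_async<16, cutlass::arch::CacheOperation::Global>(smem_ptr, gmem_ptr, guard);

  // Zero-fill variant: when 'guard' is false, the destination bytes are zeroed instead of loaded.
  cutlass::arch::cp_async_zfill<8, cutlass::arch::CacheOperation::Always>(smem_ptr, gmem_ptr, guard);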
View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -112,7 +112,13 @@ struct Mma<
);
#else
assert(0);
CUTLASS_UNUSED(d);
CUTLASS_UNUSED(a);
CUTLASS_UNUSED(b);
CUTLASS_UNUSED(c);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
};
@ -178,7 +184,13 @@ struct Mma<
);
#else
assert(0);
CUTLASS_UNUSED(d);
CUTLASS_UNUSED(a);
CUTLASS_UNUSED(b);
CUTLASS_UNUSED(c);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
};
@ -230,7 +242,13 @@ struct Mma<gemm::GemmShape<16, 8, 8>, 32, tfloat32_t, layout::RowMajor,
"f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
assert(0);
CUTLASS_UNUSED(d);
CUTLASS_UNUSED(a);
CUTLASS_UNUSED(b);
CUTLASS_UNUSED(c);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
};
@ -291,7 +309,13 @@ struct Mma<
);
#else
assert(0);
CUTLASS_UNUSED(d);
CUTLASS_UNUSED(a);
CUTLASS_UNUSED(b);
CUTLASS_UNUSED(c);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
};
@ -352,7 +376,13 @@ struct Mma<
"f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
assert(0);
CUTLASS_UNUSED(d);
CUTLASS_UNUSED(a);
CUTLASS_UNUSED(b);
CUTLASS_UNUSED(c);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
};
@ -413,7 +443,13 @@ struct Mma<
"f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
assert(0);
CUTLASS_UNUSED(d);
CUTLASS_UNUSED(a);
CUTLASS_UNUSED(b);
CUTLASS_UNUSED(c);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
};
@ -472,7 +508,13 @@ struct Mma<
: "d"(A), "d"(B), "d"(C[0]), "d"(C[1]));
#else
assert(0);
CUTLASS_UNUSED(d);
CUTLASS_UNUSED(a);
CUTLASS_UNUSED(b);
CUTLASS_UNUSED(c);
CUTLASS_NOT_IMPLEMENTED();
#endif
}
};

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -45,9 +45,6 @@ template <
class Array<T, N, false> {
public:
static_assert(sizeof_bits<T>::value * N >= 8,
"Array<> specialized for sub-byte types assume the actual stored element size is 1 byte");
static int const kSizeBits = sizeof_bits<T>::value * N;
/// Storage type

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:

View File

@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
@ -52,6 +52,23 @@ enum class ComplexTransform {
kConjugate
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Defines ComplexTransform inversions
template <ComplexTransform kTransform>
struct InvertComplexTransform;
/// Invert ComplexTransform from kNone to kConjugate
template <>
struct InvertComplexTransform<ComplexTransform::kNone> {
static ComplexTransform const transform = ComplexTransform::kConjugate;
};
/// Invert ComplexTransform from kConjugate to kNone
template <>
struct InvertComplexTransform<ComplexTransform::kConjugate> {
static ComplexTransform const transform = ComplexTransform::kNone;
};
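// Illustrative use: the two specializations are mutual inverses, e.g.
//   static_assert(InvertComplexTransform<ComplexTransform::kNone>::transform ==
//                 ComplexTransform::kConjugate, "");
//   static_assert(InvertComplexTransform<ComplexTransform::kConjugate>::transform ==
//                 ComplexTransform::kNone, "");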
/////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
//
@ -291,6 +308,30 @@ CUTLASS_HOST_DEVICE T &imag(complex<T> &z) {
return z.imag();
}
/// Returns the real part of the real number
template <typename T>
CUTLASS_HOST_DEVICE T const &real(T const &r) {
return r;
}
/// Returns the real part of the real number
template <typename T>
CUTLASS_HOST_DEVICE T &real(T &r) {
return r;
}
/// Returns the imaginary part of the real number
template <typename T>
CUTLASS_HOST_DEVICE T const &imag(T const &r) {
return T();
}
/// Returns the imaginary part of the real number
template <typename T>
CUTLASS_HOST_DEVICE T &imag(T &r) {
return T();
}
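// These scalar overloads let generic code call real()/imag() uniformly on real and complex
// operands: for a plain float x, real(x) returns x itself, while imag(x) yields a
// default-constructed (zero) value.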
//
// Output operators
//

Some files were not shown because too many files have changed in this diff