// cutlass/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h

/***************************************************************************************************
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include <cuda_fp16.h>
#include <cstdio>       // printf from device code
#include <type_traits>  // std::conditional

// Elementwise addition; the half2 specialization uses the packed __hadd2 intrinsic.
template <typename T>
__device__
T add(T const &a, T const &b) {
  return a + b;
}

template <>
__device__
half2 add(half2 const &a, half2 const &b) {
  return __hadd2(a, b);
}

// ReLU functor. The half2 overload clamps both packed lanes in fp32; the
// printf is leftover debug output that cannot fire once both lanes are clamped.
template <typename T>
struct RELU {
  __device__
  T operator()(T const &a) {
    return a > T(0) ? a : T(0);
  }

  __device__
  half2 operator()(half2 const &a) {
    float2 a_fp32x2 = __half22float2(a);
    a_fp32x2.x = a_fp32x2.x > 0.f ? a_fp32x2.x : 0.f;
    a_fp32x2.y = a_fp32x2.y > 0.f ? a_fp32x2.y : 0.f;
    if (a_fp32x2.x < 0.f || a_fp32x2.y < 0.f) {
      printf(" %f %f\n", a_fp32x2.x, a_fp32x2.y);
    }
    return __float22half2_rn(a_fp32x2);
  }
};

// Leaky ReLU functor. The half2 overload builds a per-lane multiplier:
// __hge2/__hle2 return 1.0 in lanes where the comparison holds, so
// mask = (a <= 0) * scale + (a >= 0), i.e. scale for negative lanes and 1 for
// positive lanes, and the result is a * mask.
template <typename T>
struct LEAKY_RELU {
  __device__
  T operator()(T const &a, T const &scale = T(1)) {
    return a > T(0) ? a : scale * a;
  }

  __device__
  half2 operator()(half2 const &a, half const &scale = half(1)) {
    half2 zero = __half2half2(half(0));
    half2 gt_zero = __hge2(a, zero);
    half2 le_zero = __hle2(a, zero);
    half2 scale_f16x2 = __half2half2(scale);
    half2 mask_scale_f16x2 = __hfma2(le_zero, scale_f16x2, gt_zero);
    return __hmul2(a, mask_scale_f16x2);
  }
};

// Applies bias add + leaky ReLU in place. inout is laid out as b batches of an
// m x N row-major matrix; grid = (m, b) and each thread block processes one row.
// When N is even, rows are accessed as half2 for vectorized loads/stores.
template <int N, int BLOCKDIM>
__global__ void leaky_and_activation(half *inout, half *bias, half scale, bool mat_bias) {
  constexpr bool N_MOD_2 = N & 1 ? false : true;
  using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;
  constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);
  constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

  LEAKY_RELU<half> Act;
  Access_tp src_v[iter];
  Access_tp bias_v[iter];

  int batch_id = blockIdx.y;
  int batch_offset = batch_id * gridDim.x * N;

  for (int i = 0; i < iter; i++) {
    int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
    if (idx < N) {
      src_v[i] = *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset);
      // mat_bias: bias is a full m x N matrix per batch; otherwise a length-N vector per batch.
      if (mat_bias)
        bias_v[i] = *reinterpret_cast<Access_tp *>(bias + blockIdx.x * N + idx + batch_offset);
      else
        bias_v[i] = *reinterpret_cast<Access_tp *>(bias + idx + batch_id * N);
      *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset) =
          Act(add(src_v[i], bias_v[i]), scale);
    }
  }
}

// Applies leaky ReLU in place (no bias); same layout and vectorization as above.
template <int N, int BLOCKDIM>
__global__ void leaky_and_activation(half *inout, half scale) {
  constexpr bool N_MOD_2 = N & 1 ? false : true;
  using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;
  constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);
  constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

  int batch_id = blockIdx.y;
  int batch_offset = batch_id * gridDim.x * N;

  LEAKY_RELU<half> Act;
  Access_tp src_v[iter];

  for (int i = 0; i < iter; i++) {
    int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
    if (idx < N) {
      src_v[i] = *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset);
      *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset) = Act(src_v[i], scale);
    }
  }
}

// Host-side launcher for bias + leaky ReLU: grid = (m, b), one thread block per
// output row; a null bias dispatches the bias-free kernel.
template <int N, int BLOCKDIM>
void leaky_and_activation(half *inout, half *bias, int m, int b, half scale, bool mat_bias) {
  dim3 grid(m, b);
  if (bias == nullptr)
    leaky_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, scale);
  else
    leaky_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, bias, scale, mat_bias);
}

// Applies bias add + ReLU in place; same layout as the leaky variant above.
template <int N, int BLOCKDIM>
__global__ void relu_and_activation(half *inout, half *bias, bool mat_bias) {
  constexpr bool N_MOD_2 = N & 1 ? false : true;
  using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;
  constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);
  constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

  RELU<half> Act;
  Access_tp src_v[iter];
  Access_tp bias_v[iter];

  int batch_id = blockIdx.y;
  int batch_offset = batch_id * gridDim.x * N;

  for (int i = 0; i < iter; i++) {
    int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
    if (idx < N) {
      src_v[i] = *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset);
      if (mat_bias)
        bias_v[i] = *reinterpret_cast<Access_tp *>(bias + blockIdx.x * N + idx + batch_offset);
      else
        bias_v[i] = *reinterpret_cast<Access_tp *>(bias + idx + batch_id * N);
      *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset) = Act(add(src_v[i], bias_v[i]));
    }
  }
}

// Applies ReLU in place (no bias).
template <int N, int BLOCKDIM>
__global__ void relu_and_activation(half *inout) {
  constexpr bool N_MOD_2 = N & 1 ? false : true;
  using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;
  constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);
  constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

  int batch_id = blockIdx.y;
  int batch_offset = batch_id * gridDim.x * N;

  RELU<half> Act;
  Access_tp src_v[iter];

  for (int i = 0; i < iter; i++) {
    int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
    if (idx < N) {
      src_v[i] = *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset);
      *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset) = Act(src_v[i]);
    }
  }
}

// Host-side launcher for bias + ReLU; a null bias dispatches the bias-free kernel.
template <int N, int BLOCKDIM>
void relu_and_activation(half *inout, half *bias, int m, int b, bool mat_bias) {
  dim3 grid(m, b);
  if (bias == nullptr)
    relu_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout);
  else
    relu_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, bias, mat_bias);
}

// Applies bias add only (identity activation) in place.
template <int N, int BLOCKDIM>
__global__ void identity_and_activation(half *inout, half *bias, bool mat_bias) {
  constexpr bool N_MOD_2 = N & 1 ? false : true;
  using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;
  constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);
  constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

  int batch_id = blockIdx.y;
  int batch_offset = batch_id * gridDim.x * N;

  Access_tp src_v[iter];
  Access_tp bias_v[iter];

  for (int i = 0; i < iter; i++) {
    int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
    if (idx < N) {
      src_v[i] = *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset);
      if (mat_bias)
        bias_v[i] = *reinterpret_cast<Access_tp *>(bias + blockIdx.x * N + idx + batch_offset);
      else
        bias_v[i] = *reinterpret_cast<Access_tp *>(bias + idx + batch_id * N);
      *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset) = add(src_v[i], bias_v[i]);
    }
  }
}

// Identity activation without bias: values are read and written back unchanged,
// so this kernel is effectively a pass-through.
template <int N, int BLOCKDIM>
__global__ void identity_and_activation(half *inout) {
  constexpr bool N_MOD_2 = N & 1 ? false : true;
  using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;
  constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);
  constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

  int batch_id = blockIdx.y;
  int batch_offset = batch_id * gridDim.x * N;

  Access_tp src_v[iter];

  for (int i = 0; i < iter; i++) {
    int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
    if (idx < N) {
      src_v[i] = *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset);
      *reinterpret_cast<Access_tp *>(inout + blockIdx.x * N + idx + batch_offset) = src_v[i];
    }
  }
}

// Host-side launcher for the bias-only epilogue; a null bias dispatches the pass-through kernel.
template <int N, int BLOCKDIM>
void identity_and_activation(half *inout, half *bias, int m, int b, bool mat_bias) {
  dim3 grid(m, b);
  if (bias == nullptr)
    identity_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout);
  else
    identity_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, bias, mat_bias);
}