/***************************************************************************************************
 * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <cuda_fp16.h>
#include <type_traits>   // std::conditional

// Elementwise add, specialized below for packed half2.
template <typename T>
__device__
T add(T const & a, T const & b){
    return a + b;
}

template <>
__device__
half2 add(half2 const & a, half2 const & b){
    return __hadd2(a, b);
}

// ReLU functor with a scalar path and a packed half2 path.
template <typename T>
struct RELU{
    __device__
    T operator()(T const & a){
        return a > T(0) ? a : T(0);
    }

    __device__
    half2 operator()(half2 const & a){
        // Clamp both half lanes in fp32, then round back to half2.
        float2 a_fp32x2 = __half22float2(a);
        a_fp32x2.x = a_fp32x2.x > 0.f ? a_fp32x2.x : 0.f;
        a_fp32x2.y = a_fp32x2.y > 0.f ? a_fp32x2.y : 0.f;
        return __float22half2_rn(a_fp32x2);
    }
};

// Leaky-ReLU functor: returns a for a > 0 and scale * a otherwise.
template <typename T>
struct LEAKY_RELU{
    __device__
    T operator()(T const & a, T const & scale = T(1)){
        return a > T(0) ? a : scale * a;
    }

    __device__
    half2 operator()(half2 const & a, half const & scale = half(1)){
        // Per-lane predicates: 1.0 where the comparison holds, 0.0 elsewhere.
        half2 zero = __half2half2(half(0));
        half2 ge_zero = __hge2(a, zero);
        half2 le_zero = __hle2(a, zero);

        // Per-lane multiplier: le_zero * scale + ge_zero, i.e. scale for
        // negative lanes and 1 for positive lanes (a zero lane stays zero).
        half2 scale_f16x2 = __half2half2(scale);
        half2 mask_scale_f16x2 = __hfma2(le_zero, scale_f16x2, ge_zero);
        return __hmul2(a, mask_scale_f16x2);
    }
};

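// Worked example of the packed path above (illustrative values, not taken from
// the source): for a = (-2.0, 3.0) and scale = 0.1, __hle2 yields (1, 0) and
// __hge2 yields (0, 1), so the fused multiply-add builds the per-lane
// multiplier (0.1, 1.0) and the final product is approximately (-0.2, 3.0).
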
// Fused bias-add + leaky-ReLU epilogue. Each block handles one row of length N
// of one batch; each thread makes `iter` strided accesses of half2 (or half
// when N is odd).
template <int N, int BLOCKDIM>
__global__ void leaky_and_activation(half* inout, half* bias, half scale, bool mat_bias){

    // Use packed half2 accesses when N is even, scalar half otherwise.
    constexpr bool N_MOD_2 = (N % 2) == 0;

    using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;

    constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);

    constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

    LEAKY_RELU<half> Act;
    Access_tp src_v[iter];
    Access_tp bias_v[iter];

    int batch_id = blockIdx.y;
    int batch_offset = batch_id * gridDim.x * N;

    for(int i = 0; i < iter; i++){
        int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
        if (idx < N){
            src_v[i] = *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset);
            if (mat_bias)
                // Per-element bias with the same m x N layout as inout.
                bias_v[i] = *reinterpret_cast<Access_tp*>(bias + blockIdx.x * N + idx + batch_offset);
            else
                // Single N-element bias vector per batch.
                bias_v[i] = *reinterpret_cast<Access_tp*>(bias + idx + batch_id * N);
            *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset) = Act(add(src_v[i], bias_v[i]), scale);
        }
    }
}

// Leaky-ReLU epilogue without a bias term.
template <int N, int BLOCKDIM>
__global__ void leaky_and_activation(half* inout, half scale){

    constexpr bool N_MOD_2 = (N % 2) == 0;

    using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;

    constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);

    constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

    int batch_id = blockIdx.y;
    int batch_offset = batch_id * gridDim.x * N;

    LEAKY_RELU<half> Act;
    Access_tp src_v[iter];

    for(int i = 0; i < iter; i++){
        int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
        if (idx < N){
            src_v[i] = *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset);
            *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset) = Act(src_v[i], scale);
        }
    }
}

// Host-side launcher: grid is (rows m, batches b); a null bias selects the
// bias-free kernel.
template <int N, int BLOCKDIM>
void leaky_and_activation(half* inout, half* bias, int m, int b, half scale, bool mat_bias){
    dim3 grid(m, b);
    if (bias == nullptr)
        leaky_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, scale);
    else
        leaky_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, bias, scale, mat_bias);
}

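// Usage sketch (not part of the original header): launching the leaky-ReLU
// epilogue over b batches of an m x N activation matrix. Buffer names and
// sizes below are illustrative assumptions.
//
//   constexpr int N = 256;          // row width, fixed at compile time
//   constexpr int BLOCKDIM = 128;   // threads per block
//   half *d_inout, *d_bias;         // device buffers, allocated elsewhere
//   int m = 1024, b = 1;            // rows per batch, number of batches
//   half scale = __float2half(0.1f);
//   // bias == nullptr skips the bias add; mat_bias selects a full m x N bias
//   // matrix (true) versus one N-element bias vector per batch (false).
//   leaky_and_activation<N, BLOCKDIM>(d_inout, d_bias, m, b, scale, /*mat_bias=*/false);
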
// Fused bias-add + ReLU epilogue; same access pattern as the leaky variant above.
template <int N, int BLOCKDIM>
__global__ void relu_and_activation(half* inout, half* bias, bool mat_bias){

    constexpr bool N_MOD_2 = (N % 2) == 0;

    using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;

    constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);

    constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

    RELU<half> Act;
    Access_tp src_v[iter];
    Access_tp bias_v[iter];

    int batch_id = blockIdx.y;
    int batch_offset = batch_id * gridDim.x * N;

    for(int i = 0; i < iter; i++){
        int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
        if (idx < N){
            src_v[i] = *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset);
            if (mat_bias)
                bias_v[i] = *reinterpret_cast<Access_tp*>(bias + blockIdx.x * N + idx + batch_offset);
            else
                bias_v[i] = *reinterpret_cast<Access_tp*>(bias + idx + batch_id * N);
            *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset) = Act(add(src_v[i], bias_v[i]));
        }
    }
}

// ReLU epilogue without a bias term.
template <int N, int BLOCKDIM>
__global__ void relu_and_activation(half* inout){

    constexpr bool N_MOD_2 = (N % 2) == 0;

    using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;

    constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);

    constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

    int batch_id = blockIdx.y;
    int batch_offset = batch_id * gridDim.x * N;

    RELU<half> Act;
    Access_tp src_v[iter];

    for(int i = 0; i < iter; i++){
        int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
        if (idx < N){
            src_v[i] = *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset);
            *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset) = Act(src_v[i]);
        }
    }
}

// Host-side launcher for the ReLU epilogue; a null bias selects the bias-free kernel.
template <int N, int BLOCKDIM>
void relu_and_activation(half* inout, half* bias, int m, int b, bool mat_bias){
    dim3 grid(m, b);
    if (bias == nullptr)
        relu_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout);
    else
        relu_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, bias, mat_bias);
}

// Bias-add only ("identity" activation): adds bias and writes the sum back.
template <int N, int BLOCKDIM>
__global__ void identity_and_activation(half* inout, half* bias, bool mat_bias){

    constexpr bool N_MOD_2 = (N % 2) == 0;

    using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;

    constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);

    constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

    int batch_id = blockIdx.y;
    int batch_offset = batch_id * gridDim.x * N;

    Access_tp src_v[iter];
    Access_tp bias_v[iter];

    for(int i = 0; i < iter; i++){
        int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
        if (idx < N){
            src_v[i] = *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset);
            if (mat_bias)
                bias_v[i] = *reinterpret_cast<Access_tp*>(bias + blockIdx.x * N + idx + batch_offset);
            else
                bias_v[i] = *reinterpret_cast<Access_tp*>(bias + idx + batch_id * N);
            *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset) = add(src_v[i], bias_v[i]);
        }
    }
}

// Identity epilogue without bias: a pass-through that rewrites the data unchanged.
template <int N, int BLOCKDIM>
__global__ void identity_and_activation(half* inout){

    constexpr bool N_MOD_2 = (N % 2) == 0;

    using Access_tp = typename std::conditional<N_MOD_2, half2, half>::type;

    constexpr int Access_elements = sizeof(Access_tp) / sizeof(half);

    constexpr int iter = (N + (BLOCKDIM * Access_elements) - 1) / (BLOCKDIM * Access_elements);

    int batch_id = blockIdx.y;
    int batch_offset = batch_id * gridDim.x * N;

    Access_tp src_v[iter];

    for(int i = 0; i < iter; i++){
        int idx = (i * BLOCKDIM + threadIdx.x) * Access_elements;
        if (idx < N){
            src_v[i] = *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset);
            *reinterpret_cast<Access_tp*>(inout + blockIdx.x * N + idx + batch_offset) = src_v[i];
        }
    }
}

// Host-side launcher for the identity (bias-only) epilogue.
template <int N, int BLOCKDIM>
void identity_and_activation(half* inout, half* bias, int m, int b, bool mat_bias){
    dim3 grid(m, b);
    if (bias == nullptr)
        identity_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout);
    else
        identity_and_activation<N, BLOCKDIM><<<grid, BLOCKDIM>>>(inout, bias, mat_bias);
}

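// Layout note (inferred from the indexing above): all three launchers treat
// inout as b batches of a row-major m x N matrix, so element (row, col) of
// batch k lives at inout[k * m * N + row * N + col]. When mat_bias is true the
// bias uses the same m x N layout per batch; otherwise it is read as one
// N-element vector per batch, i.e. bias[k * N + col].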