#################################################################################################
#
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
|
|
Utilities for enumerating CUTLASS library kernels
|
|
"""
|
|
|
|
import argparse
|
|
import enum
|
|
from itertools import product
|
|
import logging
|
|
import os.path
|
|
import shutil
|
|
import sys
|
|
import copy
|
|
from typing import Any, Optional, Sequence, Tuple
|
|
|
|
_LOGGER = logging.getLogger(__name__)
|
|
|
|
def logging_prefix(indent_level: int = 0) -> str:
|
|
"""String prefix for start of each debug log entry"""
|
|
prefix = '*** '
|
|
indent = ' '
|
|
return f"{prefix}{indent_level * indent}"
|
|
|
|
def log_debug_line(line: str, indent_level: int = 0) -> None:
|
|
"""Log one line of debug output"""
|
|
prefix = logging_prefix(indent_level)
|
|
_LOGGER.debug(prefix + line)
|
|
|
|
# Certain usecases of cutlass_library nearly always prefer to run as scripts with
|
|
# relative imports, rather than via an installed Python package. An example of this
|
|
# is using CUTLASS's CMake system to generate a library of kernels to be profiled.
|
|
# To make it easy to use these use cases when an existing installation of cutlass_library
|
|
# exists, this global flag can be set to true (via command-line arguments) to ensure
|
|
# that package-based installations are not used.
|
|
|
|
# Create a temporary argument parser to check only for the availability of the
|
|
# --disable-cutlass-package-imports argument, which controls whether package-based
|
|
# imports are disabled.
|
|
def _add_package_disablement_flag(argparser):
|
|
argparser.add_argument("--disable-cutlass-package-imports", action='store_true', required=False,
|
|
help="Disable use of cutlass_library from Python package")
|
|
|
|
_parser = argparse.ArgumentParser()
|
|
_add_package_disablement_flag(_parser)
|
|
_args, _ = _parser.parse_known_args()
|
|
|
|
# Add `CUTLASS_IGNORE_PACKAGE` to `builtins` so that it is visible for gating future
|
|
# imports without requiring importing another module. Ideally, we would just place this
|
|
# as a global variable in a module to that could be imported and checked (e.g.,
|
|
# utils.CUTLASS_IGNORE_PACKAGE). However, this raises the issue of determining
|
|
# where this module should be sourced (from the cutlass_library package or from
|
|
# a relative import), which is the problem this variable is being used to solve in the
|
|
# first place.
|
|
import builtins
|
|
builtins.CUTLASS_IGNORE_PACKAGE = _args.disable_cutlass_package_imports
|
|
|
|
try:
|
|
if CUTLASS_IGNORE_PACKAGE:
|
|
raise ImportError("Disabling attempt to import cutlass_library")
|
|
from cutlass_library.library import *
|
|
from cutlass_library.manifest import *
|
|
except ImportError:
|
|
from library import *
|
|
from manifest import *
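
# Illustrative usage (an editorial sketch, not part of the original file):
# running this module as a script with the flag above forces the relative
# imports in the except-branch even when the cutlass_library package is
# installed. The other arguments shown are placeholders for whatever the
# full script accepts.
#
#   python generator.py --disable-cutlass-package-imports [other arguments...]
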
###################################################################################################

#
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):

  # by default, use the latest CUDA Toolkit version
  cuda_version = [11, 0, 132]

  # Update cuda_version based on parsed string
  if semantic_ver_string != '':
    for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]):
      if i < len(cuda_version):
        cuda_version[i] = x
      else:
        cuda_version.append(x)
  return cuda_version >= [major, minor, patch]
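
# Illustrative sketch (not part of the original generator): how the list-wise
# version comparison above behaves. The helper below is only defined, never called.
def _example_cuda_version_check():
  """Hedged usage example for CudaToolkitVersionSatisfies."""
  assert CudaToolkitVersionSatisfies('11.4', 11, 1)      # [11, 4, 132] >= [11, 1, 0]
  assert not CudaToolkitVersionSatisfies('11.4', 12, 0)  # [11, 4, 132] <  [12, 0, 0]
  assert CudaToolkitVersionSatisfies('', 11, 0, 132)     # default is [11, 0, 132]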

###################################################################################################
###################################################################################################

#
def EpilogueAlignment(max_alignment, tile, epilogue_steps = 8):
  ''' Helper to compute the maximum alignment of the epilogue '''

  def product(X, identity = 1):
    result = identity
    for item in X:
      result *= item
    return result

  elements_per_thread = product(tile.threadblock_shape[:-1]) // product(tile.warp_count) // 32 // epilogue_steps
  return min(max_alignment, elements_per_thread)
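
# Worked example (editorial, with assumed tile values): for a tile with
# threadblock_shape = [128, 128, 8] and warp_count = [4, 2, 1], the epilogue
# distributes 128 * 128 = 16384 elements over 4 * 2 * 1 = 8 warps of 32
# threads across 8 steps, i.e. 16384 // 8 // 32 // 8 = 8 elements per thread,
# so EpilogueAlignment(8, tile) would return min(8, 8) == 8.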

def DefaultSwizzlingFunctor():
  return SwizzlingFunctor.Identity8
  # To use StreamK decomposition for basic GEMMs, set `swizzling_functor = SwizzlingFunctor.StreamK`
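
# Illustrative call shape (editorial sketch): opting in to StreamK for a basic
# GEMM means overriding the default functor above, e.g.
#
#   CreateGemmOperator(manifest, layouts, tile_descriptions, data_type,
#                      alignment_constraints,
#                      swizzling_functor = SwizzlingFunctor.StreamK)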

#
def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
  alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
  swizzling_functor = DefaultSwizzlingFunctor()):

  if complex_transforms is None:
    complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]

  element_a, element_b, element_c, element_epilogue = data_type

  operations = []

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  for layout in layouts:
    for tile_description in tile_descriptions:
      for alignment in alignment_constraints:
        for complex_transform in complex_transforms:

          # If alignment is a tuple or a list, then we have different alignments for A, B, and C
          alignment_a = alignment if isinstance(alignment, int) else alignment[0]
          alignment_b = alignment if isinstance(alignment, int) else alignment[1]
          alignment_c = min(8, alignment_a) if isinstance(alignment, int) else alignment[2]

          A = TensorDescription(element_a, layout[0], alignment_a, complex_transform[0])
          B = TensorDescription(element_b, layout[1], alignment_b, complex_transform[1])
          C = TensorDescription(element_c, layout[2], alignment_c)

          new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \
            tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations
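
# Illustrative sketch (assumed values, editorial): alignment_constraints
# entries may be plain ints or per-operand (A, B, C) triples, matching the
# isinstance checks above.
#
#   alignment_constraints = [8]          # A and B aligned to 8; C to min(8, 8)
#   alignment_constraints = [(8, 4, 8)]  # A aligned to 8, B to 4, C to 8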

# Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts
def CreateGemmUniversal3xOperator(
    manifest, layouts, tile_descriptions, data_types,
    schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]],
    complex_transforms=None,
    epilogue_functor=EpilogueFunctor.LinearCombination,
    swizzling_functor=SwizzlingFunctor.Identity1,
    tile_schedulers=[TileSchedulerType.Persistent]):

  if type(data_types) is dict:
    data_types = [data_types]

  for s in schedules:
    assert(len(s) == 2)

  if complex_transforms is None:
    complex_transforms = [(ComplexTransform.none, ComplexTransform.none), ]

  operations = []

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0]]

  combinations = product(layouts, tile_descriptions, data_types, complex_transforms, schedules, tile_schedulers)
  for layout, tile_description, data_type, complex_transform, schedule_pair, tile_scheduler in combinations:
    kernel_schedule, epilogue_schedule = schedule_pair
    A = TensorDescription(
        data_type["a_type"], layout[0][0], layout[0][1], complex_transform[0])
    B = TensorDescription(
        data_type["b_type"], layout[1][0], layout[1][1], complex_transform[1])

    C = TensorDescription(data_type["c_type"], layout[2][0], layout[2][1])
    D = TensorDescription(data_type["d_type"], layout[2][0], layout[2][1])

    gemm_op_extra_args = {}
    gemm_kind = GemmKind.Universal3x
    element_compute = data_type.get("epi_type", data_type["acc_type"])

    operation = GemmOperation(
        gemm_kind, tile_description.minimum_compute_capability,
        tile_description, A, B, C, element_compute, epilogue_functor, swizzling_functor, D,
        kernel_schedule, epilogue_schedule, tile_scheduler, **gemm_op_extra_args)

    manifest.append(operation)
    operations.append(operation)

  return operations
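
# Illustrative sketch (assumed type combination, editorial): the data_types
# argument above is one dict, or a list of dicts, keyed as follows. The
# DataType members are real; this particular combination is only an example.
#
#   example_data_types = {
#     "a_type":   DataType.f16,
#     "b_type":   DataType.f16,
#     "c_type":   DataType.f32,
#     "d_type":   DataType.f32,
#     "acc_type": DataType.f32,
#     "epi_type": DataType.f32,  # optional; defaults to acc_type
#   }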

#
def CreateSparseGemmOperator(manifest, layouts, tile_descriptions, data_type, \
  alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
  swizzling_functor = SwizzlingFunctor.Identity8):

  if complex_transforms is None:
    complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]

  element_a, element_b, element_c, element_epilogue = data_type

  gemm_kinds = [GemmKind.Sparse]

  operations = []

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  for layout in layouts:
    for tile_description in tile_descriptions:
      for alignment in alignment_constraints:
        for complex_transform in complex_transforms:

          alignment_c = min(8, alignment)

          A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
          B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
          C = TensorDescription(element_c, layout[2], alignment_c)

          new_operation = GemmOperation(GemmKind.Sparse, tile_description.minimum_compute_capability, \
            tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations

#
def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_type, \
  alignment_constraints, complex_transforms):

  if complex_transforms is None:
    complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]

  element_a, element_b, element_c, element_epilogue = data_type

  gemm_kinds = [GemmKind.PlanarComplex, GemmKind.PlanarComplexArray]

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  for gemm_kind in gemm_kinds:
    for layout in layouts:
      for tile_description in tile_descriptions:
        for alignment in alignment_constraints:
          for complex_transform in complex_transforms:

            alignment_c = min(8, alignment)

            A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
            B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
            C = TensorDescription(element_c, layout[2], alignment_c)

            manifest.append(GemmOperation(gemm_kind, \
              tile_description.minimum_compute_capability, \
              tile_description, A, B, C, element_epilogue))
  return

#
def CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, \
  alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
  swizzling_functor = SwizzlingFunctor.Identity8):

  if complex_transforms is None:
    complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]

  element_a, element_b, element_c, element_epilogue = data_type

  operations = []

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  for layout in layouts:
    for tile_description in tile_descriptions:
      for alignment in alignment_constraints:
        for complex_transform in complex_transforms:

          alignment_c = min(8, alignment)

          A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
          B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
          C = TensorDescription(element_c, layout[2], alignment_c)

          new_operation = GroupedGemmOperation(GemmKind.Grouped, tile_description.minimum_compute_capability, \
            tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations

#
def CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, data_type, \
  alignment_constraints, blas_mode, epilogue_functor = EpilogueFunctor.LinearCombination, \
  swizzling_functor = SwizzlingFunctor.Identity8):

  element_a, element_c, element_epilogue = data_type

  operations = []

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  for layout in layouts:
    for fill_mode in fill_modes:
      for tile_description in tile_descriptions:
        for alignment in alignment_constraints:

          # SYRK supported layouts (RowMajor, ColumnMajor) with no conjugation
          complex_transform = ComplexTransform.none

          # HERK supported layouts (RowMajor + conj, ColumnMajor)
          if blas_mode == BlasMode.hermitian and layout[0] == LayoutType.RowMajor:
            complex_transform = ComplexTransform.conj

          alignment_c = 1 # Alignment only applies to A in SYRK

          A = TensorDescription(element_a, layout[0], alignment, complex_transform)
          C = SymmetricTensorDescription(element_c, layout[1], fill_mode, alignment_c)

          # Rank-K update
          new_operation = RankKOperation(RankKKind.Universal, tile_description.minimum_compute_capability, \
            tile_description, A, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)

          manifest.append(new_operation)
          operations.append(new_operation)

          # Rank-2K update
          new_operation = Rank2KOperation(RankKKind.Universal, tile_description.minimum_compute_capability, \
            tile_description, A, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations

#
def CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, data_type, \
  alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
  swizzling_functor = SwizzlingFunctor.Identity8):

  if complex_transforms is None:
    complex_transforms = [(ComplexTransform.none),]

  element_a, element_b, element_c, element_epilogue = data_type

  operations = []

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  for layout in layouts:
    for side_mode in side_modes:
      for fill_mode in fill_modes:
        for diag_type in diag_types:
          for tile_description in tile_descriptions:
            for alignment in alignment_constraints:
              for complex_transform in complex_transforms:

                alignment_c = min(8, alignment)

                A = TriangularTensorDescription(element_a, layout[0], side_mode, fill_mode, diag_type,
                                                alignment, complex_transform)
                B = TensorDescription(element_b, layout[1], alignment)
                C = TensorDescription(element_c, layout[2], alignment_c)

                new_operation = TrmmOperation(TrmmKind.Universal, tile_description.minimum_compute_capability, \
                  tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)

                manifest.append(new_operation)
                operations.append(new_operation)

  return operations

#
def CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, data_type, \
  alignment_constraints, blas_mode, epilogue_functor = EpilogueFunctor.LinearCombination, \
  swizzling_functor = SwizzlingFunctor.Identity8):

  element_a, element_b, element_c, element_epilogue = data_type

  operations = []

  # by default, only generate the largest tile and largest alignment
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  for layout in layouts:
    for side_mode in side_modes:
      for fill_mode in fill_modes:
        for tile_description in tile_descriptions:
          for alignment in alignment_constraints:

            # SYMM supported layouts (RowMajor, ColumnMajor) with no conjugation
            complex_transform = ComplexTransform.none

            alignment_a = 1 # No vectorized access for the triangular matrix
            alignment_c = min(8, alignment)

            A = SymmetricTensorDescription(element_a, layout[0], fill_mode, alignment_a, complex_transform, side_mode)
            # tensor A and B have same data type and layout
            B = TensorDescription(element_b, layout[0], alignment)
            C = TensorDescription(element_c, layout[1], alignment_c)

            # SYMM/HEMM update
            new_operation = SymmOperation(SymmKind.Universal, tile_description.minimum_compute_capability, \
              tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)

            manifest.append(new_operation)
            operations.append(new_operation)

            # SYMM/HEMM update
            new_operation = SymmOperation(SymmKind.Universal, tile_description.minimum_compute_capability, \
              tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)

            manifest.append(new_operation)
            operations.append(new_operation)

  return operations

###########################################################################################################
#   ConvolutionOperator support variations
#        ____________________________________________________________________
#        ConvolutionalOperator |      Analytic          |    Optimized
#        ____________________________________________________________________
#        |       Fprop         |     (strided)          |    (strided)
#        |       Dgrad         |     (strided, unity*)  |    (strided, unity)
#        |       Wgrad         |     (strided)          |    (strided)
#        ____________________________________________________________________
#
# Note : Operators marked (*) are supported but not generated to keep the instantiated kernel count low
###########################################################################################################
# Convolution for 2D operations
def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment_constraints, \
  conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
  epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):

  element_a, element_b, element_c, element_epilogue = data_type

  # one exceptional case

  # iterator algorithm (analytic and optimized)
  iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]

  # by default, only generate the largest tile size, largest alignment, and optimized iterator
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]
    iterator_algorithms = [IteratorAlgorithm.Optimized]

  operations = []

  for tile in tile_descriptions:
    for alignment in alignment_constraints:

      alignment_c = min(8, alignment)

      A = TensorDescription(element_a, layout[0], alignment)
      B = TensorDescription(element_b, layout[1], alignment)
      C = TensorDescription(element_c, layout[2], alignment_c)

      swizzling_functor_ = swizzling_functor

      #
      # Conv2d Fprop
      #
      if ConvKind.Fprop in conv_kinds:

        # Strided support for Analytic and Optimized Fprop
        for iterator_algorithm in iterator_algorithms:
          new_operations = [
            # Non-grouped kernel
            Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
              A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_),
          ]

          # Instance group conv kernel
          if tile.math_instruction.opcode_class == OpcodeClass.TensorOp and A.layout == LayoutType.TensorNHWC and \
            tile.minimum_compute_capability >= 80:
            # SingleGroup kernel
            new_operations.append(Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
              A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_, group_mode=GroupMode.SingleGroup))

            # Analytic iterator supports MultipleGroup mode
            if iterator_algorithm == IteratorAlgorithm.Analytic:
              new_operations.append(Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
                A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_, group_mode=GroupMode.MultipleGroup))

          for new_operation in new_operations:
            manifest.append(new_operation)
            operations.append(new_operation)

      #
      # Conv2d Dgrad
      #
      if ConvKind.Dgrad in conv_kinds:

        # Unity stride for Analytic and Optimized Dgrad
        for iterator_algorithm in iterator_algorithms:
          new_operation = Conv2dOperation(ConvKind.Dgrad, iterator_algorithm, tile.minimum_compute_capability, tile,\
            A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor, swizzling_functor_)

          manifest.append(new_operation)
          operations.append(new_operation)

        # Strided support for Analytic Dgrad
        # strided dgrad uses a special threadblock swizzle
        # note that SwizzlingFunctor.StridedDgradHorizontal might be
        # better for problem sizes with large activation channel count
        swizzling_functor_strided_dgrad_ = SwizzlingFunctor.StridedDgradIdentity1

        if IteratorAlgorithm.Analytic in iterator_algorithms:
          new_operation = Conv2dOperation(ConvKind.Dgrad, IteratorAlgorithm.Analytic, tile.minimum_compute_capability, tile,\
            A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_strided_dgrad_)

          manifest.append(new_operation)
          operations.append(new_operation)

        # Strided support for Optimized Dgrad
        if IteratorAlgorithm.Optimized in iterator_algorithms:
          new_operation = Conv2dOperation(ConvKind.Dgrad, IteratorAlgorithm.Optimized, tile.minimum_compute_capability, tile,\
            A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_strided_dgrad_)

          manifest.append(new_operation)
          operations.append(new_operation)

      #
      # Conv2d Wgrad
      #
      if ConvKind.Wgrad in conv_kinds:

        # Strided support for Analytic and Optimized Wgrad
        for iterator_algorithm in iterator_algorithms:
          new_operation = Conv2dOperation(ConvKind.Wgrad, iterator_algorithm, tile.minimum_compute_capability, tile,\
            A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations

# Convolution for 2D operations specialized for fixed channel counts
def CreateConv2dFixedChannelsOperator(manifest, layout, tile_descriptions, data_type, channel_counts, \
  conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
  epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):

  element_a, element_b, element_c, element_epilogue = data_type

  # one exceptional case

  # iterator algorithm (fixed channels)
  iterator_algorithms = [IteratorAlgorithm.FixedChannels,]

  # by default, only generate the largest tile size, largest alignment, and optimized iterator
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    channel_counts = [channel_counts[0],]

  operations = []

  for tile in tile_descriptions:
    for channel_count in channel_counts:

      alignment_c = EpilogueAlignment(channel_count, tile)

      A = TensorDescription(element_a, layout[0], channel_count)
      B = TensorDescription(element_b, layout[1], channel_count)
      C = TensorDescription(element_c, layout[2], alignment_c)

      swizzling_functor_ = swizzling_functor

      #
      # Conv2d Fprop
      #
      if ConvKind.Fprop in conv_kinds:

        # Strided support for the FixedChannels Fprop iterator
        for iterator_algorithm in iterator_algorithms:
          new_operation = Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
            A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations

# Convolution for 2D operations specialized for few channels
def CreateConv2dFewChannelsOperator(manifest, layout, tile_descriptions, data_type, channel_counts, \
  conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
  epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):

  element_a, element_b, element_c, element_epilogue = data_type

  # one exceptional case

  # iterator algorithm (few channels)
  iterator_algorithms = [IteratorAlgorithm.FewChannels,]

  # by default, only generate the largest tile size, largest alignment, and optimized iterator
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    channel_counts = [channel_counts[0],]

  operations = []

  for tile in tile_descriptions:
    for channel_count in channel_counts:

      alignment_c = EpilogueAlignment(channel_count, tile)

      A = TensorDescription(element_a, layout[0], channel_count)
      B = TensorDescription(element_b, layout[1], channel_count)
      C = TensorDescription(element_c, layout[2], alignment_c)

      swizzling_functor_ = swizzling_functor

      #
      # Conv2d Fprop
      #
      if ConvKind.Fprop in conv_kinds:

        # Strided support for the FewChannels Fprop iterator
        for iterator_algorithm in iterator_algorithms:
          new_operation = Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
            A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations

# Convolution for 3D operations
def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignment, \
  conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination):

  element_a, element_b, element_c, element_epilogue = data_type

  # one exceptional case
  alignment_c = min(8, alignment)

  # iterator algorithm (analytic and optimized)
  iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]

  # by default, only generate the largest tile size and optimized iterators
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    iterator_algorithms = [IteratorAlgorithm.Optimized]

  operations = []

  # All tile sizes for Conv3dFprop and Conv3dWgrad
  for tile in tile_descriptions:
    A = TensorDescription(element_a, layout, alignment)
    B = TensorDescription(element_b, layout, alignment)
    C = TensorDescription(element_c, layout, alignment_c)

    #
    # Conv3d Fprop
    #
    if ConvKind.Fprop in conv_kinds:
      # Strided support for Analytic and Optimized Fprop
      for iterator_algorithm in iterator_algorithms:
        new_operation = Conv3dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
          A, B, C, element_epilogue, StrideSupport.Strided)
        manifest.append(new_operation)
        operations.append(new_operation)
    #
    # Conv3d Wgrad
    #
    if ConvKind.Wgrad in conv_kinds:

      # Strided support for Analytic and Optimized Wgrad
      for iterator_algorithm in iterator_algorithms:
        new_operation = Conv3dOperation(ConvKind.Wgrad, iterator_algorithm, tile.minimum_compute_capability, tile,\
          A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)
        manifest.append(new_operation)
        operations.append(new_operation)

  # All tile sizes for Conv3dDgrad
  for tile in tile_descriptions:

    A = TensorDescription(element_a, layout, alignment)
    B = TensorDescription(element_b, layout, alignment)
    C = TensorDescription(element_c, layout, alignment_c)

    #
    # Conv3d Dgrad
    #
    if ConvKind.Dgrad in conv_kinds:
      # Unity stride for Optimized Dgrad
      new_operation = Conv3dOperation(ConvKind.Dgrad, IteratorAlgorithm.Optimized, tile.minimum_compute_capability, tile,\
        A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor)

      manifest.append(new_operation)
      operations.append(new_operation)

      # Strided support for Analytic Dgrad
      # Conv3dDgrad has a naive strided support which does not cut down redundant MMAs
      new_operation = Conv3dOperation(ConvKind.Dgrad, IteratorAlgorithm.Analytic, tile.minimum_compute_capability, tile,\
        A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)

      manifest.append(new_operation)
      operations.append(new_operation)

  return operations

# Convolution for Depthwise 2d conv
def CreateDepthwiseConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment_constraints, \
  conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
  epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):

  element_a, element_b, element_c, element_epilogue = data_type

  # iterator algorithm (FixedStrideDilation, Optimized)
  iterator_algorithms = [IteratorAlgorithm.FixedStrideDilation, IteratorAlgorithm.Optimized]

  # by default, only generate the largest tile size, largest alignment, and optimized iterator
  if manifest.kernel_filter == '':
    tile_descriptions = [tile_descriptions[0],]
    alignment_constraints = [alignment_constraints[0],]

  operations = []

  for tile in tile_descriptions:
    for alignment in alignment_constraints:

      alignment_c = min(8, alignment)

      A = TensorDescription(element_a, layout[0], alignment)
      B = TensorDescription(element_b, layout[1], alignment)
      C = TensorDescription(element_c, layout[2], alignment_c)

      swizzling_functor_ = swizzling_functor

      if ConvKind.Fprop in conv_kinds:

        # Strided support for Optimized and FixedStrideDilation Depthwise Conv
        for iterator_algorithm in iterator_algorithms:
          stride_support = StrideSupport.Strided
          if iterator_algorithm == IteratorAlgorithm.FixedStrideDilation:
            if tile.stride == [-1, -1] or tile.dilation == [-1,-1]:
              continue
            stride_support = StrideSupport.Fixed

          if iterator_algorithm == IteratorAlgorithm.Optimized:
            if tile.stride != [-1, -1] or tile.dilation != [-1,-1]:
              continue
          new_operation = Conv2dOperation(ConvKind.Fprop,
                                          iterator_algorithm,
                                          tile.minimum_compute_capability,
                                          tile,
                                          A, B, C,
                                          element_epilogue,
                                          stride_support,
                                          epilogue_functor,
                                          swizzling_functor_,
                                          group_mode=GroupMode.Depthwise)

          manifest.append(new_operation)
          operations.append(new_operation)

  return operations

class ConvOperation3x:
  """All parameters of a CUTLASS 3 convolution operation.

  Unlike CUTLASS 2 convolutions, CUTLASS 3 convolutions do not
  distinguish between 2-D and 3-D convolutions by kernel class name.
  Instead, for CUTLASS 3 convolutions, the tensor layouts encode
  whether the convolution is 2-D or 3-D. Thus, this class deduces
  the OperationKind (either Conv2d or Conv3d) from the layouts,
  rather than taking it as a constructor parameter.
  """
  def __init__(self,
               conv_kind: ConvKind,
               tile_description: TileDescription,
               A: TensorDescription,
               B: TensorDescription,
               C: TensorDescription,
               element_compute: Optional[DataType] = None,
               D: Optional[TensorDescription] = None,
               kernel_schedule: KernelScheduleType = KernelScheduleType.ScheduleAuto,
               epilogue_schedule: EpilogueScheduleType = EpilogueScheduleType.ScheduleAuto,
               tile_scheduler: TileSchedulerType = TileSchedulerType.Default,
               log_indent_level: int = 1):
    log_debug_line(f'ConvOperation3x::init: conv_kind: {conv_kind}', log_indent_level)
    log_indent_level = log_indent_level + 1

    self.conv_kind = conv_kind
    self.tile_description = tile_description
    self.A = A
    self.B = B
    self.C = C
    self.element_compute = C.element if element_compute is None else element_compute
    self.kernel_schedule = kernel_schedule
    self.epilogue_schedule = epilogue_schedule

    self.arch = tile_description.minimum_compute_capability
    self.tile_scheduler = tile_scheduler
    if D is None:
      self.D = C
    else:
      self.D = D

    self.is_3x = True
    self.group_mode = GroupMode.NoneGroup # CUTLASS 3 convolutions currently aren't grouped

    operation_kind = None
    for layout in (A.layout, B.layout, C.layout):
      assert(isinstance(layout, LayoutType))
      new_operation_kind = convolution_tensor_layout_type_to_operation_kind(layout)
      if operation_kind is None:
        operation_kind = new_operation_kind
      else: # CUTLASS 3 convolutions don't permit mixing 2-D and 3-D layouts.
        assert(operation_kind == new_operation_kind)
    assert(operation_kind is not None)
    self.operation_kind = operation_kind

  def __str__(self):
    return f"ConvOperation3x: operation_kind={self.operation_kind}, conv_kind={self.conv_kind}, tile_description={self.tile_description}"

  def is_complex(self):
    complex_operators = [
      MathOperation.multiply_add_complex,
      MathOperation.multiply_add_complex_gaussian,
      MathOperation.multiply_add_complex_fast_f32
    ]
    return self.tile_description.math_instruction.math_operation in complex_operators

  def is_mixed_input(self):
    return self.A.element != self.B.element

  def accumulator_type(self):
    accum = self.tile_description.math_instruction.element_accumulator
    if self.is_complex():
      return get_complex_from_real(accum)
    return accum

  def short_math_name(self):
    prefix = ''
    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
      prefix = 'g'
    return prefix + ShortDataTypeNames[self.accumulator_type()]

  def is_tensor_op(self):
    tensor_ops = [
      OpcodeClass.TensorOp,
      OpcodeClass.WmmaTensorOp
    ]
    return self.tile_description.math_instruction.opcode_class in tensor_ops

  def instruction_shape_string(self):
    math_operations_map = {
      MathOperation.xor_popc: 'xor',
      MathOperation.and_popc: 'and'
    }
    if self.is_tensor_op():
      is0, is1, is2 = self.tile_description.math_instruction.instruction_shape
      math_op = self.tile_description.math_instruction.math_operation
      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
      return f"{is0}x{is1}x{is2}{math_op_string}"
    else:
      return ''

  def intermediate_type_string(self):
    '''
    Name of the distinct intermediate type used by the tensor operation,
    or the empty string if none.

    Tensor ops (opcode_class *TensorOp) may use an intermediate data type
    that differs from the element type of A or the accumulator type.
    '''
    if not self.is_tensor_op():
      return ''
    elif self.tile_description.math_instruction.element_a == self.A.element:
      return ''
    elif self.tile_description.math_instruction.element_a == self.tile_description.math_instruction.element_accumulator:
      return ''
    else:
      return DataTypeNames[self.tile_description.math_instruction.element_a]

  def core_name(self):
    inst_shape = self.instruction_shape_string()
    intermediate_type = self.intermediate_type_string()
    conv_kind_name = ConvKindNames[self.conv_kind]
    return f"{self.short_math_name()}{inst_shape}{intermediate_type}{conv_kind_name}"

  def layout_names(self):
    '''Layout strings for A and B, respectively'''
    if self.is_complex():
      return (ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)],
              ShortComplexLayoutNames[(self.B.layout, self.B.complex_transform)])
    else:
      return (ShortLayoutTypeNames[self.A.layout],
              ShortLayoutTypeNames[self.B.layout])

  def extended_name(self):
    core_name = self.core_name()
    element_a = DataTypeNames[self.A.element]
    element_b = DataTypeNames[self.B.element]
    element_acc = DataTypeNames[self.tile_description.math_instruction.element_accumulator]
    element_c = DataTypeNames[self.C.element]
    element_d = DataTypeNames[self.D.element]
    layout_a, layout_b = self.layout_names()
    return f"{core_name}_{element_a}{layout_a}_{element_b}{layout_b}_{element_acc}_{element_c}_{element_d}"

  def configuration_name(self):
    prefix = 'cutlass3x'
    arch = self.arch
    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
    tbm = self.tile_description.tile_shape[0]
    tbn = self.tile_description.tile_shape[1]
    tbk = self.tile_description.tile_shape[2]
    cm = self.tile_description.cluster_shape[0]
    cn = self.tile_description.cluster_shape[1]
    ck = self.tile_description.cluster_shape[2]
    alignment = max(self.A.alignment, self.B.alignment)
    tile_scheduler = TileSchedulerSuffixes[self.tile_scheduler]
    kernel_schedule = KernelScheduleSuffixes[self.kernel_schedule]
    epilogue_schedule = EpilogueScheduleSuffixes[self.epilogue_schedule]

    return f"{prefix}_sm{arch}_{opcode_class_name}_{self.extended_name()}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{self.tile_description.stages}_align{alignment}{tile_scheduler}{kernel_schedule}{epilogue_schedule}"

  def procedural_name(self):
    return self.configuration_name()

def convolution_tensor_layout_type_to_operation_kind(layout: LayoutType) -> OperationKind:
  if layout == LayoutType.TensorNHWC or layout == LayoutType.TensorKCSR:
    return OperationKind.Conv2d
  elif layout == LayoutType.TensorNDHWC or layout == LayoutType.TensorKCSRT:
    return OperationKind.Conv3d
  else:
    raise RuntimeError(f'LayoutType {layout} does not have a corresponding OperationKind')
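
# Illustrative sketch (not part of the original file): the layout-to-kind
# mapping above in use. The helper is only defined, never called.
def _example_layout_to_operation_kind():
  """Hedged usage example for convolution_tensor_layout_type_to_operation_kind."""
  assert convolution_tensor_layout_type_to_operation_kind(LayoutType.TensorNHWC) == OperationKind.Conv2d
  assert convolution_tensor_layout_type_to_operation_kind(LayoutType.TensorKCSRT) == OperationKind.Conv3d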

def CreateConvOperator3x(manifest: Manifest,
                         dims_and_alignments: Sequence[Tuple[Tuple[int, int], Tuple[int, int], Tuple[int, int]]],
                         tile_descriptions: Sequence[Sequence[TileDescription]],
                         data_types,
                         schedule_pairs: Sequence[Tuple[KernelScheduleType, KernelScheduleType]] = \
                           [(KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto)],
                         complex_transforms: Optional[Sequence[ComplexTransform]] = None,
                         tile_schedulers: Sequence[TileSchedulerType] = [TileSchedulerType.Persistent],
                         conv_kind: ConvKind = ConvKind.Fprop,
                         log_indent_level: int = 1):
  """
  Create zero or more CUTLASS 3 convolution operators.

  Create a CUTLASS 3 convolution operator
  for all feasible combinations of the input parameters.
  Add the operators to the manifest.

  dims_and_alignments: 3-level list. Each outer list term is a list [A, B, C].
    Each inner list (A, B, or C) has the form [num_spatial_dimensions, alignment].
    Both are integers; the first is the number of spatial dimensions
    (currently, only 2 or 3 are supported), and the second is the byte alignment.
    We deduce the operation_kind (either OperationKind.Conv2d or OperationKind.Conv3d)
    from num_spatial_dimensions.

  This function doesn't take layouts, unlike the GEMM functions.
  CUTLASS 3 convolutions currently support three input layouts:

  * TensorNWC for 1-D convolutions,
  * TensorNHWC for 2-D convolutions, and
  * TensorNDHWC for 3-D convolutions.

  Output (C and D) layouts are the same as input layouts,
  except for Wgrad convolutions, where the layouts are

  * TensorKCS for 1-D convolutions,
  * TensorKCSR for 2-D convolutions, and
  * TensorKCSRT for 3-D convolutions.

  The output layouts are completely constrained by the input layouts
  and the convolution kind.

  tile_descriptions: 2-level list.
    Outer level has one list per math instruction.
    Inner level has one TileDescription for each cluster shape.

  data_types: Either a single data_type dictionary, or a list of them.
    Keys: 'a_type', 'b_type', 'c_type', 'd_type', 'acc_type', 'epi_type'

  complex_transforms: Optional list of pairs.
    First element of each pair is the complex transform for A, and
    second element of each pair is the complex transform for B.

  schedule_pairs: [(kernel_schedule, epilogue_schedule), ...]

  conv_kind: Convolution kind (Fprop, Dgrad, or Wgrad).
  """
  log_debug_line('CreateConvOperator3x', log_indent_level)
  log_indent_level = log_indent_level + 1
  log_debug_line(f'conv_kind: {conv_kind}', log_indent_level)

  for triple in dims_and_alignments:
    spatial_dimensionality = None # to be determined by loop below
    assert(len(triple) == 3)
    for entry in triple: # [A, B, C]
      assert(len(entry) == 2)
      [dim, alignment] = entry
      assert(type(dim) is int)
      assert(dim == 2 or dim == 3)
      assert(type(alignment) is int)
      assert(alignment > 0)
      if spatial_dimensionality is None:
        spatial_dimensionality = dim
      else:
        # A, B, and C need to have the same spatial dimensionality
        assert(spatial_dimensionality == dim)

  def input_and_output_layouts(spatial_dim: int, kind: ConvKind) -> Tuple[LayoutType, LayoutType]:
    if spatial_dim == 1:
      input_layout = LayoutType.TensorNWC
      if kind == ConvKind.Wgrad:
        output_layout = LayoutType.TensorKCS
      else:
        output_layout = input_layout
    elif spatial_dim == 2:
      input_layout = LayoutType.TensorNHWC
      if kind == ConvKind.Wgrad:
        output_layout = LayoutType.TensorKCSR
      else:
        output_layout = input_layout
    elif spatial_dim == 3:
      input_layout = LayoutType.TensorNDHWC
      if kind == ConvKind.Wgrad:
        output_layout = LayoutType.TensorKCSRT
      else:
        output_layout = input_layout
    else:
      assert(False)
    return (input_layout, output_layout)

  def dims_to_layouts(A_B_C: Tuple[Tuple[int, int], Tuple[int, int], Tuple[int, int]]) -> \
      Tuple[Tuple[LayoutType, int], Tuple[LayoutType, int], Tuple[LayoutType, int]]:
    [A, B, C] = A_B_C
    [spatial_dim, alignment] = A
    [input_layout, output_layout] = input_and_output_layouts(spatial_dim, conv_kind)
    return ((input_layout, A[1]),
            (input_layout, B[1]),
            (output_layout, C[1]))

  # layouts: list of triples (A, B, C).
  # Each of A, B, and C has the form [layout, alignment].
  layouts = [dims_to_layouts(A_B_C) for A_B_C in dims_and_alignments]

  if type(data_types) is dict:
    data_types = [data_types]

  for s in schedule_pairs:
    assert(len(s) == 2)

  if complex_transforms is None:
    complex_transforms = [(ComplexTransform.none, ComplexTransform.none)]

  # product produces a one-pass generator, so the loop must call it anew each time.
  def make_combinations():
    return product(
      layouts,
      tile_descriptions,
      data_types,
      complex_transforms,
      schedule_pairs,
      tile_schedulers
    )

  operations = []
  for layout_triple, tile_description, data_type, complex_transform_pair, schedule_pair, tile_scheduler in make_combinations():
    A_layout, A_alignment = layout_triple[0]
    A_xform = complex_transform_pair[0]
    B_layout, B_alignment = layout_triple[1]
    B_xform = complex_transform_pair[1]
    C_layout, C_alignment = layout_triple[2]
    D_layout = C_layout
    D_alignment = C_alignment

    A = TensorDescription(data_type["a_type"], A_layout, A_alignment, A_xform)
    B = TensorDescription(data_type["b_type"], B_layout, B_alignment, B_xform)
    C = TensorDescription(data_type["c_type"], C_layout, C_alignment)
    D = TensorDescription(data_type["d_type"], D_layout, D_alignment)
    element_compute = data_type.get("epi_type", data_type["acc_type"])
    kernel_schedule, epilogue_schedule = schedule_pair

    operation = ConvOperation3x(conv_kind=conv_kind,
                                tile_description=tile_description,
                                A=A,
                                B=B,
                                C=C,
                                element_compute=element_compute,
                                D=D,
                                kernel_schedule=kernel_schedule,
                                epilogue_schedule=epilogue_schedule,
                                tile_scheduler=tile_scheduler,
                                log_indent_level=log_indent_level)
    log_debug_line(f'Created ConvOperation3x: {str(operation)}', log_indent_level)
    manifest.append(operation)
    operations.append(operation)

  return operations
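
# Illustrative call shape (assumed values, editorial): dims_and_alignments
# below requests 2-D convolutions with alignment 8 for A, B, and C;
# tile_descriptions and data_types stand in for values a real generator
# function would build from TileDescription objects and DataType dicts.
#
#   dims_and_alignments = [
#     ((2, 8), (2, 8), (2, 8)),  # [A, B, C], each [spatial_dims, alignment]
#   ]
#   CreateConvOperator3x(manifest, dims_and_alignments, tile_descriptions,
#                        data_types, conv_kind = ConvKind.Fprop)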

###################################################################################################
###################################################################################################

#
def GenerateSM50_Simt(manifest, cuda_version):
  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [1, 1, 1], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add),
    MathInstruction( \
      [1, 1, 1], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add),
  ]

  min_cc = 50
  max_cc = 1024

  alignment_constraints = [1,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    if math_inst.element_a == DataType.f32:
      conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
      CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#

#
def GenerateSM50_Simt_complex(manifest, cuda_version):
  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [1, 1, 1], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add_complex),
  ]

  min_cc = 50
  max_cc = 1024

  alignment_constraints = [1,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128,  64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      DataType.cf32,
      DataType.cf32,
      DataType.cf32,
      DataType.cf32,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#

#
def GenerateSM50(manifest, cuda_version):
  GenerateSM50_Simt(manifest, cuda_version)
  GenerateSM50_Simt_complex(manifest, cuda_version)

###################################################################################################
###################################################################################################

#
def GenerateSM60_Simt(manifest, cuda_version):
  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [1, 1, 1], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add),
  ]

  min_cc = 60
  max_cc = 1024

  alignment_constraints = [1,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)
#
def GenerateSM60_Simt_DepthwiseConv2d(manifest, cuda_version):

  math_instructions = [
    MathInstruction( \
      [1, 1, 1], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add),
  ]

  min_cc = 60
  max_cc = 1024

  alignment_constraints = [8,]

  filter_3x3 = [3, 3]
  filter_5x5 = [5, 5]

  # [stride_h, stride_w]
  # [-1, -1] means all stride sizes.
  strides = [[-1, -1], [1, 1], [2, 2]]
  # [dilation_h, dilation_w]
  # [-1, -1] means all dilation sizes.
  dilations = [[-1, -1], [1, 1], [2, 2]]

  # groups per thread block
  g16 = 16
  g32 = 32
  g64 = 64

  # output shape per thread block
  npq_1x4x4 = [1, 4, 4]
  npq_1x8x8 = [1, 8, 8]
  npq_1x10x10 = [1, 10, 10]

  tile_descriptions = []
  for math_inst in math_instructions:
    for stride, dilation in product(strides, dilations):
      tile_descriptions.extend([
        # filter3x3: ThreadBlock_output, filter, stages, stride, dilation, warp
        Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8 + [g32], filter_3x3, 3, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8 + [g64], filter_3x3, 3, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8 + [g16], filter_3x3, 3, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),

        Direct2dConvFixedStrideDilationTileDescription(npq_1x10x10 + [g64], filter_3x3, 2, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),

        Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4 + [g32], filter_3x3, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4 + [g64], filter_3x3, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4 + [g16], filter_3x3, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),

        # filter5x5: ThreadBlock_output, filter, stages, stride, dilation, warp
        Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8 + [g32], filter_5x5, 3, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8 + [g64], filter_5x5, 3, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x8x8 + [g16], filter_5x5, 3, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),

        Direct2dConvFixedStrideDilationTileDescription(npq_1x10x10 + [g64], filter_5x5, 2, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),

        Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4 + [g32], filter_5x5, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4 + [g64], filter_5x5, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc),
        Direct2dConvFixedStrideDilationTileDescription(npq_1x4x4 + [g16], filter_5x5, 4, stride, dilation, [4, 1, 1], math_inst, min_cc, max_cc)
      ])

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateDepthwiseConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#

#
def GenerateSM60(manifest, cuda_version):
  GenerateSM60_Simt(manifest, cuda_version)
  GenerateSM60_Simt_DepthwiseConv2d(manifest, cuda_version)

###################################################################################################
###################################################################################################

#
def GenerateSM61_Simt(manifest, cuda_version):
  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [1, 1, 4], \
      DataType.s8, DataType.s8, DataType.s32, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add),
  ]

  min_cc = 61
  max_cc = 1024

  alignment_constraints = [1,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]
    data_type_mixed = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_a,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
#
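
# GenerateSM61_Simt above targets SM61's 4-element integer dot product (dp4a):
# the [1, 1, 4] instruction shape is one s8 dot product of length four
# accumulated into s32 per SIMT lane. A hedged scalar sketch of one such step:
#
#   a = [1, -2, 3, 4]; b = [5, 6, -7, 8]
#   sum(x * y for x, y in zip(a, b))   # -> 4, one dp4a-style update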

#
def GenerateSM61(manifest, cuda_version):
  GenerateSM61_Simt(manifest, cuda_version)

###################################################################################################
###################################################################################################

#
def GenerateSM70_TensorOp_884(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 1):
    return
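
  # The guard above is the standard toolkit gate used throughout these
  # generators: Volta's 8x8x4 Tensor Core mma requires CUDA 10.1 or newer.
  # A hedged sketch of the check's semantics (the parsing itself lives in
  # CudaToolkitVersionSatisfies):
  #
  #   CudaToolkitVersionSatisfies("11.4", 10, 1)   # -> True:  11.4 >= 10.1
  #   CudaToolkitVersionSatisfies("10.0", 10, 1)   # -> False: kernels skipped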

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [8, 8, 4], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [8, 8, 4], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 70
  max_cc = 75

  alignment_constraints = [8, 4, 2, 1]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints)

      CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)

#
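
# Note on the accumulator-type guard used above (and repeated in the generators
# that follow): when an instruction accumulates in its input type, data_type and
# data_type_mixed coincide and the second pass would emit duplicate kernels.
# Illustrative comparison (a sketch, not executed here):
#
#   DataType.f16 != DataType.f32   # -> True:  also emit the f16-output variant
#   DataType.f16 != DataType.f16   # -> False: skip; it would duplicate data_type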

def GenerateSM70_PlanarComplexTensorOp_884(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 1):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]
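
  # Each pair applies an optional conjugation to (A, B); e.g. the second entry
  # computes conj(A) * B. The four pairs enumerate every conjugation
  # combination a planar-complex GEMM may request.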

  math_instructions = [
    MathInstruction( \
      [8, 8, 4], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [8, 8, 4], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 70
  max_cc = 75

  alignment_constraints = [8, 2, 1]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, complex_transforms)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, complex_transforms)

#
def GenerateSM70_WmmaTensorOp_161616(manifest, cuda_version):

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 16, 16], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.WmmaTensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 16, 16], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.WmmaTensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 70
  max_cc = 1024

  alignment_constraints = [8,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints)

#
###################################################################################################
#

def GenerateSM70(manifest, cuda_version):
  GenerateSM70_TensorOp_884(manifest, cuda_version)
  GenerateSM70_PlanarComplexTensorOp_884(manifest, cuda_version)

  # To limit build size, WMMA GEMMs are disabled for now.
  #
  #GenerateSM70_WmmaTensorOp_161616(manifest, cuda_version)

###################################################################################################
###################################################################################################

#
def GenerateSM75_TensorOp_1688_FewChannels(manifest, cuda_version, math_inst):

  min_cc = 75
  max_cc = 1024

  tile_descriptions = [
    TileDescription([128, 64, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 128, 64], 2, [2, 2, 2], math_inst, min_cc, max_cc),
  ]

  data_type = [
    math_inst.element_a,
    math_inst.element_b,
    math_inst.element_accumulator,
    math_inst.element_accumulator,
  ]

  conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)

  CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [4, 8])
  CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [1, 2, 4])

  # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
  if math_inst.element_a != math_inst.element_accumulator:

    data_type_mixed = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_a,
      math_inst.element_accumulator,
    ]

    CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [4, 8])
    CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [1, 2, 4])
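
# A hedged note on the two factories used above: CreateConv2dFixedChannelsOperator
# receives [4, 8] and CreateConv2dFewChannelsOperator receives [1, 2, 4] in the
# alignment position; for these small-channel-count fprop specializations the
# alignment is tied to the input channel count being targeted. The exact
# dispatch semantics live in the operator factories, not in this generator.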

#
def GenerateSM75_TensorOp_1688(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 75
  max_cc = 1024

  alignment_constraints = [8, 4, 2, 1]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64], 2, [1, 2, 2], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)

    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints)

      CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)

    # Separate generator for 'few channels' specializations
    GenerateSM75_TensorOp_1688_FewChannels(manifest, cuda_version, math_inst)

#

#
def GenerateSM75_PlanarComplexTensorOp_1688(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 75
  max_cc = 1024

  alignment_constraints = [8, 2, 1]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([ 64, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, complex_transforms)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, complex_transforms)

#
def GenerateSM75_TensorOp_8816_TN(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [8, 8, 16], \
      DataType.s8, DataType.s8, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
    MathInstruction( \
      [8, 8, 16], \
      DataType.u8, DataType.u8, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
  ]

  min_cc = 75
  max_cc = 90

  alignment_constraints = [16,]
  alignment_constraints_small_channels = [16, 8, 4]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 32, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 32, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),

      TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      DataType.s32,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        DataType.f32,
      ]

      operations = []

      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

      operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
        data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

      operations += CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions,
        data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

      operations += CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions,
        data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

      for op in operations:
        if op.tile_description.threadblock_shape[1] >= 128:
          op.C.alignment = 16
        else:
          op.C.alignment = 8

#
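
# The C-alignment fixup above keeps epilogue stores wide for the 8-bit output
# path: with larger-N threadblocks, 16 s8 elements form one 128-bit access and
# 8 elements form a 64-bit access. Worked check (a sketch; DataTypeSize is the
# bits-per-element table used later in this module):
#
#   16 * DataTypeSize[DataType.s8]   # -> 128 bits per vectorized store
#    8 * DataTypeSize[DataType.s8]   # ->  64 bits per vectorized store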

#
def GenerateSM75_TensorOp_8816_Interleaved(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
    return

  layouts = [
    (LayoutType.ColumnMajorInterleaved32, LayoutType.RowMajorInterleaved32, LayoutType.ColumnMajorInterleaved32),
  ]
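
  # A hedged note on the interleaved layouts: ColumnMajorInterleaved32 (and the
  # TensorNC32HW32 / TensorC32RSK32 conv layouts paired with it below) keep 32
  # elements of the interleaved dimension, i.e. 32 channels, contiguous in
  # memory, so the int8 kernels can issue full-width vectorized accesses.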

  math_instructions = [
    MathInstruction( \
      [8, 8, 16], \
      DataType.s8, DataType.s8, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
    MathInstruction( \
      [8, 8, 16], \
      DataType.u8, DataType.u8, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
  ]

  min_cc = 75
  max_cc = 90

  alignment_constraints = [16,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type_mixed = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_a,
      DataType.f32,
    ]

    operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

    conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32)

    operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

    for op in operations:
      op.C.alignment = 8
#

#
def GenerateSM75_TensorOp_8832_TN(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [8, 8, 32], \
      DataType.s4, DataType.s4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
    MathInstruction( \
      [8, 8, 32], \
      DataType.u4, DataType.u4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
  ]

  min_cc = 75
  max_cc = 89

  alignment_constraints = [32,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 128], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 128], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      DataType.s32,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        DataType.f32,
      ]

      operations = []

      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

      operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
        data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

      for op in operations:
        if op.tile_description.threadblock_shape[1] >= 128:
          op.C.alignment = 16
        else:
          op.C.alignment = 8

#

#
def GenerateSM75_TensorOp_8832_Interleaved(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
    return

  layouts = [
    (LayoutType.ColumnMajorInterleaved64, LayoutType.RowMajorInterleaved64, LayoutType.ColumnMajorInterleaved64),
  ]

  math_instructions = [
    MathInstruction( \
      [8, 8, 32], \
      DataType.s4, DataType.s4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
    MathInstruction( \
      [8, 8, 32], \
      DataType.u4, DataType.u4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
  ]

  min_cc = 75
  max_cc = 89

  alignment_constraints = [32,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 128], 2, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 128], 2, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        DataType.f32,
      ]

      operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

      conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64)

      operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
        data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

      for op in operations:
        op.C.alignment = 16
#

#
def GenerateSM75_TensorOp_88128(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [8, 8, 128], \
      DataType.b1, DataType.b1, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.xor_popc),
  ]

  min_cc = 75
  max_cc = {
    MathOperation.xor_popc: 89,
    MathOperation.and_popc: 90
  }

  alignment_constraints = [128,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 512], 2, [4, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 256, 512], 2, [2, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 128, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 256, 512], 2, [1, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([256, 64, 512], 2, [4, 1, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 128, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 64, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 64, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
    ]

    data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

#
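
# The b1 kernels above implement a binary GEMM: with MathOperation.xor_popc
# each multiply-accumulate step is popcount(a XOR b) accumulated into s32
# (and_popc, the other key in the max_cc table, substitutes AND for XOR).
# A hedged scalar sketch of one such step:
#
#   a_bits, b_bits = 0b1011, 0b0110
#   bin(a_bits ^ b_bits).count('1')   # -> 3, this step's contribution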

#
def GenerateSM75_WmmaTensorOp_161616(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 10, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 16, 16], \
      DataType.s8, DataType.s8, DataType.s32, \
      OpcodeClass.WmmaTensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 75
  max_cc = 1024

  alignment_constraints = [16,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      DataType.f32,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        DataType.f32,
      ]

      CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints)
#

#
def GenerateSM75_Simt_complex(manifest, cuda_version):
  math_instructions = [
    MathInstruction( \
      [1, 1, 1], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add_complex),
  ]

  min_cc = 75
  max_cc = 1024

  alignment_constraints = [1,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc)
    ]
    data_type = [
      DataType.cf32,
      DataType.cf32,
      DataType.cf32,
      DataType.cf32
    ]

    complex_transforms = [
      (ComplexTransform.none, ComplexTransform.none),
      (ComplexTransform.conj, ComplexTransform.none),
      (ComplexTransform.none, ComplexTransform.conj),
      (ComplexTransform.conj, ComplexTransform.conj)
    ]

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#

def GenerateSM75(manifest, cuda_version):
  GenerateSM75_TensorOp_1688(manifest, cuda_version)
  GenerateSM75_PlanarComplexTensorOp_1688(manifest, cuda_version)
  GenerateSM75_TensorOp_8816_TN(manifest, cuda_version)
  GenerateSM75_TensorOp_8816_Interleaved(manifest, cuda_version)
  GenerateSM75_TensorOp_8832_TN(manifest, cuda_version)
  GenerateSM75_TensorOp_8832_Interleaved(manifest, cuda_version)
  GenerateSM75_TensorOp_88128(manifest, cuda_version)
  #GenerateSM75_WmmaTensorOp_161616(manifest, cuda_version)
  GenerateSM75_Simt_complex(manifest, cuda_version)
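
# Dispatch sketch (hedged; Manifest construction happens in this module's entry
# points, not shown here):
#
#   GenerateSM75(manifest, cuda_version)
#
# Each sub-generator re-checks the toolkit version itself, so a single call
# emits only the kernels the resolved CUDA toolkit can compile.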

###################################################################################################
###################################################################################################

#
def GenerateSM80_TensorOp_16816(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 16], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 16], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 16], \
      DataType.bf16, DataType.bf16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [8, 4, 2]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 3, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
    CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [4, 8])
    CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type, 8)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints)

      CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
      CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [4, 8])
      CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type_mixed, 8)
#

#
def GenerateSM80_SparseTensorOp_16832(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 32], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 32], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 32], \
      DataType.bf16, DataType.bf16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [8]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints)

#
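
# Relative to the dense GenerateSM80_TensorOp_16816 shape, the sparse tensor op
# doubles the instruction K extent (16x8x32 vs. 16x8x16). This is consistent
# with 2:4 structured sparsity, in which only two of every four elements of A
# are stored, so each instruction covers twice the logical K.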

#
def GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 16], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 16], \
      DataType.bf16, DataType.bf16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 16], \
      DataType.f16, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [8, ]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([ 64, 128, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, complex_transforms)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, complex_transforms)

#
def GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  # Upcast on operand A
  math_instructions = [
    MathInstruction( \
      [16, 8, 16], \
      DataType.s8, DataType.f16, DataType.f16, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
    MathInstruction( \
      [16, 8, 16], \
      DataType.s8, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
    MathInstruction( \
      [16, 8, 16], \
      DataType.u8, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
    MathInstruction( \
      [16, 8, 16], \
      DataType.u8, DataType.bf16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
    MathInstruction( \
      [16, 8, 16], \
      DataType.s8, DataType.bf16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
  ]

  min_cc = 80
  max_cc = 1024

  # For mixed-input kernels, the alignment constraints are a list of lists: each
  # inner list gives the per-operand alignments [alignA, alignB, alignC].
  alignment_constraints = [[16, 8, 8],]
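
  # Unpacking sketch for the per-operand form (illustrative only):
  #
  #   alignA, alignB, alignC = alignment_constraints[0]   # 16, 8, 8
  #
  # A is the narrow 8-bit operand here, so 16 elements of A and 8 elements of
  # the f16/bf16 operand B are both 128-bit accesses.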

  for math_inst in math_instructions:
    tile_descriptions = [
      # 128x128
      TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      # 128x64
      TileDescription([128, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      # 128x32
      TileDescription([128, 32, 64], 9, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      # 128x16
      TileDescription([128, 16, 64], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 16, 64], 3, [2, 1, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    # Stream-K uses more registers, which can cause spilling for the biggest
    # warp tile size when the accumulators are 32-bit.
    operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_b,
        math_inst.element_accumulator,
      ]

      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)

    for op in operations:
      if (DataTypeSize[op.C.element] == 16) and \
         (op.tile_description.threadblock_shape[1] <= 32):
        op.C.alignment = 4

#
def GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  # Upcast on operand B
  math_instructions = [
    MathInstruction( \
      [16, 8, 16], \
      DataType.f16, DataType.s8, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
    MathInstruction( \
      [16, 8, 16], \
      DataType.bf16, DataType.s8, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
    MathInstruction( \
      [16, 8, 16], \
      DataType.f16, DataType.u8, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
    MathInstruction( \
      [16, 8, 16], \
      DataType.bf16, DataType.u8, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_mixed_input_upcast),
  ]

  min_cc = 80
  max_cc = 1024

  # For mixed-input kernels, the alignment constraints are a list of lists: each
  # inner list gives the per-operand alignments [alignA, alignB, alignC].
  alignment_constraints = [[8, 16, 8],]

  for math_inst in math_instructions:
    tile_descriptions = [
      # 128x128
      TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      # 128x64
      TileDescription([128, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      # 128x32
      TileDescription([128, 32, 64], 9, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 32], 9, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      # 128x16
      TileDescription([128, 16, 64], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 16, 64], 3, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 16, 32], 9, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 16, 32], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 16, 32], 3, [2, 1, 1], math_inst, min_cc, max_cc),
      # 256x16
      TileDescription([256, 16, 32], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 16, 32], 3, [2, 1, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    # Stream-K uses more registers, which can cause spilling for the biggest
    # warp tile size when the accumulators are 32-bit.
    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)

    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
    if math_inst.element_a != math_inst.element_accumulator:

      data_type_mixed = [
        math_inst.element_a,
        math_inst.element_b,
        math_inst.element_a,
        math_inst.element_accumulator,
      ]

      operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
        data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombination, SwizzlingFunctor.Identity8)

      for op in operations:
        if op.tile_description.threadblock_shape[1] <= 32:
          op.C.alignment = 4

#
def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 32], \
      DataType.s8, DataType.s8, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
    MathInstruction( \
      [16, 8, 32], \
      DataType.u8, DataType.u8, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
  ]

  min_cc = 80
  max_cc = 1024
  smem_usage = 164

  alignment_constraints = [16,]
  alignment_constraints_small_channels = [16, 8, 4]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 32, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 64], 6, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 64], 6, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [math_inst.element_a, math_inst.element_b, math_inst.element_accumulator, DataType.s32]
    data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)

    operations = []

    operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

    operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

    operations += CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions,
      data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

    operations += CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions,
      data_type_mixed, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

    for op in operations:
      if op.tile_description.threadblock_shape[1] >= 128:
        if op.tile_description.threadblock_shape[0] == 32:
          op.C.alignment = 8
        else:
          op.C.alignment = 16
      else:
        op.C.alignment = 8

#

#
def GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
  ]

  math_inst = \
    MathInstruction( \
      [16, 8, 64], \
      DataType.s8, DataType.s8, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [16,]

  tile_descriptions = [
    TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.s8, DataType.s8, DataType.s32, DataType.s32]
  data_type_mixed = [DataType.s8, DataType.s8, DataType.s8, DataType.f32]

  CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
    data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)

  operations = []

  operations += CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
    data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

  for op in operations:
    if op.tile_description.threadblock_shape[1] >= 128:
      op.C.alignment = 16
    else:
      op.C.alignment = 8
#

#
|
|
def GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version):
|
|
|
|
if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
|
|
return
|
|
|
|
layouts = [
|
|
(LayoutType.ColumnMajorInterleaved32, LayoutType.RowMajorInterleaved32, LayoutType.ColumnMajorInterleaved32),
|
|
]
|
|
|
|
math_instructions = [
|
|
MathInstruction( \
|
|
[16, 8, 32], \
|
|
DataType.s8, DataType.s8, DataType.s32, \
|
|
OpcodeClass.TensorOp, \
|
|
MathOperation.multiply_add_saturate),
|
|
MathInstruction( \
|
|
[16, 8, 32], \
|
|
DataType.u8, DataType.u8, DataType.s32, \
|
|
OpcodeClass.TensorOp, \
|
|
MathOperation.multiply_add_saturate),
|
|
]
|
|
|
|
min_cc = 80
|
|
max_cc = 1024
|
|
|
|
alignment_constraints = [16,]
|
|
|
|
for math_inst in math_instructions:
|
|
tile_descriptions = [
|
|
TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
|
|
TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
|
|
TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
|
|
TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
|
|
TileDescription([128, 128, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
|
|
TileDescription([128, 64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
|
|
TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
|
|
TileDescription([ 64, 64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc),
|
|
]
|
|
|
|
data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]
|
|
|
|
operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \
|
|
data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
|
|
|
|
conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32)
|
|
|
|
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
|
|
data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
|
|
|
|
for op in operations:
|
|
op.C.alignment = 8
|
|
#
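
# The "16832" in the name above encodes the mma instruction shape m16n8k32,
# and the interleaved NC32HW32 / C32RSK32 tensor layouts pair with the
# ColumnMajorInterleaved32 GEMM layouts so int8 convolutions can feed the
# tensor cores without a separate layout transformation (a reading of the code
# above, not a comment from the original source).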

#
def GenerateSM80_TensorOp_16864_TN(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 64], \
      DataType.s4, DataType.s4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
    MathInstruction( \
      [16, 8, 64], \
      DataType.u4, DataType.u4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
  ]

  min_cc = 80
  max_cc = 1024
  alignment_constraints = [32,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [math_inst.element_a, math_inst.element_b, math_inst.element_accumulator, DataType.s32]
    data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)

    operations = []

    operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)

    operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

    for op in operations:
      if op.tile_description.threadblock_shape[1] >= 128:
        op.C.alignment = 16
      else:
        # The original branched on threadblock_shape[1] == 64 and on the
        # fallback case separately, but both branches assigned the same value.
        op.C.alignment = 8
#
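
# "TN" follows the BLAS naming convention as used here: A is RowMajor
# ("transposed" in column-major BLAS terms) and B is ColumnMajor. The s4/u4
# kernels above use alignment 32 because alignments count elements:
# 32 x 4 bits is again one 128-bit access.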

#
def GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
  ]

  math_inst = \
    MathInstruction( \
      [16, 8, 128], \
      DataType.s4, DataType.s4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate)

  min_cc = 80
  max_cc = 1024
  alignment_constraints = [32,]

  tile_descriptions = [
    TileDescription([ 64, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 64, 256], 3, [4, 1, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 128, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([ 64, 64, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.s4, DataType.s4, DataType.s32, DataType.s32]
  data_type_mixed = [DataType.s4, DataType.s4, DataType.s4, DataType.f32]

  CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
    data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)

  operations = []

  operations += CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
    data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

  for op in operations:
    if op.tile_description.threadblock_shape[1] > 128:
      op.C.alignment = 16
    else:
      op.C.alignment = 8
#
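
# The sparse variants (CreateSparseGemmOperator) target the structured-sparse
# tensor cores introduced with SM80, which is presumably why these generators
# gate on CUDA 11.1 rather than 11.0; note also the doubled instruction and
# tile K extents relative to the corresponding dense generators.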

#
def GenerateSM80_TensorOp_16864_Interleaved(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajorInterleaved64, LayoutType.RowMajorInterleaved64, LayoutType.ColumnMajorInterleaved64),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 64], \
      DataType.s4, DataType.s4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
    MathInstruction( \
      [16, 8, 64], \
      DataType.u4, DataType.u4, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_saturate),
  ]

  min_cc = 80
  max_cc = 1024
  alignment_constraints = [32,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]

    operations = []

    operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)

    conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64)

    operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
      data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)

    for op in operations:
      op.C.alignment = 16
#

#
def GenerateSM80_TensorOp_168256(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 256], \
      DataType.b1, DataType.b1, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.xor_popc),
    MathInstruction( \
      [16, 8, 256], \
      DataType.b1, DataType.b1, DataType.s32, \
      OpcodeClass.TensorOp, \
      MathOperation.and_popc),
  ]

  min_cc = 80
  max_cc = {
    MathOperation.xor_popc: 89,
    MathOperation.and_popc: 90
  }

  alignment_constraints = [128,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 512], 3, [4, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 256, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([256, 64, 512], 4, [4, 1, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 256, 512], 4, [1, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 128, 512], 5, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 64, 512], 6, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 128, 512], 6, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 64, 512], 10, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([256, 128, 1024], 3, [4, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 256, 1024], 3, [2, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([256, 64, 1024], 4, [4, 1, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 256, 1024], 4, [1, 4, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 128, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([128, 64, 1024], 3, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 128, 1024], 3, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
      TileDescription([ 64, 64, 1024], 5, [2, 2, 1], math_inst, min_cc, max_cc[math_inst.math_operation]),
    ]

    data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

#
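
# The binary (b1) kernels are the one place where max_cc is a dict rather than
# a scalar: each MathOperation carries its own compute-capability ceiling, and
# the tile loop selects it per instruction. A sketch of the lookup pattern
# (names taken from the function above):
#
#   cap = max_cc[math_inst.math_operation]   # e.g. 89 for xor_popc
#   tile = TileDescription([256, 128, 512], 3, [4, 2, 1], math_inst, min_cc, cap)
#
# Alignment 128 is consistent with the element-count convention: 128 x 1 bit
# equals one 128-bit access.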

#
def GenerateSM80_TensorOp_1688(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add)
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [4, 2, 1]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    data_type_mixed = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_a,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type_mixed, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)

    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
#
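
# "1688" encodes the m16n8k8 tf32 instruction. The alignment constraints
# [4, 2, 1] are in tf32 elements (4 x 32 bits = 128-bit loads, down to scalar
# access), and data_type_mixed stores C in the A/B element type while keeping
# the f32 accumulator for the epilogue, mirroring the mixed-output pattern of
# the integer generators above.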

#
def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f16, DataType.f16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_fast_f16),
    MathInstruction( \
      [16, 8, 8], \
      DataType.bf16, DataType.bf16, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_fast_bf16),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [4, 2, 1]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
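
# The fast-math variants accept f32 operands and outputs but round internally
# to tf32, f16, or bf16 via the multiply_add_fast_* math operations, trading a
# small amount of precision for tensor-core throughput on nominally-f32 GEMMs
# and convolutions.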

#
def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_fast_f32),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [4, 2, 1]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 128, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#

def GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = MathInstruction( \
    [16, 8, 8], \
    DataType.f32, DataType.f32, DataType.f32, \
    OpcodeClass.TensorOp, \
    MathOperation.multiply_add_complex_fast_f32)

  min_cc = 80
  max_cc = 1024

  tile_descriptions = [
    TileDescription([128, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [
    DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
  ]

  alignment_constraints = [1,]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  CreateGemmOperator(manifest, layouts, tile_descriptions, \
    data_type, alignment_constraints, complex_transforms)


#
def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 16], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [4]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 32], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]

    CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)
#

#
def GenerateSM80_TensorOp_1688_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = MathInstruction( \
    [16, 8, 8], \
    DataType.tf32, DataType.tf32, DataType.f32, \
    OpcodeClass.TensorOp, \
    MathOperation.multiply_add_complex)

  min_cc = 80
  max_cc = 1024

  tile_descriptions = [
    TileDescription([128, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [
    DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
  ]

  alignment_constraints = [1,]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  CreateGemmOperator(manifest, layouts, tile_descriptions, \
    data_type, alignment_constraints, complex_transforms)
#
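
# The four complex_transforms pairs enumerate conjugation on neither, either,
# or both operands, which is how the N/C operand variants of complex GEMM are
# produced from a single generator.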

#
def GenerateSM80_TensorOp_1688_rank_k(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_fast_f32),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1, 2, 4] # Alignment only applies to A in SYRK

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      #TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      #TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.f32, DataType.f32, DataType.f32]

    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
      data_type, alignment_constraints, BlasMode.symmetric)
#
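
# Rank-k updates read only A and C, so data_type carries three entries here
# instead of the four used by the GEMM generators, and (per the comment above)
# the alignment constraint applies to A alone.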

#
def GenerateSM80_TensorOp_1688_rank_k_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex_fast_f32),
  ]

  min_cc = 80
  max_cc = 1024

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
      #TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      DataType.cf32, DataType.cf32, DataType.cf32
    ]

    alignment_constraints = [1,]

    # SYRK
    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
      data_type, alignment_constraints, BlasMode.symmetric)

    # HERK
    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
      data_type, alignment_constraints, BlasMode.hermitian)
#
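
# The same layouts and tiles are registered twice: BlasMode.symmetric yields
# SYRK kernels and BlasMode.hermitian yields HERK kernels, differing only in
# how the update treats the conjugate part.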

#
def GenerateSM80_TensorOp_1688_trmm(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_fast_f32),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1, 2, 4]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      #TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]

    CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
      data_type, alignment_constraints)
#
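
# TRMM enumerates side x fill x diag, i.e. 2 x 2 x 2 = 8 variants for every
# (layout, tile, alignment) combination. A sketch of the enumeration the
# Create* helpers presumably perform internally (illustrative only; the actual
# helper implementations live elsewhere in this module):
#
#   for side, fill, diag in product(side_modes, fill_modes, diag_types):
#       ...  # instantiate one TRMM operation per combination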

#
def GenerateSM80_TensorOp_1688_trmm_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex_fast_f32),
  ]

  min_cc = 80
  max_cc = 1024

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
    ]

    alignment_constraints = [1,]

    complex_transforms = [
      ComplexTransform.none, ComplexTransform.conj,
    ]

    CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
      data_type, alignment_constraints, complex_transforms)
#

#
def GenerateSM80_TensorOp_1688_symm(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  # A and B have same layouts
  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_fast_f32),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [
    1, 2, 4
  ]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      #TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      #TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]

    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
      data_type, alignment_constraints, BlasMode.symmetric)
#

#
def GenerateSM80_TensorOp_1688_symm_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_instructions = [
    MathInstruction( \
      [16, 8, 8], \
      DataType.tf32, DataType.tf32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex),
    MathInstruction( \
      [16, 8, 8], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex_fast_f32),
  ]

  min_cc = 80
  max_cc = 1024

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      #TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
      #TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
    ]

    alignment_constraints = [1,]

    # SYMM
    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
      data_type, alignment_constraints, BlasMode.symmetric)

    # HEMM
    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
      data_type, alignment_constraints, BlasMode.hermitian)
#

#
def GenerateSM80_TensorOp_884(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 32, 16], 3, [4, 1, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 256, 16], 3, [1, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]

  CreateGemmOperator(manifest, layouts, tile_descriptions, \
    data_type, alignment_constraints)
#
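
# "884" encodes the m8n8k4 f64 tensor-core instruction. Alignment is 1 element
# throughout the f64 and complex-f64 generators, since f64 operands are
# accessed one 64-bit word at a time.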

#
def GenerateSM80_TensorOp_884_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 3, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  CreateGemmOperator(manifest, layouts, tile_descriptions, \
    data_type, alignment_constraints, complex_transforms)

#
def GenerateSM80_TensorOp_884_complex_gaussian(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex_gaussian)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  CreateGemmOperator(manifest, layouts, tile_descriptions, \
    data_type, alignment_constraints, complex_transforms)
#
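
# The gaussian variants use the three-multiply formulation of complex
# multiplication (Gauss's trick: 3 real multiplies plus extra additions
# instead of 4 multiplies), which is what
# MathOperation.multiply_add_complex_gaussian selects; note the smaller tile
# set relative to the plain complex generator above.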

#
def GenerateSM80_TensorOp_884_rank_k(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64]

  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.symmetric)
#

#
def GenerateSM80_TensorOp_884_rank_k_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64]

  # SYRK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.symmetric)

  # HERK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.hermitian)

#

#
def GenerateSM80_TensorOp_884_rank_k_complex_gaussian(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex_gaussian)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [ComplexTransform.none,]

  # SYRK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.symmetric)

  # HERK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.hermitian)
#

#
def GenerateSM80_TensorOp_884_trmm(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]

  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
    data_type, alignment_constraints)
#

#
def GenerateSM80_TensorOp_884_trmm_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    ComplexTransform.none, ComplexTransform.conj,
  ]

  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
    data_type, alignment_constraints, complex_transforms)
#


#
def GenerateSM80_TensorOp_884_trmm_complex_gaussian(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex_gaussian)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    ComplexTransform.none, ComplexTransform.conj,
  ]

  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
    data_type, alignment_constraints, complex_transforms)
#

#
def GenerateSM80_TensorOp_884_symm(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]

  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.symmetric)
#

#
def GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  # SYMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.symmetric)

  # HEMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.hermitian)
#

#
def GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = \
    MathInstruction( \
      [8, 8, 4], \
      DataType.f64, DataType.f64, DataType.f64, \
      OpcodeClass.TensorOp, \
      MathOperation.multiply_add_complex_gaussian)

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [ComplexTransform.none,]

  # SYMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.symmetric)

  # HEMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
    data_type, alignment_constraints, BlasMode.hermitian)
#

###################################################################################################

#
def GenerateSM80_Simt_f32(manifest, cuda_version):
  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction( \
      [1, 1, 1], \
      DataType.f32, DataType.f32, DataType.f32, \
      OpcodeClass.Simt, \
      MathOperation.multiply_add),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 8], 5, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 8], 4, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 64, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 32, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, \
      data_type, alignment_constraints)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#
|
|
|
|
|
|
#
|
|
def GenerateSM80_Simt_f64(manifest, cuda_version):
  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_instructions = [
    MathInstruction(
      [1, 1, 1],
      DataType.f64, DataType.f64, DataType.f64,
      OpcodeClass.Simt,
      MathOperation.multiply_add),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128, 128, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  32, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_type = [
      math_inst.element_a,
      math_inst.element_b,
      math_inst.element_accumulator,
      math_inst.element_accumulator,
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions,
      data_type, alignment_constraints)
#

##################################################################################################

#
def GenerateSM80_Simt_complex(manifest, cuda_version):
  math_instructions = [
    MathInstruction(
      [1, 1, 1],
      DataType.f32, DataType.f32, DataType.f32,
      OpcodeClass.Simt,
      MathOperation.multiply_add_complex),
  ]

  min_cc = 80
  max_cc = 1024

  alignment_constraints = [1,]

  data_type = [
    DataType.cf32,
    DataType.cf32,
    DataType.cf32,
    DataType.cf32
  ]

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

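  # Informational note: each (transform_a, transform_b) pair emits a separate GEMM
  # variant; ComplexTransform.conj conjugates that operand, covering the four
  # conjugation combinations analogous to the BLAS 'N'/'C' operand options.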
  for math_inst in math_instructions:

    tile_descriptions = [
      TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, alignment_constraints, complex_transforms)

    conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
#

###################################################################################################

#
def GenerateSM80(manifest, cuda_version):
  GenerateSM80_TensorOp_16816(manifest, cuda_version)
  GenerateSM80_SparseTensorOp_16832(manifest, cuda_version)
  GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version)
  GenerateSM80_TensorOp_1688(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version)
  GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_complex(manifest, cuda_version)
  # 3xTF32
  GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_rank_k(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_rank_k_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_trmm(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_trmm_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_symm(manifest, cuda_version)
  GenerateSM80_TensorOp_1688_symm_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_884(manifest, cuda_version)
  GenerateSM80_TensorOp_884_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_884_complex_gaussian(manifest, cuda_version)
  GenerateSM80_TensorOp_884_rank_k(manifest, cuda_version)
  GenerateSM80_TensorOp_884_rank_k_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_884_rank_k_complex_gaussian(manifest, cuda_version)
  GenerateSM80_TensorOp_884_trmm(manifest, cuda_version)
  GenerateSM80_TensorOp_884_trmm_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_884_trmm_complex_gaussian(manifest, cuda_version)
  GenerateSM80_TensorOp_884_symm(manifest, cuda_version)
  GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version)
  GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version)
  GenerateSM80_TensorOp_16816_mixed_input_upcast_a(manifest, cuda_version)
  GenerateSM80_TensorOp_16816_mixed_input_upcast_b(manifest, cuda_version)
  GenerateSM80_TensorOp_16832_TN(manifest, cuda_version)
  GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version)
  GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version)
  GenerateSM80_TensorOp_16864_TN(manifest, cuda_version)
  GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version)
  GenerateSM80_TensorOp_16864_Interleaved(manifest, cuda_version)
  GenerateSM80_TensorOp_168256(manifest, cuda_version)
  GenerateSM80_Simt_f32(manifest, cuda_version)
  GenerateSM80_Simt_f64(manifest, cuda_version)
  GenerateSM80_Simt_complex(manifest, cuda_version)

###################################################################################################

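# A minimal driving sketch (illustrative only; it assumes this module's Manifest and
# GeneratorTarget definitions and an argparse namespace `args` compatible with this
# file's command-line options):
#
#   manifest = Manifest(args)
#   GenerateSM80(manifest, args.cuda_version)   # cuda_version is a string such as "11.8"
#   manifest.emit(GeneratorTarget.Library)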
def GenerateSM89_TensorOp_16832_fp8(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 4):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor)
  ]

  math_instructions = [
    MathInstruction(
      [16, 8, 32],
      DataType.e4m3, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 32],
      DataType.e4m3, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 32],
      DataType.e5m2, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 32],
      DataType.e5m2, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 32],
      DataType.e4m3, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
    MathInstruction(
      [16, 8, 32],
      DataType.e4m3, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
    MathInstruction(
      [16, 8, 32],
      DataType.e5m2, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
    MathInstruction(
      [16, 8, 32],
      DataType.e5m2, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
  ]

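  # Informational note: e4m3 (4 exponent / 3 mantissa bits) and e5m2 (5 / 2) are the
  # two fp8 formats; all four A/B pairings are enumerated above, each with and
  # without fast accumulation.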
  min_cc = 89
  max_cc = 89

  alignment_constraints = [16,]
  alignment_constraints_small_channels = [16, 8, 4]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 64], 6, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 64], 6, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 3, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  32, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  32, 64], 6, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 64], 6, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  32, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 32, 128, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_types = [
      [
        math_inst.element_a,
        math_inst.element_b,
        DataType.f32,
        math_inst.element_accumulator
      ],
    ]

    operations = []
    for data_type in data_types:
      operations += CreateGemmOperator(manifest, layouts, tile_descriptions, data_type,
        alignment_constraints, None, EpilogueFunctor.LinearCombination)

      conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
      operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
        data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)

      operations += CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions,
        data_type, alignment_constraints_small_channels, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)

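    # Informational note: the loop below widens the C tensor alignment where the tile
    # shape allows it; tiles with a large N extent (and M above 32) take 16-element
    # epilogue accesses, while the remaining shapes stay at 8.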
    for op in operations:
      if op.tile_description.threadblock_shape[1] >= 128:
        if op.tile_description.threadblock_shape[0] == 32:
          op.C.alignment = 8
        else:
          op.C.alignment = 16
      else:
        op.C.alignment = 8

#
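# Informational note: the sparse fp8 path below is the 2:4 structured-sparsity
# counterpart of the dense kernels above; the instruction K extent doubles
# (16x8x64 vs. 16x8x32) because only half of the A operand's values are stored.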
def GenerateSM89_SparseTensorOp_16864_fp8(manifest, cuda_version):

  if not CudaToolkitVersionSatisfies(cuda_version, 12, 4):
    return

  layouts = [
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor)
  ]

  math_instructions = [
    MathInstruction(
      [16, 8, 64],
      DataType.e4m3, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 64],
      DataType.e4m3, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 64],
      DataType.e5m2, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 64],
      DataType.e5m2, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [16, 8, 64],
      DataType.e4m3, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
    MathInstruction(
      [16, 8, 64],
      DataType.e4m3, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
    MathInstruction(
      [16, 8, 64],
      DataType.e5m2, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
    MathInstruction(
      [16, 8, 64],
      DataType.e5m2, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add_fast_accum),
  ]

  min_cc = 89
  max_cc = 89

  alignment_constraints = [16,]

  for math_inst in math_instructions:
    tile_descriptions = [
      TileDescription([128,  64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([256,  64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([128,  64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
      TileDescription([ 64,  64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    ]

    data_types = [
      [
        math_inst.element_a,
        math_inst.element_b,
        DataType.f32,
        math_inst.element_accumulator
      ],
    ]

    operations = []
    for data_type in data_types:
      operations += CreateSparseGemmOperator(manifest, layouts, tile_descriptions, data_type,
        alignment_constraints, None, EpilogueFunctor.LinearCombination)

    for op in operations:
      if op.tile_description.threadblock_shape[1] >= 128:
        op.C.alignment = 16
      else:
        op.C.alignment = 8

###################################################################################################

#
def GenerateSM89(manifest, cuda_version):
  GenerateSM89_TensorOp_16832_fp8(manifest, cuda_version)
  GenerateSM89_SparseTensorOp_16864_fp8(manifest, cuda_version)

###################################################################################################

#
def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments.
  layouts = [
    [[LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 8], [LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 1]],
  ]

  math_instructions = [
    MathInstruction(
      [64, 128, 16],
      DataType.f16, DataType.f16, DataType.f16,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 16],
      DataType.f16, DataType.f16, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 16],
      DataType.bf16, DataType.bf16, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for math_inst in math_instructions:
    tile_descriptions_small = [
      # Not compatible with TmaWarpSpecializedCooperative
      TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
    ]
    tile_descriptions_medium = [
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
    ]
    tile_descriptions_large = [
      TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1]*2, math_inst.instruction_shape[2]*4],
        0, [4, 2, 1], math_inst, min_cc, max_cc, [2,1,1]),
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1]*2, math_inst.instruction_shape[2]*4],
        0, [4, 2, 1], math_inst, min_cc, max_cc, [1,2,1]),
    ]
    tile_descriptions = tile_descriptions_medium + tile_descriptions_large

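    # Informational note: the threadblock tiles above are built as multiples of the
    # WGMMA instruction shape (e.g. 2x along M of a 64x128x16 instruction gives a
    # 128x128x64 tile), and the trailing [x, y, z] argument is the thread block
    # cluster shape.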
    data_type = {
      "a_type"   : math_inst.element_a,
      "b_type"   : math_inst.element_b,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : math_inst.element_accumulator
    }

    # Set the alignment of C based on the destination format.
    for layout in layouts:
      if data_type["c_type"] in [DataType.s32, DataType.f32]:
        layout[2][1] = 4
      elif data_type["c_type"] in [DataType.f16, DataType.bf16]:
        layout[2][1] = 8

    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      schedules = [
        [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
      ]
      stream_k_schedules = [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]
    else:
      schedules = [
        [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
        # TmaWarpSpecializedCooperative and TmaWarpSpecializedPingpong require CUDA version >= 12.1 for optimal performance.
      ]
      stream_k_schedules = []

    CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)

    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      # Add stream-K variants
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
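    # Informational note: the stream-K tile scheduler splits output tiles along the
    # K dimension across CTAs, which reduces quantization inefficiency when the
    # problem shape does not divide evenly into tiles.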
    # persistent kernels with TMA epilogues
    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      # not enough smem for 256x128 f32 out with C allocation
      if data_type["d_type"] == DataType.f32:
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_medium, data_type,
          [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
           [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_medium, data_type,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
          tile_schedulers=[TileSchedulerType.StreamK])
      else:
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
          [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
           [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
          tile_schedulers=[TileSchedulerType.StreamK])

      # Emit instance without C allocation + load
      data_type["c_type"] = DataType.void
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
        [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
         [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
        [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
        tile_schedulers=[TileSchedulerType.StreamK])

    # For mixed-precision kernels, also generate kernels that write the output matrix in the A/B format.
    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. f16 accumulation).
    if math_inst.element_a != math_inst.element_accumulator:
      data_type_mixed = {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : math_inst.element_a,
        "d_type"   : math_inst.element_a,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      }

      # Set the alignment of C based on the destination format.
      for layout in layouts:
        if data_type_mixed["c_type"] in [DataType.s32, DataType.f32]:
          layout[2][1] = 4
        elif data_type_mixed["c_type"] in [DataType.f16, DataType.bf16]:
          layout[2][1] = 8

      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, schedules)
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])

      # persistent kernels with TMA epilogues
      if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
          [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
           [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
          tile_schedulers=[TileSchedulerType.StreamK])

        # Emit instance without C allocation + load
        data_type_mixed["c_type"] = DataType.void
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
          [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
           [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
          tile_schedulers=[TileSchedulerType.StreamK])

#
def GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments.
  layouts = [
    [[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
  ]

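  # Informational note: these "alignx" kernels serve A/B alignments below 8 elements.
  # TMA requires 16-byte-aligned addresses, which 4- and 2-element 16-bit operands
  # cannot guarantee, so the schedules below use cp.async-based mainloops instead.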
  math_instructions = [
    MathInstruction(
      [64, 128, 16],
      DataType.f16, DataType.f16, DataType.f16,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 16],
      DataType.f16, DataType.f16, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 16],
      DataType.bf16, DataType.bf16, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for math_inst in math_instructions:
    tile_descriptions_small = [
      # TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      #   0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
    ]
    tile_descriptions_medium = [
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
      # TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1]*2, math_inst.instruction_shape[2]*4],
      #   0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
    ]
    tile_descriptions = tile_descriptions_small + tile_descriptions_medium

    data_type = {
      "a_type"   : math_inst.element_a,
      "b_type"   : math_inst.element_b,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : math_inst.element_accumulator
    }

    # Set the alignment of C based on the destination format.
    for layout in layouts:
      if data_type["c_type"] in [DataType.s32, DataType.f32]:
        layout[2][1] = 4
      elif data_type["c_type"] in [DataType.f16, DataType.bf16]:
        layout[2][1] = 8

    schedules = [
      # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
      [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
    ]
    stream_k_schedules = []

    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      schedules += [
        [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
        # [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized]
      ]
      stream_k_schedules += [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]

    CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)

    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      # Add stream-K variants
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])

    # persistent kernels with TMA epilogues
    # if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
    #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
    #     [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
    #      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
    #
    #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
    #     [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
    #     tile_schedulers=[TileSchedulerType.StreamK])
    #
    #   # Emit instance without C allocation + load
    #   data_type["c_type"] = DataType.void
    #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
    #     [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
    #      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
    #
    #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
    #     [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
    #     tile_schedulers=[TileSchedulerType.StreamK])

    # For mixed-precision kernels, also generate kernels that write the output matrix in the A/B format.
    # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. f16 accumulation).
    if math_inst.element_a != math_inst.element_accumulator:
      data_type_mixed = {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : math_inst.element_a,
        "d_type"   : math_inst.element_a,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      }

      # Set the alignment of C based on the destination format.
      for layout in layouts:
        if data_type_mixed["c_type"] in [DataType.s32, DataType.f32]:
          layout[2][1] = 4
        elif data_type_mixed["c_type"] in [DataType.f16, DataType.bf16]:
          layout[2][1] = 8

      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, schedules)
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])

      # persistent kernels with TMA epilogues
      # if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
      #     [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
      #      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
      #
      #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
      #     [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
      #     tile_schedulers=[TileSchedulerType.StreamK])
      #
      #   # Emit instance without C allocation + load
      #   data_type_mixed["c_type"] = DataType.void
      #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
      #     [[KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
      #      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
      #
      #   CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
      #     [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
      #     tile_schedulers=[TileSchedulerType.StreamK])

#
def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments
  layouts_tf32 = [
    [[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4]],
    [[LayoutType.RowMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4]],
    [[LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 4]],
    [[LayoutType.ColumnMajor, 4], [LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4]],
  ]

  math_inst = MathInstruction(
    [64, 128, 8],
    DataType.tf32, DataType.tf32, DataType.f32,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  math_inst_largeN = MathInstruction(
    [64, 256, 8],
    DataType.tf32, DataType.tf32, DataType.f32,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  min_cc = 90
  max_cc = 90

  tile_descriptions_large = [
    TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
    TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
    TileDescription([math_inst_largeN.instruction_shape[0]*2, math_inst_largeN.instruction_shape[1], math_inst_largeN.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst_largeN, min_cc, max_cc, [2,1,1]),
    TileDescription([math_inst_largeN.instruction_shape[0]*2, math_inst_largeN.instruction_shape[1], math_inst_largeN.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst_largeN, min_cc, max_cc, [1,2,1]),
  ]

  tile_descriptions_medium = [
    TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
    TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
  ]

  tile_descriptions_small = [
    TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
    TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
  ]
  tile_descriptions = tile_descriptions_medium + tile_descriptions_small

  data_types = [
    {
      "a_type"   : math_inst.element_a,
      "b_type"   : math_inst.element_b,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : math_inst.element_accumulator
    },
    {
      "a_type"   : DataType.f32,
      "b_type"   : DataType.f32,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : DataType.f32
    }
  ]

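  # Informational note: the second data_types entry above takes f32 A/B operands; as
  # with the other tf32 kernels in this generator, they are rounded to tf32 in the
  # mainloop, so both entries map onto the same tf32 tensor core instruction.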
  schedules_default = [
    [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
    [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
  ]

  # TMA kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides),
  # because they run NN kernels underneath, and transposing the epilogue yields the correct output.
  schedules_transposed_epilogue = [
    [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed],
    [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.EpilogueTransposed]
  ]

  # TMA kernels with TN, NN, or NT layout
  layouts_tf32_tn_nn_nt = [layouts_tf32[0], layouts_tf32[2], layouts_tf32[3]]
  # TMA kernels with TT layout
  layouts_tf32_tt = [layouts_tf32[1]]

  if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions_small, data_types, [
      [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
      [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized]
    ])

    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions_medium, data_types, [
      [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
      [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
      [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
      [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
    ])

    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions_large, data_types, [
      [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
    ])

    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions_small, data_types, [
      [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.EpilogueTransposed]
    ])

    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions_medium, data_types, [
      [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.EpilogueTransposed],
      [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.EpilogueTransposed]
    ])

    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions_large, data_types, [
      [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.EpilogueTransposed],
    ])

  else:
    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn_nt, tile_descriptions, data_types, schedules_default)
    CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_types, schedules_transposed_epilogue)

#
def GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments.
  layouts = [
    [[LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 2], [LayoutType.RowMajor, 2], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 1], [LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 1], [LayoutType.RowMajor, 1], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.ColumnMajor, 1], [LayoutType.RowMajor, 1], [LayoutType.ColumnMajor, 1]],
  ]

  math_inst = MathInstruction(
    [64, 128, 8],
    DataType.tf32, DataType.tf32, DataType.f32,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  min_cc = 90
  max_cc = 90

  tile_descriptions_medium = [
    TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1])
  ]

  tile_descriptions_small = [
    # TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
    #   0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1])
  ]

  tile_descriptions = tile_descriptions_medium + tile_descriptions_small

  data_types = [
    {
      "a_type"   : math_inst.element_a,
      "b_type"   : math_inst.element_b,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : math_inst.element_accumulator
    },
    {
      "a_type"   : DataType.f32,
      "b_type"   : DataType.f32,
      "c_type"   : math_inst.element_accumulator,
      "d_type"   : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : DataType.f32
    }
  ]

  is_tt_layout = lambda v: v[0][0] == LayoutType.RowMajor and v[1][0] == LayoutType.RowMajor
  # Split kernels into TN/NT, NN or TT layouts. Materialize the filters as lists:
  # a bare filter() iterator would be exhausted by the first CreateGemmUniversal3xOperator
  # call below, and the later calls would silently emit nothing.
  layouts_tn_nn_nt = list(filter(lambda v: not is_tt_layout(v), layouts))
  layouts_tt = list(filter(is_tt_layout, layouts))

  CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
    # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
    [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
  ])

  # Kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides),
  # because they run NN kernels underneath, and transposing the epilogue yields the correct output.
  CreateGemmUniversal3xOperator(manifest, layouts_tt, tile_descriptions, data_types, [
    # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed],
    [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.EpilogueTransposed]
  ])

  if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
    CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
      # [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
    ])

    # Stream-K schedules
    CreateGemmUniversal3xOperator(manifest, layouts_tn_nn_nt, tile_descriptions, data_types, [
      [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
    ], tile_schedulers=[TileSchedulerType.StreamK])

#
def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments
  layouts = [
    [[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 16]],
  ]

  math_instructions = [
    MathInstruction(
      [64, 128, 32],
      DataType.s8, DataType.s8, DataType.s32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.u8, DataType.u8, DataType.s32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for math_inst in math_instructions:
    # 64x128x128
    tile_descriptions_small = [
      TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
    ]
    # 128x128x128
    tile_descriptions_medium = [
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
    ]
    tile_descriptions = tile_descriptions_medium + tile_descriptions_small

    data_types = [
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : math_inst.element_accumulator,
        "d_type"   : math_inst.element_accumulator,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.s8,
        "d_type"   : math_inst.element_a,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : DataType.f32
      }
    ]

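    # Informational note: the alignment assignments below size C/D accesses to
    # 128 bits, e.g. 128 // DataTypeSize[DataType.s8] = 16 elements, or 4 elements
    # for an s32 destination.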
    for data_type in data_types:
      for layout in layouts:
        layout[2][1] = 128 // DataTypeSize[data_type["d_type"]]
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type)

    # persistent kernels with TMA epilogues
    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      # Emit instance without C allocation+load
      data_types += [
        {
          "a_type"   : math_inst.element_a,
          "b_type"   : math_inst.element_b,
          "c_type"   : DataType.void,
          "d_type"   : math_inst.element_accumulator,
          "acc_type" : math_inst.element_accumulator,
          "epi_type" : math_inst.element_accumulator
        }
      ]
      for data_type in data_types:
        # Set output alignment based on destination format first
        for layout in layouts:
          layout[2][1] = 128 // DataTypeSize[data_type["d_type"]]
        # Pingpong persistent
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
          [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
           [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized]])
        # Cooperative persistent
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_medium, data_type,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
           [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]],
          tile_schedulers=[TileSchedulerType.Persistent, TileSchedulerType.StreamK]
        )

#
def GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments
  layouts = [
    [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]],
    [[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]],
  ]

  math_instructions = [
    MathInstruction(
      [64, 128, 32],
      DataType.s8, DataType.s8, DataType.s32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.u8, DataType.u8, DataType.s32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for math_inst in math_instructions:
    tile_descriptions_small = [
      # TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
      #   0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
    ]
    tile_descriptions_medium = [
      TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
        0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
    ]
    tile_descriptions = tile_descriptions_medium + tile_descriptions_small

    data_types = [
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : math_inst.element_accumulator,
        "d_type"   : math_inst.element_accumulator,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.s8,
        "d_type"   : math_inst.element_a,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : DataType.f32
      }
    ]

    for data_type in data_types:
      for layout in layouts:
        layout[2][1] = 128 // DataTypeSize[data_type["d_type"]]
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [
        # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
      ])

      if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [
          # [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
          [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]
        ])
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
          [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]],
          tile_schedulers=[TileSchedulerType.StreamK])

#
def GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments
  layouts = [
    [[LayoutType.RowMajor, 16], [LayoutType.ColumnMajor, 16], [LayoutType.ColumnMajor, 1]],  # TN Layout
  ]

  math_instructions = [
    # inst 64x128x32
    MathInstruction(
      [64, 128, 32],
      DataType.e4m3, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e4m3, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e5m2, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e5m2, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for math_inst in math_instructions:
    data_types = [
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.f32,
        "d_type"   : DataType.f32,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.f32,
        "d_type"   : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.f32,
        "d_type"   : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.bf16,
        "d_type"   : DataType.bf16,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.bf16,
        "d_type"   : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.bf16,
        "d_type"   : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.f16,
        "d_type"   : DataType.f16,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.f16,
        "d_type"   : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.f16,
        "d_type"   : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
    ]

    data_types_large_tile = [
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.void,
        "d_type"   : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type"   : math_inst.element_a,
        "b_type"   : math_inst.element_b,
        "c_type"   : DataType.void,
        "d_type"   : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      }
    ]

    if math_inst.instruction_shape[1] == 128:
      tile_descriptions_small = [
        # 64x128x128
        TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
        TileDescription([math_inst.instruction_shape[0], math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      ]
      tile_descriptions_large = [
        # 256x128x128
        TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
        TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      ]
      tile_descriptions = [
        # 128x128x128
        TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [1,2,1]),
        TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [2,1,1]),
      ]
    else:
      assert False, "math inst is not supported"

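    # Informational note: the *FP8FastAccum schedules below skip the periodic
    # promotion of partial sums into full-precision accumulators that the regular
    # fp8 schedules perform, trading a small amount of accuracy for throughput.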
    # some schedules disabled to save on library size
    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      schedules = [
        #[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.TmaWarpSpecializedFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized]
      ]
      stream_k_schedules = [
        [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized]
      ]
    else:
      schedules = [
        # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
        # TmaWarpSpecializedCooperative requires CUDA version >= 12.1 for optimal performance.
      ]
      stream_k_schedules = []

    for data_type in data_types:
      # With No-SMEM epilogues
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)

      if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        # Persistent kernels with TMA epilogues
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
           [KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.TmaWarpSpecialized],
           [KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        # Small tiles
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_small, data_type,
          [[KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.TmaWarpSpecialized],
           [KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, EpilogueScheduleType.NoSmemWarpSpecialized]])

        # Large tiles
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions_large, data_types_large_tile,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
           [KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        # Add stream-K variants (with and without TMA epilogues)
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
          [[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative],
           [KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
          tile_schedulers=[TileSchedulerType.StreamK])

#
def GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version):
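  # FP8 (e4m3/e5m2) GEMM kernels for operand alignments below TMA's 16-byte
  # requirement (8- and 4-element TN layouts); the schedules below are cp.async-based.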
  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  # layouts for ABC and their alignments
  layouts = [
    [[LayoutType.RowMajor, 8], [LayoutType.ColumnMajor, 8], [LayoutType.ColumnMajor, 1]], # TN Layout
    [[LayoutType.RowMajor, 4], [LayoutType.ColumnMajor, 4], [LayoutType.ColumnMajor, 1]], # TN Layout
  ]

  math_instructions = [
    # inst 64x128x32
    MathInstruction(
      [64, 128, 32],
      DataType.e4m3, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e4m3, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e5m2, DataType.e4m3, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    MathInstruction(
      [64, 128, 32],
      DataType.e5m2, DataType.e5m2, DataType.f32,
      OpcodeClass.TensorOp,
      MathOperation.multiply_add),
    # inst 64x64x32
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e4m3, DataType.e4m3, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e4m3, DataType.e5m2, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e5m2, DataType.e4m3, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
    # MathInstruction(
    #   [64, 64, 32],
    #   DataType.e5m2, DataType.e5m2, DataType.f32,
    #   OpcodeClass.TensorOp,
    #   MathOperation.multiply_add),
  ]

  min_cc = 90
  max_cc = 90

  for math_inst in math_instructions:
    data_types = [
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f32,
        "d_type" : DataType.f32,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f32,
        "d_type" : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f32,
        "d_type" : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.bf16,
        "d_type" : DataType.bf16,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.bf16,
        "d_type" : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.bf16,
        "d_type" : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f16,
        "d_type" : DataType.f16,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f16,
        "d_type" : DataType.e4m3,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
      {
        "a_type" : math_inst.element_a,
        "b_type" : math_inst.element_b,
        "c_type" : DataType.f16,
        "d_type" : DataType.e5m2,
        "acc_type" : math_inst.element_accumulator,
        "epi_type" : math_inst.element_accumulator
      },
    ]

    if math_inst.instruction_shape[1] == 128:
      tile_descriptions = [
        # 128x128x128
        TileDescription([math_inst.instruction_shape[0]*2, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
          0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
      ]

    # elif math_inst.instruction_shape[1] == 64:
    #   tile_descriptions = [
    #     # 256x64x128
    #     TileDescription([math_inst.instruction_shape[0]*4, math_inst.instruction_shape[1], math_inst.instruction_shape[2]*4],
    #       0, [4, 1, 1], math_inst, min_cc, max_cc, [1,1,1]),
    #   ]

    else:
      assert False, "math inst is not supported"

    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
      schedules = [
        # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
        # [KernelScheduleType.CpAsyncWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
        [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized],
      ]
      stream_k_schedules = [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized]]
    else:
      schedules = [
        # [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.CpAsyncWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
      ]
      stream_k_schedules = []

    for data_type in data_types:
      # With No-SMEM epilogues
      CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)

      if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
        # Persistent kernels with TMA epilogues
        # CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
        #   [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])

        # Add stream-K variants (with and without TMA epilogues)
        CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, stream_k_schedules, tile_schedulers=[TileSchedulerType.StreamK])
        # CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
        #   [[KernelScheduleType.CpAsyncWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]],
        #   tile_schedulers=[TileSchedulerType.StreamK])

#
def GenerateSM90_TensorOp_1684(manifest, cuda_version):
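  # Double-precision (f64) GEMM kernels built on the 16x8x4 DMMA Tensor Core instruction.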
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([256, 32, 16], 3, [4, 1, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 256, 16], 3, [1, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]

  CreateGemmOperator(manifest, layouts, tile_descriptions,
    data_type, alignment_constraints)
#

#
def GenerateSM90_TensorOp_1684_complex(manifest, cuda_version):
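  # complex<double> GEMM on the f64 16x8x4 Tensor Core instruction, enumerating all
  # four conjugation combinations of the A and B operands.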
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8 ], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8 ], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8 ], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 8 ], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 8 ], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 8 ], 4, [2, 1, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 3, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  CreateGemmOperator(manifest, layouts, tile_descriptions,
    data_type, alignment_constraints, complex_transforms)
#

#
def GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version):
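  # complex<double> GEMM using the Gaussian (three-real-multiply) complex multiply-add.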
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex_gaussian)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    (ComplexTransform.none, ComplexTransform.none),
    (ComplexTransform.conj, ComplexTransform.none),
    (ComplexTransform.none, ComplexTransform.conj),
    (ComplexTransform.conj, ComplexTransform.conj)
  ]

  CreateGemmOperator(manifest, layouts, tile_descriptions,
    data_type, alignment_constraints, complex_transforms)
#

#
def GenerateSM90_TensorOp_1684_rank_k(manifest, cuda_version):
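  # f64 rank-k update (SYRK) kernels over lower/upper fill modes.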
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64]

  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.symmetric)
#

#
def GenerateSM90_TensorOp_1684_rank_k_complex(manifest, cuda_version):
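  # complex<double> rank-k update kernels, emitted below in both symmetric (SYRK)
  # and Hermitian (HERK) BLAS modes.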
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64]

  # SYRK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.symmetric)

  # HERK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.hermitian)
#

#
def GenerateSM90_TensorOp_1684_rank_k_complex_gaussian(manifest, cuda_version):
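  # Gaussian-complex variants of the rank-k update (SYRK and HERK).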
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor),
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex_gaussian)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [ComplexTransform.none,]

  # SYRK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.symmetric)

  # HERK computation
  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.hermitian)
#

#
def GenerateSM90_TensorOp_1684_trmm(manifest, cuda_version):
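  # f64 triangular matrix-matrix multiply (TRMM) kernels over side/fill/diag combinations.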
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]

  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions,
    data_type, alignment_constraints)
#

#
def GenerateSM90_TensorOp_1684_trmm_complex(manifest, cuda_version):
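  # complex<double> TRMM kernels, with and without conjugation of the triangular operand.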
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    ComplexTransform.none, ComplexTransform.conj,
  ]

  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions,
    data_type, alignment_constraints, complex_transforms)
#

#
def GenerateSM90_TensorOp_1684_trmm_complex_gaussian(manifest, cuda_version):
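  # Gaussian-complex TRMM variants.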
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  diag_types = [
    DiagType.NonUnit, DiagType.Unit,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex_gaussian)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [
    ComplexTransform.none, ComplexTransform.conj,
  ]

  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions,
    data_type, alignment_constraints, complex_transforms)
#

#
def GenerateSM90_TensorOp_1684_symm(manifest, cuda_version):
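  # f64 symmetric matrix-matrix multiply (SYMM) kernels.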
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]

  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.symmetric)
#

#
def GenerateSM90_TensorOp_1684_symm_complex(manifest, cuda_version):
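  # complex<double> kernels emitted below in both symmetric (SYMM) and Hermitian (HEMM) modes.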
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  # SYMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.symmetric)

  # HEMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.hermitian)
#

#
def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version):
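  # Gaussian-complex SYMM/HEMM variants.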
  if not CudaToolkitVersionSatisfies(cuda_version, 11, 8):
    return

  layouts = [
    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
  ]

  side_modes = [
    SideMode.Left, SideMode.Right,
  ]

  fill_modes = [
    FillMode.Lower, FillMode.Upper,
  ]

  math_inst = MathInstruction(
    [16, 8, 4],
    DataType.f64, DataType.f64, DataType.f64,
    OpcodeClass.TensorOp,
    MathOperation.multiply_add_complex_gaussian)

  min_cc = 90
  max_cc = 90

  alignment_constraints = [1,]

  tile_descriptions = [
    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
    # TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
  ]

  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]

  complex_transforms = [ComplexTransform.none,]

  # SYMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.symmetric)

  # HEMM computation
  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions,
    data_type, alignment_constraints, BlasMode.hermitian)
#

###################################################################################################

def GenerateSM90_Conv3x(manifest, cuda_version,
                        log_indent_level: int = 0):
  """
  Generate CUTLASS 3 convolution kernel(s) for SM90.

  This is meant to be called from GenerateSM90.
  """
  log_debug_line('GenerateSM90_Conv3x', log_indent_level)
  log_indent_level = log_indent_level + 1

  if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
    return

  minimum_compute_capability = 90
  maximum_compute_capability = 90

  spatial_dims = [2, 3]

  def make_dims_and_alignments_triple(dim: int):
    byte_alignment_required_by_tma = 16
    return ((dim, byte_alignment_required_by_tma), # A
            (dim, byte_alignment_required_by_tma), # B
            (dim, byte_alignment_required_by_tma)) # C
  dims_and_alignments = [make_dims_and_alignments_triple(dim) for dim in spatial_dims]

  def make_math_instruction(data_types: Tuple[DataType, DataType, DataType],
                            instruction_shape: Tuple[int, int, int]) -> MathInstruction:
    default_opcode = OpcodeClass.TensorOp
    default_math_op = MathOperation.multiply_add
    [A_data_type, B_data_type, C_data_type] = data_types
    return MathInstruction(
      instruction_shape,
      A_data_type, B_data_type, C_data_type,
      default_opcode,
      default_math_op
    )
  data_types_and_instruction_shapes = [
    ((DataType.f16, DataType.f16, DataType.f16), (64, 64, 16)),
    ((DataType.f16, DataType.f16, DataType.f32), (64, 64, 16)),
    ((DataType.bf16, DataType.bf16, DataType.f32), (64, 64, 16)),
  ]
  math_instructions = map(lambda x: make_math_instruction(*x),
                          data_types_and_instruction_shapes)

  cluster_shapes = [
    [2, 1, 1],
    [1, 1, 1],
  ]
  conv_kinds = [
    ConvKind.Fprop,
    ConvKind.Dgrad
  ]
  mainloop_schedule = KernelScheduleType.ImplicitTmaWarpSpecializedSm90
  stages = 0 # zero means "deduce the number of stages automatically"

  # Build one TileDescription per cluster shape for each math instruction.
  for math_inst in math_instructions:
    tile_descriptions = []
    for cluster_shape in cluster_shapes:
      tile_shape = [
        math_inst.instruction_shape[0],
        math_inst.instruction_shape[1],
        math_inst.instruction_shape[2] * 4
      ]
      warp_count = [4, 1, 1]
      tile_description = TileDescription(
        tile_shape, stages, warp_count, math_inst,
        minimum_compute_capability, maximum_compute_capability,
        cluster_shape)
      tile_descriptions.append(tile_description)

    # It's typical to get the data types from the math instruction.
    data_type = {
      "a_type" : math_inst.element_a,
      "b_type" : math_inst.element_b,
      "c_type" : math_inst.element_accumulator,
      "d_type" : math_inst.element_accumulator,
      "acc_type" : math_inst.element_accumulator,
      "epi_type" : math_inst.element_accumulator
    }

    for conv_kind in conv_kinds:
      epilogue_schedule = EpilogueScheduleType.TmaWarpSpecialized
      schedule_pairs = [
        (mainloop_schedule, epilogue_schedule)
      ]
      CreateConvOperator3x(manifest,
                           dims_and_alignments = dims_and_alignments,
                           tile_descriptions = tile_descriptions,
                           data_types = data_type,
                           schedule_pairs = schedule_pairs,
                           tile_schedulers = [TileSchedulerType.Default], # -> void
                           conv_kind = conv_kind,
                           log_indent_level = log_indent_level)

def GenerateSM90(manifest, cuda_version):
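  # Single entry point for all SM90 kernel families; each generator above filters
  # itself by the minimum CUDA Toolkit version it requires.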
  GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version)
  GenerateSM90_TensorOp_1684(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_complex(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_rank_k(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_rank_k_complex(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_rank_k_complex_gaussian(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_trmm(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_trmm_complex(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_trmm_complex_gaussian(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_symm(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_symm_complex(manifest, cuda_version)
  GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version)
  GenerateSM90_Conv3x(manifest, cuda_version)

###################################################################################################

def numeric_log_level(log_level: str) -> int:
  """
  Converts the string identifier of the log level into the numeric identifier used
  in setting the log level.

  :param log_level: string representation of log level (e.g., 'INFO', 'DEBUG')
  :type log_level: str

  :return: numeric representation of log level
  :rtype: int
  """
  numeric_level = getattr(logging, log_level.upper(), None)
  if not isinstance(numeric_level, int):
    raise ValueError(f'Invalid log level: {log_level}')
  return numeric_level


# This function for defining the ArgumentParser is used to make it easy for the CUTLASS Python interface
# to leverage the functionality in this file without running this script via a shell prompt.
def define_parser():
  parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels")
  parser.add_argument("--operations", default="all", help="Specifies the operation to generate (gemm, all)")
  parser.add_argument("--build-dir", default=".", required=False, help="CUTLASS top-level build directory")
  parser.add_argument("--curr-build-dir", default=".", help="CUTLASS current build directory. cmake files will be emitted in this directory")
  parser.add_argument("--generator-target", default='library', help="Target of CUTLASS Library Generator.")
  parser.add_argument("--architectures", default='53;60;61;70;75;80;90', help="Target compute architectures")
  parser.add_argument("--kernels", default='', help='Comma-delimited list used to filter kernels by name.')
  parser.add_argument("--ignore-kernels", default='', help='Comma-delimited list of kernels to exclude from the build.')
  parser.add_argument("--filter-by-cc", default='True', type=str, help='If enabled, kernels whose compute capability range is not satisfied by the build target are excluded.')
  parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit")
  parser.add_argument('--kernel-filter-file', type=str, default=None, required=False, help='Full path of filter file')
  parser.add_argument('--selected-kernel-list', type=str, default=None, required=False,
                      help='Specify the output log file containing all enabled kernels in this build')
  parser.add_argument("--interface-dir", default=None, required=False, help="Interface header to kernels")
  parser.add_argument("--disable-full-archs-compilation", action="store_true", required=False, help="Disable compilation for every arch in --architectures")
  parser.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
                      help='Logging level to be used by the generator script')
  _add_package_disablement_flag(parser)
  return parser


if __name__ == "__main__":
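  # Example invocation of this script (filename and values are illustrative; see
  # define_parser above for the full flag set):
  #   python3 generator.py --operations all --architectures "90" --cuda-version 12.3.2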
  parser = define_parser()
  args = parser.parse_args()

  # Set the logging level based on the user-provided `--log-level` command-line option
  logging.basicConfig(level=args.log_level)

  manifest = Manifest(args)

  GenerateSM50(manifest, args.cuda_version)
  GenerateSM60(manifest, args.cuda_version)
  GenerateSM61(manifest, args.cuda_version)
  GenerateSM70(manifest, args.cuda_version)
  GenerateSM75(manifest, args.cuda_version)
  GenerateSM80(manifest, args.cuda_version)
  GenerateSM89(manifest, args.cuda_version)
  GenerateSM90(manifest, args.cuda_version)
  if 'library' in args.generator_target.split(','):
    manifest.emit(GeneratorTarget.Library)

  if args.selected_kernel_list is not None:
    if len(manifest.selected_kernels) > 0:
      with open(args.selected_kernel_list, 'w') as file_writer:
        for line in manifest.selected_kernels:
          file_writer.write("%s\n" % line)

###################################################################################################