cutlass/examples/40_cutlass_py/customizable/gemm_grouped.py

299 lines
14 KiB
Python

################################################################################
#
# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import sys
print("This example is deprecated. Please see examples/python for examples of using "
"the CUTLASS Python interface.")
sys.exit(0)
import numpy as np
import cutlass.backend as pycutlass
from cutlass.backend import *
from cutlass.backend.utils.device import device_cc
import csv
import argparse
# parse the arguments
parser = argparse.ArgumentParser(
description="Launch CUTLASS GEMM Grouped kernels from Python")
# Operation description
# math instruction description
parser.add_argument("-i", "--instruction_shape",
default=[1, 1, 1], nargs=3, type=int,
help="This option describes the size of MMA op")
parser.add_argument("-ta", "--element_a", default="float32", type=str,
choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],
help='Data type of elements in input tensor A')
parser.add_argument("-tb", "--element_b", default="float32", type=str,
choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],
help='Data type of elements in input tensor B')
parser.add_argument("-tc", "--element_c", default="float32", type=str,
choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],
help='Data type of elements in input tensor C and output tensor D')
parser.add_argument("-tacc", "--element_acc", default="float32", type=str,
choices=['float64', 'float32', 'float16', 'bfloat16', 'int32', 'int8'],
help='Data type of accumulator')
parser.add_argument('-m', "--math", default="multiply_add",
type=str, choices=["multiply_add", "multiply_add_fast_bf16", "multiply_add_fast_f32"], help="math instruction")
parser.add_argument('-op', "--opcode", default="Simt", type=str,
choices=["Simt", 'TensorOp'], help='This option describes whether you want to use tensor \
cores (TensorOp) or regular SIMT cores (Simt) on GPU SM')
# tile description
parser.add_argument("-b", "--threadblock_shape",
default=[128, 128, 8], nargs=3, type=int,
help="This option describes the tile size a thread block with compute")
parser.add_argument("-s", "--stages", default=4,
type=int, help="Number of pipelines you want to use")
parser.add_argument("-w", "--warp_count", default=[
4, 2, 1], nargs=3, type=int,
help="This option describes the number of warps along M, N, and K of the threadblock")
parser.add_argument("-cc", "--compute_capability", default=80,
type=int, help="This option describes CUDA SM architecture number")
# A
parser.add_argument('-la', "--layout_a", default="RowMajor", type=str, choices=[
"RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"],
help="Memory layout of input tensor A")
parser.add_argument('-aa', '--alignment_a', default=1,
type=int, help="Memory alignment of input tensor A")
# B
parser.add_argument('-lb', "--layout_b", default="RowMajor", type=str, choices=[
"RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"],
help="Memory layout of input tensor B")
parser.add_argument('-ab', '--alignment_b', default=1,
type=int, help="Memory alignment of input tensor B")
# C
parser.add_argument('-lc', "--layout_c", default="RowMajor", type=str, choices=[
"RowMajor", "ColumnMajor", "RowMajorInterleaved32", "ColumnMajorInterleaved32"],
help="Memory layout of input tensor C and output tensor D")
parser.add_argument('-ac', '--alignment_c', default=1,
type=int, help="Memory alignment of input tensor C and output tensor D")
# epilogue
parser.add_argument("-te", "--element_epilogue", default="float32", type=str,
choices=['float64', 'float32', 'float16', 'bfloat16'], help='Epilogue datatype')
parser.add_argument("-ep", "--epilogue_functor", default="LinearCombination",
type=str, choices=['LinearCombination', 'FastLinearCombinationClamp', 'LinearCombinationClamp'],
help="This option describes the epilogue part of the kernel")
# swizzling
parser.add_argument("-sw", "--swizzling_functor", default="IdentitySwizzle1", type=str, choices=[
"IdentitySwizzle1", "IdentitySwizzle2", "IdentitySwizzle4", "IdentitySwizzle8", "HorizontalSwizzle"],
help="This option describes how thread blocks are scheduled on GPU. \
NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels. \
This parameter is passed in at present to match the APIs of other kernels. The parameter \
is unused within the kernel")
# precompute mode
parser.add_argument("-pm", "--precompute_mode",
default="Device", type=str, choices=["Host", "Device"],
help="Grouped Gemm Scheduing on device only (Device) or using host precompute (Host)")
# arguments
parser.add_argument("-p", "--problem_size_dir", type=str, default="grouped_gemm_problem_size.csv",
help="path to the csv file contains the problem sizes")
parser.add_argument("-alpha", "--alpha", default=1.0, type=float, help="alpha")
parser.add_argument("-beta", "--beta", default=0.0, type=float, help="beta")
parser.add_argument('-bias', '--bias', action='store_true', help="C is bias vector")
# Activation function
parser.add_argument("-activ", "--activation_function", default="identity",
choices=["identity", "relu", "leaky_relu", "tanh", "sigmoid", "silu", "hardswish", "gelu"], help="activation function")
parser.add_argument("-activ_arg", "--activation_args", default=[], nargs="+", type=float,
help="addition arguments for activation")
parser.add_argument('--print_cuda', action="store_true",
help="print the underlying CUDA kernel")
try:
args = parser.parse_args()
except:
sys.exit(0)
cc = device_cc()
if args.compute_capability != cc:
raise Exception(("Parameter --compute-capability of {} "
"does not match that of the device of {}.").format(args.compute_capability, cc))
pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)
np.random.seed(0)
element_a = getattr(cutlass_bindings, args.element_a)
element_b = getattr(cutlass_bindings, args.element_b)
element_c = getattr(cutlass_bindings, args.element_c)
element_acc = getattr(cutlass_bindings, args.element_acc)
math_operation = getattr(MathOperation, args.math)
opclass = getattr(cutlass_bindings.OpClass, args.opcode)
math_inst = MathInstruction(
args.instruction_shape, element_a, element_b,
element_acc, opclass, math_operation
)
tile_description = TileDescription(
args.threadblock_shape, args.stages, args.warp_count,
math_inst
)
layout_a = getattr(cutlass_bindings, args.layout_a)
layout_b = getattr(cutlass_bindings, args.layout_b)
layout_c = getattr(cutlass_bindings, args.layout_c)
A = TensorDescription(
element_a, layout_a, args.alignment_a
)
B = TensorDescription(
element_b, layout_b, args.alignment_b
)
C = TensorDescription(
element_c, layout_c, args.alignment_c
)
element_epilogue = getattr(cutlass_bindings, args.element_epilogue)
if args.activation_function == "identity":
epilogue_functor = getattr(pycutlass, args.epilogue_functor)(
C.element, C.alignment, math_inst.element_accumulator, element_epilogue)
else:
epilogue_functor = getattr(pycutlass, "LinearCombinationGeneric")(
getattr(pycutlass, args.activation_function)(element_epilogue),
C.element, C.alignment, math_inst.element_accumulator, element_epilogue)
swizzling_functor = getattr(cutlass_bindings, args.swizzling_functor)
precompute_mode = getattr(SchedulerMode, args.precompute_mode)
operation = GemmOperationGrouped(
arch=args.compute_capability, tile_description=tile_description,
A=A, B=B, C=C,
epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor,
precompute_mode=precompute_mode
)
if args.print_cuda:
print(operation.rt_module.emit())
pycutlass.compiler.add_module([operation, ])
reference_module = ReferenceModule(A, B, C)
# get problems
problem_sizes = []
with open(args.problem_size_dir) as csv_file:
reader = csv.reader(csv_file)
for row in reader:
problem_sizes.append(
cutlass_bindings.gemm.GemmCoord(int(row[0]), int(row[1]), int(row[2]))
)
problem_count = len(problem_sizes)
tensor_As = []
tensor_Bs = []
tensor_Cs = []
tensor_Ds = []
problem_sizes_coord = []
tensor_D_refs = []
for problem_size in problem_sizes:
if args.element_a != "int8":
if args.element_a == "bfloat16":
tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()
* problem_size.k(),))).astype(bfloat16)
else:
tensor_A = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.m()
* problem_size.k(),))).astype(getattr(np, args.element_a))
else:
tensor_A = np.random.uniform(low=-2, high=2, size=(problem_size.m()
* problem_size.k(),)).astype(getattr(np, args.element_a))
if args.element_b != "int8":
if args.element_b == "bfloat16":
tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k()
* problem_size.n(),))).astype(bfloat16)
else:
tensor_B = np.ceil(np.random.uniform(low=-8.5, high=7.5, size=(problem_size.k()
* problem_size.n(),))).astype(getattr(np, args.element_b))
else:
tensor_B = np.random.uniform(low=-2, high=2, size=(problem_size.k()
* problem_size.n(),)).astype(getattr(np, args.element_b))
if args.element_c != "int8":
if args.bias:
if args.layout_c == "RowMajor":
c_size = problem_size.n()
elif args.layout_c == "ColumnMajor":
c_size = problem_size.m()
else:
raise ValueError(args.layout_c)
else:
c_size = problem_size.m() * problem_size.n()
if args.element_c == "bfloat16":
tensor_C = np.ceil(
np.random.uniform(low=-8.5, high=7.5, size=(c_size,))
).astype(bfloat16)
else:
tensor_C = np.ceil(
np.random.uniform(low=-8.5, high=7.5, size=(c_size,))
).astype(getattr(np, args.element_c))
else:
tensor_C = np.random.uniform(
low=-2, high=2, size=(problem_size.m() * problem_size.n(),)
).astype(getattr(np, args.element_c))
tensor_D = np.zeros(
shape=(problem_size.m() * problem_size.n(),)
).astype(getattr(np, args.element_c))
tensor_As.append(tensor_A)
tensor_Bs.append(tensor_B)
tensor_Cs.append(tensor_C)
tensor_Ds.append(tensor_D)
tensor_D_ref = reference_module.run(
tensor_A, tensor_B, tensor_C, problem_size,
args.alpha, args.beta, args.bias)
tensor_D_ref = getattr(pycutlass, args.activation_function).numpy(*([tensor_D_ref,] + args.activation_args))
tensor_D_refs.append(tensor_D_ref)
problem_sizes_coord.append(problem_size)
arguments = GemmGroupedArguments(
operation, problem_sizes_coord, tensor_As, tensor_Bs, tensor_Cs, tensor_Ds,
output_op=operation.epilogue_type(*([args.alpha, args.beta] + args.activation_args))
)
operation.run(arguments)
arguments.sync()
for tensor_d, tensor_d_ref in zip(tensor_Ds, tensor_D_refs):
try:
assert np.array_equal(tensor_d, tensor_d_ref)
except:
assert np.allclose(tensor_d, tensor_d_ref, rtol=1e-5)
print("Passed.")