# hqjenny-centrifuge/examples/gemx_tl/python/test_gemm.py
#
# 231 lines
# 8.8 KiB
# Python

import numpy as np
import gemm
import sys
import random
import argparse
def cmp(A, B):
    """Compare two arrays element-wise and abort the run if they differ.

    Prints "Success!" when ``np.array_equal(A, B)`` holds. Otherwise prints
    a failure message and terminates the process.

    Args:
        A: first array-like operand.
        B: second array-like operand.

    Raises:
        SystemExit: with exit status 1 when the operands are not equal.
    """
    if not np.array_equal(A, B):
        print("not equal :(")
        # A bare sys.exit() reports status 0, which makes a failed comparison
        # look like a passing run to the invoking shell or CI job.
        sys.exit(1)
    print("Success!\n")
def multiply_and_cmp(C, A, B, X, m, n, post_scale):
    """Check an accelerator result against a NumPy reference GEMM.

    The reference is ``((A @ B + X) * post_scale[0]) >> post_scale[1]``,
    evaluated in 64-bit integers and then narrowed to int16.

    Args:
        C: result to verify; reshaped to ``(m, n)`` before comparison.
        A: left operand of the product.
        B: right operand of the product.
        X: bias added to the product.
        m: number of rows of the result.
        n: number of columns of the result.
        post_scale: two-element sequence ``[multiplier, shift]``.

    Raises:
        SystemExit: with exit status 1 when the result differs from the
            reference. Before exiting, the reference, the result, the bias
            and both operands are written to ``cpu_out.np``, ``fpga_out.np``,
            ``bias.np``, ``A.np`` and ``B.np`` in the working directory.
    """
    # Accumulate in 64 bits so products of 16-bit operands cannot overflow.
    acc64 = np.matmul(np.int64(A), np.int64(B)) + np.int64(X)
    scaled64 = acc64 * post_scale[0]
    # Integer right shift instead of "/": on Python 3 "/" is float division,
    # which rounds negative values toward zero and makes the later narrowing
    # of out-of-range values unreliable. The shift is an exact floor division
    # by 2**shift on every interpreter.
    # NOTE(review): presumably the kernel also post-scales with an arithmetic
    # shift -- confirm against the hardware implementation.
    shifted64 = scaled64 >> post_scale[1]
    # Narrow to 16 bits; the integer cast keeps the low 16 bits (wraps).
    C_cpu = np.reshape(shifted64.astype(np.int16), (m, n))
    C_fpga = np.reshape(C, (m, n))

    if np.array_equal(C_fpga, C_cpu):
        print("Success!\n")
        return

    print("Not equal!")
    print (C_fpga.shape, C_cpu.shape)
    # Dump everything needed to reproduce the mismatch offline.
    np.savetxt("cpu_out.np", C_cpu, fmt="%d")
    np.savetxt("fpga_out.np", C_fpga, fmt="%d")
    np.savetxt("bias.np", X, fmt="%d")
    np.savetxt("A.np", A, fmt="%d")
    np.savetxt("B.np", B, fmt="%d")
    # Non-zero status so the failure is visible to the caller of the script.
    sys.exit(1)
def test_basic_randint ( A_range, B_range, bias_range, m, k, n, post_scale):
    """Run one GEMM test on uniformly random integer operands.

    Args:
        A_range: entries of the (m, k) left operand are drawn from
            [-A_range, A_range) as int16.
        B_range: entries of the (k, n) right operand are drawn from
            [-B_range, B_range) as int16.
        bias_range: entries of the (m, n) int32 bias are drawn from
            [-bias_range, bias_range); 0 selects an all-zero bias.
        m, k, n: problem dimensions.
        post_scale: ``[multiplier, shift]`` pair forwarded to ``test_basic``.
    """
    lhs = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    rhs = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)

    if bias_range == 0:
        # No random draw at all in this case, just a zero bias matrix.
        offsets = np.zeros((m, n), dtype=np.int32)
    else:
        offsets = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)

    test_basic(lhs, rhs, offsets, post_scale)
def test_basic_randint_shift ( A_range, A_shift, B_range, B_shift, bias_range, bias_shift, m, k, n, post_scale):
    """Run one GEMM test on random operands offset by constant shifts.

    Each matrix is drawn uniformly and then has its shift added to every
    element, which moves the distribution away from zero.

    Args:
        A_range: entries of the (m, k) left operand are drawn from
            [-A_range, A_range) as int16.
        A_shift: constant added to every entry of the left operand.
        B_range: entries of the (k, n) right operand are drawn from
            [-B_range, B_range) as int16.
        B_shift: constant added to every entry of the right operand.
        bias_range: entries of the (m, n) int32 bias are drawn from
            [-bias_range, bias_range); 0 selects an all-zero base bias.
        bias_shift: constant added to every entry of the bias.
        m, k, n: problem dimensions.
        post_scale: ``[multiplier, shift]`` pair forwarded to ``test_basic``.
    """
    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    mat_A = mat_A + A_shift
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    mat_B = mat_B + B_shift
    # NOTE(review): adding the shift may change the dtype or overflow int16
    # depending on the NumPy version and the shift value -- confirm the
    # operands handed to the accelerator are still int16.

    if bias_range != 0:
        bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    else:
        bias = np.zeros((m, n), dtype=np.int32)
    # Apply the shift on both paths; previously it was only added to the
    # all-zero bias and silently dropped for a random bias.
    bias = bias + bias_shift

    test_basic(mat_A, mat_B, bias, post_scale)
#def test_basic_gauss ( a_mu, a_sigma, b_mu, b_sigma, m, k, n, add_bias = False):
# mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
# mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
# bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
# test_basic(mat_A, mat_B, bias, add_bias)
def test_basic(mat_A, mat_B, bias, post_scale = [1,1]):
    """Execute a single GEMM on the accelerator and verify the result.

    Sends both operands, an int16 zero-initialised output buffer and the
    bias through ``gemm.sendMat``, queues one GEMM operation, runs it and
    reads the output back. The result is then checked by
    ``multiply_and_cmp`` unless all three dimensions exceed 4096.

    Args:
        mat_A: left operand; its shape supplies m and k.
        mat_B: right operand; its second dimension supplies n.
        bias: bias matrix passed to the GEMM operation.
        post_scale: ``[multiplier, shift]`` pair. The default list is only
            read, never modified.
    """
    m, k = mat_A.shape[0], mat_A.shape[1]
    n = mat_B.shape[1]

    print ("test_basic: %d %d %d %d %d" % (m, k, n, post_scale[0], post_scale[1] ))
    # Max / min / mean of every input, to make failing cases easier to triage.
    for label, values in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
        print (label, np.amax(values), np.amin(values), np.average(values))

    C_fpga = np.zeros( (m, n), dtype=np.int16)
    # Transfer order is kept as A, B, output, bias.
    for buf in (mat_A, mat_B, C_fpga, bias):
        gemm.sendMat(buf)
    gemm.addGEMMOp ( mat_A, mat_B, C_fpga, bias, post_scale[0], post_scale[1])
    gemm.execute()
    gemm.getMat(C_fpga)

    if all(dim > 4096 for dim in (m, n, k)):
        print("Skip golden comparision because large matrix size")
        return
    multiply_and_cmp(C_fpga, mat_A, mat_B, bias, m, n, post_scale)
def test_sendA_first(int_range, m, k, n):
    """Verify ``gemm.matmul`` when the left operand was transferred earlier.

    The (m, k) operand is sent with ``gemm.sendMat`` before the (k, n)
    operand is even generated; ``gemm.matmul`` is then called with ``False``
    as its third argument.
    NOTE(review): presumably that flag tells the binding not to re-send the
    left operand -- confirm against the gemm module.

    The reference is ``np.matmul`` on the int16 operands, so it stays int16
    and wraps on overflow.

    Args:
        int_range: entries of both operands are drawn from
            [-int_range, int_range) as int16.
        m, k, n: problem dimensions.

    Raises:
        SystemExit: with exit status 1 when the results differ.
    """
    print ("test_sendA_first: %d %d %d %d" % (int_range, m, k, n ))
    mat_A = np.random.randint(low=-int_range, high=int_range, size=(m, k), dtype=np.int16)
    gemm.sendMat(mat_A)
    mat_B = np.random.randint(low=-int_range, high=int_range, size=(k, n), dtype=np.int16)

    C_fpga = gemm.matmul(mat_A, mat_B, False)
    C_cpu = np.matmul(mat_A, mat_B)

    # Normalise both results to (m, n) so only the values are compared.
    C_cpu = np.reshape(C_cpu.flatten(), (m, n))
    C_fpga = np.reshape(C_fpga.flatten(), (m, n))

    if np.array_equal(C_fpga, C_cpu):
        print("Success!\n")
    else:
        print("not equal :(")
        # A bare sys.exit() would report status 0 and hide the failure from
        # whoever launched the script.
        sys.exit(1)
def test_multiInstrv1(int_range, sz, add_bias = False):
    """Queue two dependent GEMMs in one execution and verify both outputs.

    The queued operations are ``C = A * B + b0`` followed by
    ``E = C * D + b1``, both with post-scale ``[1, 0]``. C and E are read
    back and each is checked by ``multiply_and_cmp``; the check of E uses
    the C returned by the accelerator as its left operand. The checks are
    skipped when ``sz`` exceeds 4096.

    Args:
        int_range: random entries are drawn from [-int_range, int_range).
        sz: side length of every (square) matrix.
        add_bias: when equal to True the int32 biases are random, otherwise
            they are all zero.
    """
    print ("test_multiInstrv1: %d %d" % (int_range, sz ))
    dims = (sz, sz)

    def rand_mat(dtype):
        # One uniform draw of a square matrix in [-int_range, int_range).
        return np.random.randint(low=-int_range, high=int_range, size=dims, dtype=dtype)

    A = rand_mat(np.int16)
    B = rand_mat(np.int16)
    C = np.zeros(dims, dtype=np.int16)
    D = rand_mat(np.int16)
    E = np.zeros(dims, dtype=np.int16)
    b0 = np.zeros(dims, dtype=np.int32)
    b1 = np.zeros(dims, dtype=np.int32)
    # Kept as an equality test so non-bool arguments behave exactly as before.
    if add_bias == True:
        b0 = rand_mat(np.int32)
        b1 = rand_mat(np.int32)

    # Transfer order is part of the test and is kept unchanged.
    for buf in (A, B, b0, C, D, E, b1):
        gemm.sendMat(buf)
    gemm.addGEMMOp(A, B, C, b0, 1, 0)
    gemm.addGEMMOp(C, D, E, b1, 1, 0)
    gemm.execute()
    gemm.getMat(C)
    gemm.getMat(E)

    if sz > 4096:
        print("Skip golden comparision because large matrix size")
        return
    print("test C")
    multiply_and_cmp(C, A, B, b0, sz, sz, [1,0])
    print("test E")
    multiply_and_cmp(E, C, D, b1, sz, sz, [1,0])
def test_multiInstrv2(int_range, sz):
    """Queue a chain of four dependent GEMMs and verify the final output.

    The queued operations are ``C = A * B``, ``E = C * D``, ``G = E * F``
    and ``I = G * H``, each with an all-zero int32 bias and post-scale
    ``[1, 0]``. Only I is read back. The reference recomputes the first
    three products with ``np.matmul`` and passes the last one to
    ``multiply_and_cmp``. The check is skipped when ``sz`` exceeds 4096.

    Args:
        int_range: random entries are drawn from [-int_range, int_range).
        sz: side length of every (square) matrix.
    """
    print ("test_multiInstrv2: %d %d" % (int_range, sz ))
    dims = (sz, sz)

    def rand_operand():
        # Random int16 input matrix in [-int_range, int_range).
        return np.random.randint(low=-int_range, high=int_range, size=dims, dtype=np.int16)

    def zero_output():
        # int16 buffer that receives an intermediate or final product.
        return np.zeros(dims, dtype=np.int16)

    A = rand_operand()
    B = rand_operand()
    C = zero_output()
    D = rand_operand()
    E = zero_output()
    F = rand_operand()
    G = zero_output()
    H = rand_operand()
    I = zero_output()
    for buf in (A, B, C, D, E, F, G, H, I):
        gemm.sendMat(buf)

    # One zero bias per operation, sent after all the matrices.
    b0, b1, b2, b3 = (np.zeros(dims, dtype=np.int32) for _ in range(4))
    for buf in (b0, b1, b2, b3):
        gemm.sendMat(buf)

    gemm.addGEMMOp(A, B, C, b0, 1, 0)
    gemm.addGEMMOp(C, D, E, b1, 1, 0)
    gemm.addGEMMOp(E, F, G, b2, 1, 0)
    gemm.addGEMMOp(G, H, I, b3, 1, 0)
    gemm.execute()
    gemm.getMat(I)

    if sz > 4096:
        print("Skip golden comparision because large matrix size")
        return
    print("test I")
    # Rebuild the chain on the host up to G, then check the last stage.
    G_cpu = np.matmul(np.matmul(np.matmul(A, B), D), F)
    multiply_and_cmp(I, G_cpu, H, b3, sz, sz, [1,0])
def test_rand_basic ( int_range, bias_range, num_iter, post_scale):
    """Run ``test_basic_randint`` on randomly sized problems.

    Every iteration draws m, k and n independently as powers of two between
    2**8 and 2**13 inclusive.

    Args:
        int_range: value range used for both operands.
        bias_range: value range used for the bias (0 means a zero bias).
        num_iter: number of random problems to run.
        post_scale: ``[multiplier, shift]`` pair forwarded to the test.
    """
    min_sz_exp = 8
    for iteration in range(num_iter):
        print ("test_rand_basic iter: %d" % iteration)
        # Draw the three exponents first, in m, k, n order, then turn them
        # into sizes.
        exponents = [random.randint(0, 5) for _ in range(3)]
        rand_m, rand_k, rand_n = (2 ** (e + min_sz_exp) for e in exponents)
        test_basic_randint(int_range, int_range, bias_range, rand_m, rand_k, rand_n, post_scale)
if __name__ == '__main__':
    # Both the FPGA binary and the host shared library must be supplied.
    parser = argparse.ArgumentParser(description='pyXDNN')
    parser.add_argument('--xclbin', required = True, help='.xclbin file')
    parser.add_argument('--gemxlib', required = True, help='FPGA gemx host shared library')
    args = parser.parse_args()

    gemm.createHandle(args.xclbin, "gemxKernel_0", args.gemxlib)
    try:
        # Square problems of side 256, 512, 1024, 2048 and 4096, each with
        # two different post-scale settings.
        for size in (256, 512, 1024, 2048, 4096):
            test_basic_randint( 32764, 32764, 0, size, size, size, [1,1])
            test_basic_randint( 32764, 32764, 0, size, size, size, [4,18])

        # Rectangular problems with a narrow output (n = 32), repeated with
        # fresh random data.
        for _ in range(5):
            test_basic_randint( 32764, 32764, 0, 512, 512, 32, [16,17])
            test_basic_randint( 32764, 32764, 0, 256, 512, 32, [2,18])
            test_basic_randint( 32764, 32764, 0, 2048, 512, 32, [4,18])
            test_basic_randint( 32764, 32764, 0, 2048, 512, 32, [128,17])

        # Disabled experiments; they do not match the current
        # test_basic_randint signature and are kept for reference only.
        #test_basic_randint( 32764, 256, 512, 32)
        #test_basic_randint( 100, 256, 512, 32)
        #test_basic_randint(32764, 256, 512, 256)
        #test_basic_randint(10, 256, 512, 256, True) fail
        #test_basic_randint(32764, 256, 512, 1024)
        #test_basic_randint(32764, 256, 512, 2048)
        #test_basic_randint(100, 16384, 16834, 8192)

        test_rand_basic (32764, 0, 5, [1,0])
        test_sendA_first(32764, 256, 512, 1024)
        test_multiInstrv1(32764, 256)

        # Four-stage chained GEMM for sides 256 .. 4096.
        for m_sz in range(5):
            sz = 2 ** (m_sz+8)
            print ("Size: %d" % sz)
            test_multiInstrv2( 32764, sz)
    finally:
        # The tests abort with sys.exit on a mismatch; release the handle on
        # that path too instead of only after a fully successful run.
        gemm.closeHandle()