# hqjenny-centrifuge/examples/gemx_tl/python/test_gemm.py

import numpy as np
import gemm
import sys
import random
import argparse

def cmp( A, B):
    """Compare two arrays and abort the run if they differ.

    Prints "Success!" when ``np.array_equal(A, B)`` holds. Otherwise prints a
    failure message and raises ``SystemExit`` with status 1, so the calling
    shell or CI job sees the failure (a bare ``sys.exit()`` reports status 0,
    i.e. success).

    Args:
        A: first array-like operand.
        B: second array-like operand.

    Raises:
        SystemExit: with code 1 when the operands are not equal.
    """
    if not np.array_equal(A, B):
        print ("not equal :(")
        sys.exit(1)
    print ("Success!\n")

def multiply_and_cmp(C, A, B, X, m, n, post_scale):
    """Check a device GEMM result against a NumPy golden model.

    The golden value is ``int16(((A . B + X) * post_scale[0]) // 2**post_scale[1])``
    with the product, bias add and scaling all carried out in 64-bit integers.

    Args:
        C: result to verify; reshaped to ``(m, n)`` before comparing.
        A: left operand matrix.
        B: right operand matrix.
        X: bias added to ``A . B``.
        m: expected number of result rows.
        n: expected number of result columns.
        post_scale: two-element sequence ``[multiplier, shift]``.

    Raises:
        SystemExit: with code 1 when the results differ. Before exiting, the
            operands and both results are dumped to ``*.np`` text files in the
            current directory for offline inspection.
    """
    # Accumulate in 64 bits so the intermediate product cannot overflow.
    acc64 = np.matmul(np.asarray(A, dtype=np.int64), np.asarray(B, dtype=np.int64))
    acc64 = acc64 + np.asarray(X, dtype=np.int64)
    scaled64 = acc64 * post_scale[0]
    # Floor division keeps the model in exact integer arithmetic. Plain `/`
    # is true division on Python 3, which would round-trip through float64
    # and truncate toward zero when narrowed below.
    scaled64 = scaled64 // (2 ** post_scale[1])
    # Narrow to 16 bits; out-of-range values wrap.
    C_cpu = np.reshape(scaled64.astype(np.int16), (m, n))
    C_fpga = np.reshape(np.asarray(C), (m, n))

    if np.array_equal(C_fpga, C_cpu):
        print ("Success!\n")
        return

    print ("Not equal!")
    print (C_fpga.shape, C_cpu.shape)
    np.savetxt("cpu_out.np", C_cpu, fmt="%d")
    np.savetxt("fpga_out.np", C_fpga, fmt="%d")
    np.savetxt("bias.np", X, fmt="%d")
    np.savetxt("A.np", A, fmt="%d")
    np.savetxt("B.np", B, fmt="%d")
    # Non-zero status so a mismatch is reported as a failure to the caller.
    sys.exit(1)

def test_basic_randint ( A_range, B_range, bias_range, m, k, n, post_scale):
    """Run ``test_basic`` on uniformly random integer operands.

    A is ``(m, k)`` int16 drawn from ``[-A_range, A_range)``, B is ``(k, n)``
    int16 drawn from ``[-B_range, B_range)``. The bias is ``(m, n)`` int32:
    all zeros when ``bias_range`` is 0, otherwise drawn from
    ``[-bias_range, bias_range)``. ``post_scale`` is forwarded unchanged.
    """
    lhs = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    rhs = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    if bias_range == 0:
        # A zero range means "no bias": use an all-zero bias matrix.
        offsets = np.zeros((m, n), dtype=np.int32)
    else:
        offsets = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    test_basic(lhs, rhs, offsets, post_scale)

def test_basic_randint_shift ( A_range, A_shift, B_range, B_shift, bias_range, bias_shift, m, k, n, post_scale):
    """Run ``test_basic`` on random integer operands offset by constant shifts.

    A is ``(m, k)`` int16 drawn from ``[-A_range, A_range)`` plus ``A_shift``;
    B is ``(k, n)`` int16 drawn from ``[-B_range, B_range)`` plus ``B_shift``.
    The bias is ``(m, n)`` int32, zero when ``bias_range`` is 0 and otherwise
    drawn from ``[-bias_range, bias_range)``, plus ``bias_shift``.
    ``post_scale`` is forwarded unchanged.
    """
    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
    mat_A = mat_A + A_shift
    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
    mat_B = mat_B + B_shift
    if bias_range != 0:
        bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)
    else:
        bias = np.zeros((m, n), dtype=np.int32)
    # The shift applies to both the random and the all-zero bias, mirroring
    # how A_shift and B_shift are always applied above.
    bias = bias + bias_shift
    test_basic(mat_A, mat_B, bias, post_scale)

#def test_basic_gauss ( a_mu, a_sigma, b_mu, b_sigma,  m, k, n, add_bias = False):
#    mat_A = np.random.randint(low=-A_range, high=A_range, size=(m, k), dtype=np.int16)
#    mat_B = np.random.randint(low=-B_range, high=B_range, size=(k, n), dtype=np.int16)
#    bias = np.random.randint(low=-bias_range, high=bias_range, size=(m, n), dtype=np.int32)

#    test_basic(mat_A, mat_B, bias, add_bias)

def test_basic(mat_A, mat_B, bias, post_scale = [1,1]):
    """Run one GEMM on the device and verify it against the CPU golden model.

    Prints the problem size and max/min/average of each operand, sends A, B,
    an int16 zero-filled result buffer and the bias through ``gemm.sendMat``,
    queues a single GEMM op with ``post_scale[0]`` and ``post_scale[1]``,
    executes it and reads the result back. The golden comparison is skipped
    only when m, n and k are all greater than 4096.
    """
    rows = mat_A.shape[0]
    inner = mat_A.shape[1]
    cols = mat_B.shape[1]
    print ("test_basic: %d %d %d %d %d" % (rows, inner, cols, post_scale[0], post_scale[1] ))
    for label, operand in (("A: ", mat_A), ("B: ", mat_B), ("bias: ", bias)):
        print (label, np.amax(operand), np.amin(operand), np.average(operand))

    result = np.zeros((rows, cols), dtype=np.int16)
    # Send order is kept as A, B, result, bias.
    for buf in (mat_A, mat_B, result, bias):
        gemm.sendMat(buf)
    gemm.addGEMMOp(mat_A, mat_B, result, bias, post_scale[0], post_scale[1])
    gemm.execute()
    gemm.getMat(result)

    if min(rows, cols, inner) > 4096:
        print("Skip golden comparision because large matrix size")
        return
    multiply_and_cmp(result, mat_A, mat_B, bias, rows, cols, post_scale)

def test_sendA_first(int_range, m, k, n):
    """Check ``gemm.matmul`` when A has already been sent to the device.

    A random int16 ``(m, k)`` matrix A is created and sent via
    ``gemm.sendMat`` before the ``(k, n)`` matrix B is even generated; the
    product is then requested with ``gemm.matmul(mat_A, mat_B, False)`` and
    compared with ``np.matmul`` applied directly to the int16 inputs.

    Raises:
        SystemExit: with code 1 when the device and CPU results differ, so
            the failure is visible in the process exit status (a bare
            ``sys.exit()`` would report success).
    """
    print ("test_sendA_first: %d %d %d %d" % (int_range, m, k, n ))
    mat_A = np.random.randint(low=-int_range, high=int_range, size=(m, k), dtype=np.int16)
    gemm.sendMat(mat_A)
    mat_B = np.random.randint(low=-int_range, high=int_range, size=(k, n), dtype=np.int16)
    # NOTE(review): the meaning of the third argument (False) is defined by
    # the gemm module and is not visible here - confirm against its API.
    C_fpga = gemm.matmul(mat_A, mat_B, False)
    C_cpu = np.matmul(mat_A, mat_B)
    # Normalise both results to (m, n) so only the values are compared.
    C_cpu = np.reshape(C_cpu.flatten(), (m, n))
    C_fpga = np.reshape(C_fpga.flatten(), (m, n))
    if np.array_equal(C_fpga, C_cpu):
        print ("Success!\n")
    else:
        print ("not equal :(")
        sys.exit(1)

def test_multiInstrv1(int_range, sz, add_bias = False):
    """Queue two chained GEMM ops in one execution and verify both results.

    Computes ``C = A . B + b0`` and ``E = C . D + b1`` on ``sz x sz`` int16
    matrices with post-scale ``(1, 0)``. The int32 biases are zero unless
    ``add_bias == True``, in which case they are random. Both C and E are
    read back; E is checked against a golden model fed with the device's C.
    The golden comparison is skipped when ``sz > 4096``.
    """
    print ("test_multiInstrv1: %d %d" % (int_range, sz ))
    dims = (sz, sz)

    def rand_mat(dtype):
        return np.random.randint(low=-int_range, high=int_range, size=dims, dtype=dtype)

    # Random draws happen in the order A, B, D, then (optionally) b0, b1.
    A = rand_mat(np.int16)
    B = rand_mat(np.int16)
    C = np.zeros(dims, dtype=np.int16)
    D = rand_mat(np.int16)
    E = np.zeros(dims, dtype=np.int16)
    # Explicit comparison against True is kept on purpose: truthy values
    # other than True/1 must not enable the random bias.
    if add_bias == True:
        b0 = rand_mat(np.int32)
        b1 = rand_mat(np.int32)
    else:
        b0 = np.zeros(dims, dtype=np.int32)
        b1 = np.zeros(dims, dtype=np.int32)

    for buf in (A, B, b0, C, D, E, b1):
        gemm.sendMat(buf)
    gemm.addGEMMOp(A, B, C, b0, 1, 0)
    gemm.addGEMMOp(C, D, E, b1, 1, 0)
    gemm.execute()
    for out in (C, E):
        gemm.getMat(out)

    if sz > 4096:
        print("Skip golden comparision because large matrix size")
        return
    print("test C")
    multiply_and_cmp(C, A, B, b0, sz, sz, [1,0])
    print("test E")
    multiply_and_cmp(E, C, D, b1, sz, sz, [1,0])

def test_multiInstrv2(int_range, sz):
    """Queue four chained GEMM ops in one execution and verify the last one.

    The chain is ``C = A.B``, ``E = C.D``, ``G = E.F``, ``I = G.H`` on
    ``sz x sz`` int16 matrices with zero int32 biases and post-scale
    ``(1, 0)``. Only I is read back. Its golden value is built from
    ``np.matmul`` applied to the int16 inputs for the first three stages and
    ``multiply_and_cmp`` for the last. Skipped when ``sz > 4096``.
    """
    print ("test_multiInstrv2: %d %d" % (int_range, sz ))
    dims = (sz, sz)

    def rand16():
        return np.random.randint(low=-int_range, high=int_range, size=dims, dtype=np.int16)

    def zeros(dtype):
        return np.zeros(dims, dtype=dtype)

    # Random inputs are drawn in the order A, B, D, F, H.
    A = rand16()
    B = rand16()
    C = zeros(np.int16)
    D = rand16()
    E = zeros(np.int16)
    F = rand16()
    G = zeros(np.int16)
    H = rand16()
    I = zeros(np.int16)
    for mat in (A, B, C, D, E, F, G, H, I):
        gemm.sendMat(mat)

    biases = [zeros(np.int32) for _ in range(4)]
    for bias in biases:
        gemm.sendMat(bias)

    # Each stage consumes the previous stage's output as its left operand.
    stages = ((A, B, C), (C, D, E), (E, F, G), (G, H, I))
    for (lhs, rhs, out), bias in zip(stages, biases):
        gemm.addGEMMOp(lhs, rhs, out, bias, 1, 0)
    gemm.execute()
    gemm.getMat(I)

    if sz > 4096:
        print("Skip golden comparision because large matrix size")
        return
    print("test I")
    G_golden = np.matmul(np.matmul(np.matmul(A, B), D), F)
    multiply_and_cmp(I, G_golden, H, biases[3], sz, sz, [1,0])

def test_rand_basic ( int_range, bias_range, num_iter, post_scale):
    """Run ``test_basic_randint`` ``num_iter`` times on random problem sizes.

    For every iteration m, k and n are each an independent power of two
    between 2**8 and 2**13 inclusive. ``int_range`` is used for both A and B;
    ``bias_range`` and ``post_scale`` are forwarded unchanged.
    """
    min_sz_exp = 8
    for iteration in range(num_iter):
        print ("test_rand_basic iter: %d" % iteration)
        # Exponents are drawn in the order m, k, n.
        rand_m, rand_k, rand_n = [
            2 ** (random.randint(0, 5) + min_sz_exp) for _ in range(3)
        ]
        test_basic_randint(int_range, int_range, bias_range, rand_m, rand_k, rand_n, post_scale)

if __name__ == '__main__':
    # Script entry point: open the device handle, run the GEMM test suite,
    # and always release the handle afterwards.
    parser = argparse.ArgumentParser(description='pyXDNN')
    parser.add_argument('--xclbin', required = True, help='.xclbin file')
    parser.add_argument('--gemxlib', required = True, help='FPGA gemx host shared library')
    args = parser.parse_args()

    gemm.createHandle(args.xclbin, "gemxKernel_0", args.gemxlib)
    try:
        # Square problems, doubling from 256 up to 4096, with two post-scale settings.
        size = 256
        while size < 8192:
            test_basic_randint( 32764, 32764, 0, size, size, size, [1,1])
            test_basic_randint( 32764, 32764, 0, size, size, size, [4,18])
            size = size * 2

        # Non-square shapes with several post-scale settings, repeated on
        # fresh random data.
        for i in range(5):
            test_basic_randint( 32764, 32764, 0, 512, 512, 32, [16,17])
            test_basic_randint( 32764, 32764, 0, 256, 512, 32, [2,18])
            test_basic_randint( 32764, 32764, 0, 2048, 512, 32, [4,18])
            test_basic_randint( 32764, 32764, 0, 2048, 512, 32, [128,17])
            # Legacy calls kept for reference; their argument lists do not
            # match the current 7-parameter test_basic_randint signature.
            #test_basic_randint( 32764, 256, 512, 32)
            #test_basic_randint( 100, 256, 512, 32)
            #test_basic_randint(32764, 256, 512, 256)
            #test_basic_randint(10, 256, 512, 256, True) fail
            #test_basic_randint(32764, 256, 512, 1024)
            #test_basic_randint(32764, 256, 512, 2048)
            #test_basic_randint(100, 16384, 16834, 8192)

        test_rand_basic (32764, 0, 5, [1,0])
        test_sendA_first(32764, 256, 512, 1024)
        test_multiInstrv1(32764, 256)

        # Chained four-op test on square sizes 256 .. 4096.
        for m_sz in range(5):
            sz = 2 ** (m_sz+8)
            print ("Size: %d" % sz)
            test_multiInstrv2( 32764, sz)
    finally:
        # The tests abort via sys.exit() on mismatch; finally ensures the
        # handle is closed on that path and on unexpected exceptions too.
        gemm.closeHandle()