hqjenny-centrifuge/examples/gemx_tl/python/gemx_host.h

/*
* gemx_host.h
*
* Created on: Jan 20, 2018
* Author: xteng
*/
#ifndef SRC_GEMX_HOST_H_
#define SRC_GEMX_HOST_H_
#include "assert.h"
#include <stdio.h>
#include <vector>
#include <string>
#include <fstream>
#include "CL/cl.h"
#include "CL/cl_ext.h"
#include <boost/compute.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/program.hpp>
#include <iostream>
#include <iterator>
#include <unordered_map>
#include <stdlib.h>
#include <cstring>
#include <iomanip>
//#define GEMX_PERF_DBG
using namespace std;
namespace gemx {
typedef enum {
OpControl, OpGemv, OpGemm, OpTransp, OpSpmv, OpResult, OpFail, OpFcn
} OpType;
class XTimer
{
public:
XTimer() : beg_(clock_::now()) {}
void reset() { beg_ = clock_::now(); }
double elapsed() const {
return std::chrono::duration_cast<second_>
(clock_::now() - beg_).count(); }
private:
typedef std::chrono::high_resolution_clock clock_;
typedef std::chrono::duration<double, std::ratio<1> > second_;
std::chrono::time_point<clock_> beg_;
};
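// Usage sketch (illustration only): time a host-side region with
//   XTimer l_t;  /* ... work ... */  double l_sec = l_t.elapsed();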
class kArgs {
public:
virtual ~kArgs() {
}
virtual size_t sizeInBytes() = 0;
virtual char* asByteArray() = 0;
};
class SpMvArgs: public kArgs {
public:
virtual ~SpMvArgs() {
}
};
//////////////////////////// GEMM ////////////////////////////
class GemmArgs: public kArgs {
public:
virtual ~GemmArgs() {
}
GemmArgs() = delete;
GemmArgs(unsigned int p_Aoffset, unsigned int p_Boffset,
unsigned int p_Coffset, unsigned int p_Xoffset, unsigned int p_M, unsigned int p_K,
unsigned int p_N, unsigned int p_Lda, unsigned int p_Ldb,
unsigned int p_Ldc, unsigned int p_Ldx, int post_scale, int post_shift) :
m_gemm_args( { int(OpGemm), p_Aoffset, p_Boffset, p_Coffset, p_Xoffset, p_M, p_K,
p_N, p_Lda, p_Ldb, p_Ldc, p_Ldx, 0, 0, 0, 0 }) {
m_gemm_args.m_postScaleVal = (post_scale << 8) | (post_shift & 0x000000ff);
}
size_t sizeInBytes() {
return sizeof(m_gemm_args);
}
char *asByteArray() {
return reinterpret_cast<char*>(&m_gemm_args);
}
protected:
struct {
int m_optype;
unsigned int m_Aoffset, m_Boffset, m_Coffset, m_Xoffset, m_M, m_K, m_N,
m_Lda, m_Ldb, m_Ldc, m_Ldx;
int m_postScaleVal;
int dummy[3];
} m_gemm_args;
};
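// Packing note for m_postScaleVal (see the constructor above): the low 8 bits hold
// post_shift and the upper bits hold post_scale, i.e.
//   m_postScaleVal = (post_scale << 8) | (post_shift & 0xff)
// The host golden model Mat::multiplyAddScale decodes it the same way:
//   result = (accum >> (val & 0xff)) * (val >> 8)
// so, for example, post_scale = 1 and post_shift = 0 pack to 0x100 (a pass-through).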
class FcnArgs: public kArgs {
public:
virtual ~FcnArgs() {
}
FcnArgs() = delete;
FcnArgs(unsigned int p_Aoffset, unsigned int p_Boffset,
unsigned int p_Coffset, unsigned int p_Xoffset, unsigned int p_M, unsigned int p_K,
unsigned int p_N, unsigned int p_Lda, unsigned int p_Ldb,
unsigned int p_Ldc, unsigned int p_Ldx, int post_scale, int post_shift, short prelu_scale, short prelu_alpha) :
m_fcn_args( { OpFcn, p_Aoffset, p_Boffset, p_Coffset, p_Xoffset, p_M, p_K,
p_N, p_Lda, p_Ldb, p_Ldc, p_Ldx, 0, 0, 0, 0 }) {
m_fcn_args.m_postScaleVal = (post_scale << 8) | (post_shift & 0x000000ff);
m_fcn_args.m_PReLUVal = (prelu_scale << 6) | (prelu_alpha & 0x003f);
//std::cout << "s_dummy: " << m_fcn_args.s_dummy << std::endl;
//printf ("PReLUVal: %d\n", m_fcn_args.m_PReLUVal);
//std::stringstream stream;
//std::cout << "optype: " << optype << " p_Aoffset: " << p_Aoffset << std::endl;
/*
int * data = (int*)asByteArray();
for (int i = 0; i < sizeInBytes()/4; i++){
std::cout << "word " << i << ": " << data[i] << std::endl;
}
*/
//std::string result( stream.str() );
//std::cout << "Hex: " << result << std::endl;
}
size_t sizeInBytes() {
return sizeof(m_fcn_args);
}
char *asByteArray() {
return reinterpret_cast<char*>(&m_fcn_args);
}
protected:
struct {
int m_optype;
unsigned int m_Aoffset, m_Boffset, m_Coffset, m_Xoffset, m_M, m_K, m_N,
m_Lda, m_Ldb, m_Ldc, m_Ldx;
int m_postScaleVal;
short m_PReLUVal;
short s_dummy;
int dummy[2];
} m_fcn_args;
};
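// Packing note for m_PReLUVal: the low 6 bits hold prelu_alpha (an arithmetic shift)
// and the upper bits hold prelu_scale, i.e.
//   m_PReLUVal = (prelu_scale << 6) | (prelu_alpha & 0x3f)
// Mat::matMultWithScaleAndPRelu applies it only to negative outputs, scaling by
// (val >> 6) and shifting by (val & 0x3f) (the two overloads differ in the order).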
// Matrix descriptor with data itself stored in caller's space
template<typename T>
class Mat {
private:
unsigned int m_Rows, m_Cols, m_Ld, m_buf_sz;
bool m_ownmem;
T *m_Addr;
public:
const static size_t GEMX_CMP_WIDTH = 11;
Mat() = delete;
~Mat() {
if (m_ownmem && m_Addr) {
free(m_Addr);
}
}
Mat(unsigned int p_Rows, unsigned int p_Cols, unsigned int p_Ld) :
m_Rows(p_Rows), m_Cols(p_Cols), m_Ld(p_Ld), m_ownmem(true) {
m_buf_sz = sizeof(T) * p_Rows * p_Ld;
posix_memalign((void**) &m_Addr, 4096, m_buf_sz);
//m_Addr = (T*)aligned_alloc ( 4096, sizeof(T) * p_Rows * p_Cols);
}
Mat(unsigned int p_Rows, unsigned int p_Cols, unsigned int p_Ld, T *p_Addr) :
m_Rows(p_Rows), m_Cols(p_Cols), m_Ld(p_Ld), m_ownmem(false), m_Addr(p_Addr) {
m_buf_sz = sizeof(T) * p_Rows * p_Ld;
}
Mat& operator=(const Mat& p_Src) {
assert(p_Src.rows() == rows());
assert(p_Src.cols() == cols());
for (unsigned int row = 0; row < m_Rows; ++row) {
for (unsigned int col = 0; col < m_Ld; ++col) {
m_Addr[row * m_Ld + col] = p_Src.m_Addr[row * p_Src.m_Ld + col];
}
}
return *this;
}
unsigned int buf_sz(){
return m_buf_sz;
}
T*& data() {
return m_Addr;
}
inline T &getVal(unsigned int p_Row, unsigned int p_Col) {
return m_Addr[p_Row * ld() + p_Col];
}
inline unsigned int rows() {
return m_Rows;
}
inline unsigned int cols() {
return m_Cols;
}
inline unsigned int ld() {
return m_Ld;
}
void init(unsigned int p_Rows, unsigned int p_Cols, unsigned int p_Ld,
T *p_Addr) {
m_Rows = p_Rows;
m_Cols = p_Cols;
m_Ld = p_Ld;
m_Addr = p_Addr;
}
void fillModRange(T p_Min, T p_Max) {
T l_val = p_Min;
for (unsigned int row = 0; row < m_Rows; ++row) {
for (unsigned int col = 0; col < ld(); ++col) {
getVal(row, col) = l_val++;
if ( l_val > p_Max ) l_val = p_Min;
}
}
}
void fillMod(T p_Max, T p_First = 0) {
T l_val = p_First;
for (unsigned int row = 0; row < m_Rows; ++row) {
for (unsigned int col = 0; col < ld(); ++col) {
getVal(row, col) = l_val;
l_val++;
l_val %= p_Max;
}
}
}
void multiply(Mat & p_A, Mat & p_B) {
assert(p_A.rows() == rows());
assert(p_A.cols() == p_B.rows());
assert(p_B.cols() == cols());
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < cols(); ++col) {
T l_val = 0;
for (unsigned int k = 0; k < p_A.cols(); ++k) {
l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
}
//cout << " DEBUG multiply setting row=" << row << " col=" << col << endl;
getVal(row, col) = l_val;
}
}
}
void
multiplyAddScale(Mat & p_A, Mat & p_B, Mat<int> & p_X, int32_t p_postScale) {
assert(p_A.rows() == rows());
assert(p_A.cols() == p_B.rows());
assert(p_B.cols() == cols());
assert(p_X.rows() == rows());
assert(p_X.cols() == cols());
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < cols(); ++col) {
int64_t l_val = 0;
for (unsigned int k = 0; k < p_A.cols(); ++k) {
l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
}
l_val += p_X.getVal(row, col);
l_val = (l_val >> (p_postScale & 0x00ff)) * (p_postScale >> 8);
T l_entry = (T)(l_val);
getVal(row, col) = l_entry;
}
}
}
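// multiplyAddScale is the host-side reference model for the GEMM op:
// C = ((A * B + X) >> post_shift) * post_scale, using the same decode of the
// packed post-scale word as GemmArgs produces.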
void matMultWithScaleAndPRelu(Mat & p_A, Mat & p_B, Mat<int> & p_X, int32_t p_postScale, int16_t p_PReluVal) {
cout << "A rows: " << p_A.rows() << " this rows: " << rows() << endl;
assert(p_A.rows() == rows());
assert(p_A.cols() == p_B.rows());
assert(p_B.cols() == cols());
assert(p_X.rows() == rows());
assert(p_X.cols() == cols());
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < cols(); ++col) {
int64_t l_val = 0;
for (unsigned int k = 0; k < p_A.cols(); ++k) {
l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
// if ((row==2) && (col == 0)) {
// if (p_B.getVal(k, col) != 0) {
// std::cout << " A[2," << std::dec << k << "]= " << p_A.getVal(row, k) << std::hex << " 0x" << p_A.getVal(row, k);
// std::cout << " B[" << std::dec << k << ",0]= " << p_B.getVal(k,col) << std::hex << " 0x" << p_B.getVal(k,col);
// std::cout << " A*B+C = " << std::dec << l_val << std::hex << " 0x" << l_val << "\n";
// }
// }
}
// if ((row == 2) && (col == 0)) {
// std::bitset<64> l_bVal{l_val};
// std::cout << "C[2,0]= " << l_bVal << "\n";
// }
l_val += p_X.getVal(row,col);
unsigned int l_psShift = p_postScale & 0x00ff;
unsigned int l_psVal = p_postScale >> 8;
l_val = (l_val >> l_psShift) * l_psVal;
T l_entry = (T)(l_val);
if (l_entry < 0) {
l_entry = (l_entry >> (p_PReluVal & 0x003f))* (T)(p_PReluVal >> 6);
}
getVal(row, col) = l_entry;
}
}
}
void matMultWithScaleAndPRelu(Mat & p_A, Mat & p_B, int32_t p_bias, int32_t p_postScale, int16_t p_PReluVal) {
assert(p_A.rows() == rows());
assert(p_A.cols() == p_B.rows());
assert(p_B.cols() == cols());
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < cols(); ++col) {
int64_t l_val = 0;
for (unsigned int k = 0; k < p_A.cols(); ++k) {
l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
}
l_val += p_bias;
l_val = (l_val >> (p_postScale & 0x00ff)) * (p_postScale >> 8);
// Saturate the post-scaled accumulator to the signed 16-bit output range
if (l_val & 0x100000000){ // sign-extended negative value (bit 32 set)
if ((l_val & 0x0ffff0000) != 0xffff0000) {//underflow
l_val = 0x8000;
}
}
else {
if ((l_val & 0x0ffff0000) != 0) {
l_val = 0x7fff;
}
}
T l_entry = (T)(l_val);
if (l_entry < 0) {
l_entry = (l_entry * (T)(p_PReluVal >> 6)) >> (p_PReluVal & 0x003f);
}
getVal(row, col) = l_entry;
}
}
}
void multiplyGf(Mat & p_A, Mat & p_B, unsigned int p_EdgeWidth) {
assert(p_A.rows() == rows());
assert(p_A.cols() == p_B.rows());
assert(p_B.cols() == cols());
cout << " DEBUG multiplyGf rows=" << rows() << " cols=" << cols()
<< "\n";
for (unsigned int rowBlock = 0; rowBlock < rows() / p_EdgeWidth;
++rowBlock) {
for (unsigned int colBlock = 0; colBlock < cols() / p_EdgeWidth;
++colBlock) {
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < cols(); ++col) {
T l_val = 0;
for (unsigned int k = 0; k < p_A.cols(); ++k) {
l_val += p_A.getVal(k + rowBlock * p_EdgeWidth,
col + colBlock * p_EdgeWidth)
* p_B.getVal(k + rowBlock * p_EdgeWidth,
col + colBlock * p_EdgeWidth);
}
getVal(row + rowBlock * p_EdgeWidth,
col + colBlock * p_EdgeWidth) = l_val;
cout << "DEBUG multiplyGf after k-loop " << *this
<< "\n";
}
}
}
}
}
// Matrix A is in GvA format (its dimensions are wider and shorter)
// p_rowEdgeWidth just indicates the compute-array intake edge, to allow for matrix dimension adjustment
void multiplyGemvGf(Mat & p_A, Mat & p_B, unsigned int p_rowEdgeWidth) {
assert(p_A.rows() * p_rowEdgeWidth == rows());
assert(p_A.cols() == p_B.rows() * p_rowEdgeWidth);
assert(p_B.cols() == cols());
cout << " DEBUG multiplyGvA format rows=" << rows() << " cols="
<< cols() << "\n";
// Rows here are mblocks, cols are within the mblock
for (unsigned int row = 0; row < p_A.rows(); ++row) { // A is already in block format
for (unsigned int col = 0; col < p_A.cols(); ++col) {
unsigned int k = col / p_rowEdgeWidth;
unsigned int w = col % p_rowEdgeWidth;
T l_a = p_A.getVal(row, col);
T l_b = p_B.getVal(k, 0);
getVal(w + row * p_rowEdgeWidth, 0) += l_a * l_b;
//cout << " += a * b = " << l_a << " * " << l_b << "\n";
}
//cout << " DEBUG multiplyGemvGf after k-loop " << *this << "\n";
}
}
#if 0
void
multiplySpmv(SpMat<T, TspD, Tsp> & p_A, Mat & p_B) {
T l_val = 0;
assert(p_A.rows() == rows());
assert(p_A.cols() == p_B.rows());
assert(p_B.cols() == cols());
vector<MtxRow> l_rows = p_A.getNnzVector();
for (MtxRow &l_row : l_rows) {
unsigned int row = l_row.getRow(),
col = l_row.getCol();
double l_val = l_row.getVal();
getVal(row, 0) += l_val * p_B.getVal(col, 0);
//cout << "DEBUG multiplySpmv row=" << row << " col=" << col << " "
// << l_val << " * " << p_B.getVal(col, 0)
// << " was added to " << getVal(row, 0) << "\n";
}
}
#endif
void transpose(Mat & p_A) {
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < cols(); ++col) {
getVal(row, col) = p_A.getVal(col, row);
}
}
swap(m_Rows, m_Cols);
}
void transposeGva(Mat & p_A, unsigned int p_rowEdgeWidth,
unsigned int p_colEdgeWidth) {
unsigned int l_pos = 0;
for (unsigned int rowBlock = 0; rowBlock < p_A.rows() / p_rowEdgeWidth;
++rowBlock) {
for (unsigned int colBlock = 0;
colBlock < p_A.cols() / p_colEdgeWidth; ++colBlock) {
for (unsigned int col = 0; col < p_colEdgeWidth; ++col) {
for (unsigned int row = 0; row < p_rowEdgeWidth; ++row) {
getVal(l_pos / cols(), l_pos % cols()) = p_A.getVal(
row + rowBlock * p_rowEdgeWidth,
col + colBlock * p_colEdgeWidth);
l_pos++;
}
//cout << "DEBUG transposeGva step " << *this << "\n";
}
}
}
swap(m_Rows, m_Cols);
}
void print(ostream& os) {
os << m_Rows << "x" << m_Cols << " Ld=" << m_Ld << "\n";
unsigned int l_cols = cols(); // print the logical matrix; use ld() instead to dump the parent storage (within Ld)
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < l_cols; ++col) {
os << int(getVal(row, col)) << " ";
}
os << "\n";
}
}
bool cmp(float p_TolRel, float p_TolAbs, Mat &p_Ref) {
bool ok = true;
unsigned int l_verbose = 1; // 0: none, 1: show mismatches, 2: also show inexact matches within tolerance, 3: show all
unsigned int l_numExactMatches = 0, l_numMismatches = 0;
for (unsigned int row = 0; row < rows(); ++row) {
for (unsigned int col = 0; col < cols(); ++col) {
string l_Prefix = " row " + to_string(row) + " col "
+ to_string(col);
T v = getVal(row, col);
T vRef = p_Ref.getVal(row, col);
bool l_exactMatch = false;
bool l_ok = cmpVal(p_TolRel, p_TolAbs, vRef, v, l_Prefix,
l_exactMatch, l_verbose);
ok = ok && l_ok;
if (l_exactMatch) {
l_numExactMatches++;
}
if (!l_ok) {
l_numMismatches++;
}
}
}
unsigned int l_total = rows() * cols();
unsigned int l_withinTolerance = l_total - l_numExactMatches
- l_numMismatches;
cout << " Compared " << l_total << " values:" << " exact match "
<< l_numExactMatches << " within tolerance "
<< l_withinTolerance << " mismatch " << l_numMismatches
<< "\n";
return (ok);
}
bool cmpVal(float p_TolRel, float p_TolAbs, T vRef, T v,
std::string p_Prefix, bool &p_exactMatch, unsigned int p_Verbose) {
float l_diffAbs = abs(v - vRef);
float l_diffRel = l_diffAbs;
if (vRef != 0) {
l_diffRel /= abs(vRef);
}
p_exactMatch = (vRef == v);
bool l_status = p_exactMatch || (l_diffRel <= p_TolRel)
|| (l_diffAbs <= p_TolAbs);
if ((p_Verbose >= 3) || ((p_Verbose >= 2) && !p_exactMatch)
|| ((p_Verbose >= 1) && !l_status)) {
std::cout << p_Prefix << " ValRef " << std::left
<< std::setw(GEMX_CMP_WIDTH) << vRef << " Val " << std::left
<< std::setw(GEMX_CMP_WIDTH) << v << " DifRel "
<< std::left << std::setw(GEMX_CMP_WIDTH) << l_diffRel
<< " DifAbs " << std::left << std::setw(GEMX_CMP_WIDTH)
<< l_diffAbs << " Status " << l_status << "\n";
}
return (l_status);
}
};
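// Minimal golden-model sketch for the Mat class above (sizes are illustrative):
//   gemx::Mat<short> l_a(256, 256, 256), l_b(256, 256, 256), l_c(256, 256, 256);
//   l_a.fillMod(17);
//   l_b.fillMod(13);
//   l_c.multiply(l_a, l_b);                      // CPU reference result
//   bool l_ok = l_c.cmp(1e-3f, 1e-9f, l_cFpga);  // compare against FPGA output (l_cFpga assumed)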
//Base address will be the instruction memory region
class XCL_FPGA {
public:
XCL_FPGA() = delete;
XCL_FPGA(const string & xclbin, const string & kernelName) {
loadXclbin(xclbin, kernelName);
}
~XCL_FPGA() {
}
void loadXclbin(string p_XclbinFile, string p_KernelName) {
// https://gitenterprise.xilinx.com/rkeryell/heterogeneous_examples/blob/master/vector_add/SDAccel-Boost.Compute/vector_add.cpp
// Create the OpenCL context to attach resources on the device
m_Context = move(boost::compute::system::default_context());
// Create the OpenCL command queue to control the device
//m_CommandQueue = move(boost::compute::system::default_queue());
//boost::compute::command_queue queue(boost::compute::system::default_context(), boost::compute::system::default_device(), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE);
boost::compute::command_queue queue(
boost::compute::system::default_context(),
boost::compute::system::default_device(),
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
m_CommandQueue = move(queue);
// Construct an OpenCL program from the precompiled kernel file
m_Program = move(
boost::compute::program::create_with_binary_file(p_XclbinFile,
m_Context));
m_Program.build();
m_Kernel = move(boost::compute::kernel(m_Program, p_KernelName));
}
boost::compute::buffer createBuf(void *ptr, size_t sz_bytes) {
//decltype of cl_mem_ext_ptr_t.flags
//unsigned l_k2bank[] = {GEMX_fpgaDdrBanks};
//std::cout << "l_k2bank: " << l_k2bank[0] << std::endl;
cl_mem_ext_ptr_t l_bufExt;
//l_bufExt.obj = NULL;
l_bufExt.param = 0;
l_bufExt.flags = GEMX_fpgaDdrBanks;
l_bufExt.obj = ptr;
// Buffers
return boost::compute::buffer(m_Context, sz_bytes,
CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX,
&l_bufExt);
}
bool copyToFpga(const boost::compute::buffer & buf, bool sync_send) {
boost::compute::event l_event;
//cout << "copyToFPGA" << endl;
// Send the input data to the accelerator
l_event = m_CommandQueue.enqueue_migrate_memory_objects(1, &(buf.get()),
0);
if (sync_send){
l_event.wait();
} else{
m_waitInput.insert(l_event);
}
return true;
}
///////////////////////////////////////////////////////////////////////////
boost::compute::buffer copyToFpga(void * buf, size_t sz_bytes,
bool sync_send = false) {
boost::compute::buffer cl_buf = createBuf(buf, sz_bytes);
copyToFpga(cl_buf, sync_send);
return cl_buf;
}
void copyFromFpga(const boost::compute::buffer & buf, bool sync_exec = true) {
//cout << "copyFromFPGA" << endl;
XTimer t;
boost::compute::event l_readEvents =
m_CommandQueue.enqueue_migrate_memory_objects(1, &(buf.get()),
CL_MIGRATE_MEM_OBJECT_HOST, m_waitOutput);
//l_readEvents.wait();
if ( sync_exec ){
l_readEvents.wait();
m_waitOutput.clear();
} else{
m_waitOutput.insert(l_readEvents);
}
#ifdef GEMX_PERF_DBG
cout << "copyFromFpga: " << t.elapsed() << endl;
#endif
}
void execKernel(const boost::compute::buffer & instr_buf, bool sync_exec = false) {
boost::compute::extents<1> offset { 0 };
boost::compute::extents<1> global { 1 };
// Use only 1 CU
boost::compute::extents<1> local { 1 };
// Launch kernels
m_Kernel.set_args(instr_buf, instr_buf);
XTimer t;
boost::compute::event l_event = m_CommandQueue.enqueue_nd_range_kernel(
m_Kernel, offset, global, local, m_waitInput);
if ( 1 ) { // always block until the kernel finishes; the sync_exec flag is currently unused
l_event.wait();
} else{
m_waitOutput.insert(l_event);
}
m_waitInput.clear();
#ifdef GEMX_PERF_DBG
cout << "execKernel: " << t.elapsed() << endl;
#endif
}
void wait ()
{
for (size_t i = 0; i < m_waitOutput.size(); i++){
m_waitOutput[i].wait();
}
m_waitInput.clear();
m_waitOutput.clear();
}
private:
boost::compute::program m_Program;
boost::compute::kernel m_Kernel;
boost::compute::context m_Context;
boost::compute::command_queue m_CommandQueue;
boost::compute::wait_list m_waitInput, m_waitOutput;
};
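// XCL_FPGA wraps the OpenCL plumbing: createBuf() builds CL_MEM_USE_HOST_PTR buffers
// bound to the DDR bank named by the GEMX_fpgaDdrBanks macro (assumed to be defined by
// the build), copyToFpga()/copyFromFpga() migrate them with
// enqueue_migrate_memory_objects, and execKernel() launches a single work-item
// (one compute unit) that consumes the instruction buffer.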
template<typename HType>
class GEMMHost {
public:
GEMMHost() = delete;
~GEMMHost() {
}
GEMMHost(const GEMMHost<HType> &) = delete;
GEMMHost(const string & xclbin, const string & kernelName) {
_fpga = shared_ptr<XCL_FPGA>(new XCL_FPGA(xclbin, kernelName));
void *aligned_mem = nullptr;
int l_rc = posix_memalign(&aligned_mem, PAGE_SIZE, INSTR_BUF_SIZE); // keep the allocation outside assert() so it survives NDEBUG builds
assert(l_rc == 0); (void)l_rc;
_instrBuf = shared_ptr<char>((char*) aligned_mem, free); // allocated with posix_memalign, so release with free(), not delete
memset(_instrBuf.get(), 0, INSTR_BUF_SIZE);
_instr_offset = 0;
_cl_instr_buf = _fpga->copyToFpga(_instrBuf.get(), INSTR_BUF_SIZE,
true);
xclGetMemObjDeviceAddress(_cl_instr_buf.get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &_ddrDeviceBaseAddr);
int l_rc2 = posix_memalign(&aligned_mem, PAGE_SIZE, KERN_DBG_BUF_SIZE);
assert(l_rc2 == 0); (void)l_rc2;
_kernDbgBuf = shared_ptr<char>((char*) aligned_mem, free);
_cl_kern_dbg_buf = _fpga->copyToFpga(_kernDbgBuf.get(),
KERN_DBG_BUF_SIZE, true);
}
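// The instruction buffer is a single PAGE_SIZE page: AddGEMMOp()/AddFCNOp() append one
// packed kArgs record (64 bytes each, per the struct layouts above) via AddInstr(), and
// Execute() ships the page to the device, runs the kernel, then clears it for the next batch.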
bool AddGEMMOp(const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift) {
return AddGEMMOp (A, B, C, bias, m, k, n, k, n, n, n, postScale, postShift);
}
bool AddGEMMOp(const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, unsigned int lda, unsigned int ldb, unsigned int ldc, unsigned int ldx, int postScale, int postShift) {
XTimer t;
if (_hostMat.find(A) == _hostMat.end()
|| _hostMat.find(B) == _hostMat.end()
|| _hostMat.find(C) == _hostMat.end()
|| _hostMat.find(bias) == _hostMat.end()) {
cerr << "Matrix not found!" << endl;
return false;
}
unsigned long long A_off = 0, B_off = 0, C_off = 0, X_off = 0;
xclGetMemObjDeviceAddress(_devHandle[A].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &A_off);
xclGetMemObjDeviceAddress(_devHandle[B].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &B_off);
xclGetMemObjDeviceAddress(_devHandle[C].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &C_off);
if ( _devHandle.find(bias) != _devHandle.end()){
xclGetMemObjDeviceAddress(_devHandle[bias].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &X_off);
assert(X_off > _ddrDeviceBaseAddr);
X_off -= _ddrDeviceBaseAddr;
}
// cout << "A_dev_addr: " << A_off << " B_dev_addr: " << B_off << " C_dev_addr: " << C_off << " X_dev_addr: " << X_off << endl;
assert(A_off > _ddrDeviceBaseAddr);
assert(B_off > _ddrDeviceBaseAddr);
assert(C_off > _ddrDeviceBaseAddr);
A_off -= _ddrDeviceBaseAddr;
B_off -= _ddrDeviceBaseAddr;
C_off -= _ddrDeviceBaseAddr;
assert(A_off % PAGE_SIZE == 0);
assert(B_off % PAGE_SIZE == 0);
assert(C_off % PAGE_SIZE == 0);
assert(X_off % PAGE_SIZE == 0);
A_off /= PAGE_SIZE;
B_off /= PAGE_SIZE;
C_off /= PAGE_SIZE;
X_off /= PAGE_SIZE;
GemmArgs gargs(A_off, B_off, C_off, X_off, m,
k, n, lda, ldb, ldc, ldx, postScale, postShift);
AddInstr ( &gargs);
return true;
}
bool AddMat(const HType & handle, void * mat_ptr, unsigned long long buf_sz) {
if (_hostMat.find(handle) == _hostMat.end()) {
_hostMat[handle] = mat_ptr;
_hostMatSz[handle] = buf_sz;
return true;
}
else if (_hostMatSz[handle] != buf_sz ){
_hostMat[handle] = mat_ptr;
_hostMatSz[handle] = buf_sz;
_devHandle.erase(handle);
cout << "Erasing devhandle!" << endl;
return true;
}
//cout << "Matrix " << handle << " already added!" << endl;
return false;
}
/*
bool AddMat(const HType & handle, short *p_Addr, unsigned int p_Rows,
unsigned int p_Cols, unsigned int p_Ld) {
return AddMat(handle,
shared_ptr<ShortMat>(
new ShortMat(p_Rows, p_Cols, p_Ld, p_Addr)));
}
bool AddMat(const HType & handle, int *p_Addr, unsigned int p_Rows,
unsigned int p_Cols, unsigned int p_Ld) {
return AddMat(handle,
shared_ptr<IntMat>(
new IntMat(p_Rows, p_Cols, p_Ld, p_Addr)));
}
*/
/*
shared_ptr<BaseMat> AllocMat(const HType & handle, size_t n_rows,
size_t n_cols, size_t ld, bool sendToFPGA = false, bool sync_send =
false) {
if (_hostMat.find(handle) != _hostMat.end()) {
cout << "Matrix " << handle << " already created!" << endl;
return _hostMat[handle];
}
cout << "Matrix " << handle << " " << n_rows << " " << n_cols << " "
<< ld << endl;
void* aligned_buffer = nullptr;
// assert ( !posix_memalign ( &aligned_buffer, PAGE_SIZE, sizeof(T) * n_rows * n_cols ));
assert(
!posix_memalign(&aligned_buffer, PAGE_SIZE,
* n_rows * ld));
shared_ptr<Mat<DType> > mat_ptr = shared_ptr<Mat<DType> >(
new Mat<DType>(n_rows, n_cols, ld, (DType*) aligned_buffer));
assert(AddMat(handle, mat_ptr));
if (sendToFPGA)
SendToFPGA(handle, sync_send);
return mat_ptr;
}
*/
void * GetMat(const HType & handle,
bool queryFPGA = false, bool sync_get = true) {
void * ret_ptr = nullptr;
if (_hostMat.find(handle) != _hostMat.end()) {
if (queryFPGA)
GetFromFPGA(handle, sync_get);
ret_ptr = _hostMat[handle];
} else{
cout << "GetMat: Matrix not found!" << endl;
assert(0);
}
return ret_ptr;
}
void Execute() {
XTimer t;
_fpga->copyToFpga(_cl_instr_buf, false);
_fpga->execKernel(_cl_instr_buf);
memset(_instrBuf.get(), 0, PAGE_SIZE);
_instr_offset = 0;
#ifdef GEMX_PERF_DBG
cout << "Execute: " << t.elapsed() << endl;
#endif
}
void Wait(){
_fpga->wait();
}
/*
void SendToFPGA(const HType & handle, int * p_addr, int n_row, int n_col,
int ld, bool sync_send = false) {
SendToFPGA(handle,
shared_ptr<Mat >(
new Mat<DType>(n_row, n_col, ld, p_addr)), sync_send);
}
*/
void SendToFPGA(const HType & handle, void * mat_ptr, unsigned long long buf_sz,
bool sync_send = false) {
AddMat(handle, mat_ptr, buf_sz);
SendToFPGA(handle, sync_send);
}
void SendToFPGA(const HType & handle, bool sync_send = false) {
XTimer t;
assert(_hostMat.find(handle) != _hostMat.end());
//shared_ptr < Mat<T> > mat = _hostMat[handle];
if (_devHandle.find(handle) != _devHandle.end()) {
_fpga->copyToFpga(_devHandle[handle], sync_send);
} else {
_devHandle[handle] = _fpga->copyToFpga(_hostMat[handle], _hostMatSz[handle], sync_send);
}
#ifdef GEMX_PERF_DBG
cout << "SendToFPGA: " << t.elapsed() << endl;
#endif
}
void GetFromFPGA(const HType & handle, bool sync_get) {
XTimer t;
assert(_devHandle.find(handle) != _devHandle.end());
_fpga->copyFromFpga(_devHandle[handle], sync_get);
#ifdef GEMX_PERF_DBG
cout << "GetFromFPGA: " << t.elapsed() << endl;
#endif
}
protected:
void AddInstr ( kArgs * args )
{
char * instr = args->asByteArray();
char * curr_pos = &_instrBuf.get()[_instr_offset];
memcpy(curr_pos, instr, args->sizeInBytes());
_instr_offset += args->sizeInBytes();
}
static const unsigned int PAGE_SIZE = 4096;
static const unsigned int INSTR_BUF_SIZE = PAGE_SIZE;
static const unsigned int KERN_DBG_BUF_SIZE = PAGE_SIZE;
unsigned long long _ddrDeviceBaseAddr;
shared_ptr<char> _instrBuf, _kernDbgBuf;
boost::compute::buffer _cl_instr_buf, _cl_kern_dbg_buf;
unsigned int _instr_offset;
unordered_map<HType, void* > _hostMat;
unordered_map<HType, unsigned long long > _hostMatSz;
unordered_map<HType, boost::compute::buffer> _devHandle;
shared_ptr<XCL_FPGA> _fpga;
};
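// Typical host flow with GEMMHost (a sketch only; the xclbin path, kernel name, handles
// and sizes below are illustrative, and data buffers are assumed page-aligned, e.g. from
// posix_memalign):
//   gemx::GEMMHost<short*> l_host("gemx.xclbin", "gemxKernel_0");
//   l_host.SendToFPGA(l_a, l_a, l_aBytes);     // register + copy matrix A
//   l_host.SendToFPGA(l_b, l_b, l_bBytes);     // matrix B
//   l_host.SendToFPGA(l_x, l_x, l_xBytes);     // bias X
//   l_host.SendToFPGA(l_c, l_c, l_cBytes);     // output C
//   l_host.AddGEMMOp(l_a, l_b, l_c, l_x, m, k, n, postScale, postShift);
//   l_host.Execute();
//   short *l_res = (short*) l_host.GetMat(l_c, true, true);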
template<typename HType>
class FCNHost : public GEMMHost <HType>
{
public:
FCNHost() = delete;
virtual ~FCNHost(){}
FCNHost ( const FCNHost<HType>&) = delete;
FCNHost(const string & xclbin, const string & kernelName ) : GEMMHost<HType> ( xclbin, kernelName)
{
}
bool AddFCNOp ( const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift, short PReLUScale, short PReLUAlpha)
{
return AddFCNOp ( A, B, C, bias, m, k, n, k, n, n, n,postScale, postShift, PReLUScale, PReLUAlpha);
}
bool AddFCNOp ( const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, unsigned int lda, unsigned int ldb, unsigned int ldc, unsigned int ldx, int postScale, int postShift, short PReLUScale, short PReLUAlpha)
{
XTimer t;
if (this->_hostMat.find(A) == this->_hostMat.end()
|| this->_hostMat.find(B) == this->_hostMat.end()
|| this->_hostMat.find(C) == this->_hostMat.end()
|| this->_hostMat.find(bias) == this->_hostMat.end()) {
cerr << "Matrix not found!" << endl;
return false;
}
if ( m < MIN_M ){
cerr << "m dimension (" << m << ") is less than minimum supported size " << MIN_M << endl;
return false;
}
if ( !isPowerOf2(m) ){
cerr << "m dimension (" << m << ") isn't a power of 2" << endl;
return false;
}
if ( k < MIN_K ){
cerr << "k dimension (" << k << ") is less than minimum supported size " << MIN_K << endl;
return false;
}
if ( !isPowerOf2(k) ){
cerr << "k dimension (" << k << ") isn't a power of 2" << endl;
return false;
}
if ( n < MIN_N ){
cerr << "n dimension (" << n << ") is less than minimum supported size " << MIN_N << endl;
return false;
}
if ( !isPowerOf2(n) ){
cerr << "n dimension (" << n << ") isn't a power of 2" << endl;
return false;
}
if ( lda < MIN_M ){
cerr << "lda dimension (" << lda << ") is less than minimum supported size " << MIN_M << endl;
return false;
}
if ( !isPowerOf2(lda) ){
cerr << "lda dimension (" << lda << ") isn't a power of 2" << endl;
return false;
}
if ( ldb < MIN_N ){
cerr << "ldb dimension (" << ldb << ") is less than minimum supported size " << MIN_N << endl;
return false;
}
if ( !isPowerOf2(ldb) ){
cerr << "ldb dimension (" << ldb << ") isn't a power of 2" << endl;
return false;
}
if ( ldc < MIN_N ){
cerr << "ldc dimension (" << ldc << ") is less than minimum supported size " << MIN_N << endl;
return false;
}
if ( !isPowerOf2(ldc) ){
cerr << "ldc dimension (" << ldc << ") isn't a power of 2" << endl;
return false;
}
//if ( ldx != ldc )
unsigned long long A_off = 0, B_off = 0, C_off = 0, X_off = 0;
xclGetMemObjDeviceAddress(this->_devHandle[A].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &A_off);
xclGetMemObjDeviceAddress(this->_devHandle[B].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &B_off);
xclGetMemObjDeviceAddress(this->_devHandle[C].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &C_off);
if ( this->_devHandle.find(bias) != this->_devHandle.end()){
xclGetMemObjDeviceAddress(this->_devHandle[bias].get(),
boost::compute::system::default_device().get(),
sizeof(unsigned long long), &X_off);
assert(X_off > this->_ddrDeviceBaseAddr);
X_off -= this->_ddrDeviceBaseAddr;
}
//cout << "A_dev_addr: " << A_off << " B_dev_addr: " << B_off << " C_dev_addr: " << C_off << endl;
assert(A_off > this->_ddrDeviceBaseAddr);
assert(B_off > this->_ddrDeviceBaseAddr);
assert(C_off > this->_ddrDeviceBaseAddr);
A_off -= this->_ddrDeviceBaseAddr;
B_off -= this->_ddrDeviceBaseAddr;
C_off -= this->_ddrDeviceBaseAddr;
assert(A_off % this->PAGE_SIZE == 0);
assert(B_off % this->PAGE_SIZE == 0);
assert(C_off % this->PAGE_SIZE == 0);
assert(X_off % this->PAGE_SIZE == 0);
A_off /= this->PAGE_SIZE;
B_off /= this->PAGE_SIZE;
C_off /= this->PAGE_SIZE;
X_off /= this->PAGE_SIZE;
FcnArgs args(A_off, B_off, C_off, X_off, m,
k, n, lda, ldb, ldc, ldx, postScale, postShift, PReLUScale, PReLUAlpha);
this->AddInstr ( &args);
#ifdef GEMX_PERF_DBG
cout << "AddFCNOp: " << t.elapsed() << endl;
#endif
return true;
}
protected:
const int MIN_M = 256;
const int MIN_K = 256;
const int MIN_N = 32;
bool isPowerOf2( int n )
{
return ( n > 0 ) && ( (n & (n-1)) == 0 );
}
};
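// FCNHost extends GEMMHost with AddFCNOp(): the GEMM instruction plus the packed PReLU
// parameters, with extra validation that m/k/n and the leading dimensions meet the
// MIN_M/MIN_K/MIN_N floors and are powers of two.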
} // namespace gemx
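// C-linkage wrappers around FCNHost/GEMMHost<short*>, presumably so the Python front end
// in this directory can load them via ctypes.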
extern "C" {
gemx::FCNHost<short*> * MakeFCNHost(char *xclbin, char * kernName);
void DestroyFCNHost(gemx::FCNHost<short*> * ptr);
void SendToFPGAShrt(gemx::FCNHost<short*> * gh, short *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAInt(gemx::FCNHost<short*> * gh, int *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAShrt_dbg(gemx::FCNHost<short*> * gh, char * name, short *A, int m, int n, bool sync_send);
void SendToFPGAInt_dbg(gemx::FCNHost<short*> * gh, char * name, int *A, int m, int n, bool sync_send);
void SendToFPGA(gemx::FCNHost<short*> * gh, void * A, unsigned long long buf_sz, bool sync_send);
void* GetFromFPGA(gemx::FCNHost<short*> * gh, short *A, bool sync_get);
void Wait ( gemx::FCNHost<short*> * gh );
bool AddFCNOp(gemx::FCNHost<short*> * gh, void * A, void * B, void *C, void * bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift, short PReLUScale, short PReLUAlpha );
void Execute (gemx::FCNHost<short*> * gh);
gemx::GEMMHost<short*> * MakeGEMMHost(char *xclbin, char * kernName);
void DestroyGEMMHost(gemx::GEMMHost<short*> * ptr);
void SendToFPGAShrt_GEMM(gemx::GEMMHost<short*> * gh, short *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAInt_GEMM(gemx::GEMMHost<short*> * gh, int *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAShrt_dbg_GEMM(gemx::GEMMHost<short*> * gh, char * name, short *A, int m, int n, bool sync_send);
void SendToFPGAInt_dbg_GEMM(gemx::GEMMHost<short*> * gh, char * name, int *A, int m, int n, bool sync_send);
void SendToFPGA_GEMM(gemx::GEMMHost<short*> * gh, void * A, unsigned long long buf_sz, bool sync_send);
void* GetFromFPGA_GEMM(gemx::GEMMHost<short*> * gh, short *A, bool sync_get);
void Wait_GEMM ( gemx::GEMMHost<short*> * gh );
//void AddGEMMOp(gemx::FCNHost<short*> * gh, void * A, void * B, void *C, unsigned int m, unsigned int k, unsigned int n);
bool AddGEMMOp(gemx::GEMMHost<short*> * gh, void * A, void * B, void *C, void * bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift);
void Execute_GEMM (gemx::GEMMHost<short*> * gh);
}
#endif /* SRC_GEMX_HOST_H_ */