/*
 * gemx_host.h
 *
 *  Created on: Jan 20, 2018
 *      Author: xteng
 */

#ifndef SRC_GEMX_HOST_H_
#define SRC_GEMX_HOST_H_

#include <cassert>
#include <stdio.h>
#include <vector>
#include <string>
#include <fstream>
#include "CL/cl.h"
#include "CL/cl_ext.h"
#include <boost/compute.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/program.hpp>

#include <iostream>
#include <iterator>
#include <unordered_map>
#include <stdlib.h>
#include <cstring>
#include <iomanip>
#include <chrono>   // std::chrono (XTimer)
#include <cstdint>  // int32_t / int64_t (Mat)
#include <cmath>    // std::abs (Mat::cmpVal)
#include <memory>   // std::shared_ptr (GEMMHost)
#include <utility>  // std::swap (Mat::transpose)

//#define GEMX_PERF_DBG
using namespace std;

namespace gemx {

typedef enum {
    OpControl, OpGemv, OpGemm, OpTransp, OpSpmv, OpResult, OpFail, OpFcn
} OpType;

class XTimer
{
public:
    XTimer() : beg_(clock_::now()) {}
    void reset() { beg_ = clock_::now(); }
    double elapsed() const {
        return std::chrono::duration_cast<second_>(clock_::now() - beg_).count();
    }

private:
    typedef std::chrono::high_resolution_clock clock_;
    typedef std::chrono::duration<double, std::ratio<1> > second_;
    std::chrono::time_point<clock_> beg_;
};

class kArgs {
public:
    virtual ~kArgs() {
    }
    virtual size_t sizeInBytes() = 0;
    virtual char* asByteArray() = 0;
};

class SpMvArgs: public kArgs {
public:
    virtual ~SpMvArgs() {
    }
};

//////////////////////////// GEMM ////////////////////////////
class GemmArgs: public kArgs {
public:
    virtual ~GemmArgs() {
    }
    GemmArgs() = delete;
    GemmArgs(unsigned int p_Aoffset, unsigned int p_Boffset,
            unsigned int p_Coffset, unsigned int p_Xoffset, unsigned int p_M, unsigned int p_K,
            unsigned int p_N, unsigned int p_Lda, unsigned int p_Ldb,
            unsigned int p_Ldc, unsigned int p_Ldx, int post_scale, int post_shift) :
            m_gemm_args( { int(OpGemm), p_Aoffset, p_Boffset, p_Coffset, p_Xoffset, p_M, p_K,
                    p_N, p_Lda, p_Ldb, p_Ldc, p_Ldx, 0, 0, 0, 0 }) {
        m_gemm_args.m_postScaleVal = (post_scale << 8) | (post_shift & 0x000000ff);
    }
    size_t sizeInBytes() {
        return sizeof(m_gemm_args);
    }
    char *asByteArray() {
        return reinterpret_cast<char*>(&m_gemm_args);
    }

protected:
    struct {
        int m_optype;
        unsigned int m_Aoffset, m_Boffset, m_Coffset, m_Xoffset, m_M, m_K, m_N,
                m_Lda, m_Ldb, m_Ldc, m_Ldx;
        int m_postScaleVal;
        int dummy[3];
    } m_gemm_args;
};
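
// Note on the packed post-scale word (based on how Mat::multiplyAddScale() below
// consumes it): the low byte holds the right-shift amount and the upper bits hold
// the multiplier, i.e. result = (acc >> (val & 0xff)) * (val >> 8).
// For example, post_scale = 3 and post_shift = 10 pack to (3 << 8) | 10 = 0x30A.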

class FcnArgs: public kArgs {
public:
    virtual ~FcnArgs() {
    }
    FcnArgs() = delete;
    FcnArgs(unsigned int p_Aoffset, unsigned int p_Boffset,
            unsigned int p_Coffset, unsigned int p_Xoffset, unsigned int p_M, unsigned int p_K,
            unsigned int p_N, unsigned int p_Lda, unsigned int p_Ldb,
            unsigned int p_Ldc, unsigned int p_Ldx, int post_scale, int post_shift, short prelu_scale, short prelu_alpha) :
            m_fcn_args( { OpFcn, p_Aoffset, p_Boffset, p_Coffset, p_Xoffset, p_M, p_K,
                    p_N, p_Lda, p_Ldb, p_Ldc, p_Ldx, 0, 0, 0, 0 }) {

        m_fcn_args.m_postScaleVal = (post_scale << 8) | (post_shift & 0x000000ff);
        m_fcn_args.m_PReLUVal = (prelu_scale << 6) | (prelu_alpha & 0x003f);

        //std::cout << "s_dummy: " << m_fcn_args.s_dummy << std::endl;
        //printf ("PReLUVal: %d\n", m_fcn_args.m_PReLUVal);
        //std::stringstream stream;
        //std::cout << "optype: " << optype << " p_Aoffset: " << p_Aoffset << std::endl;
        /*
        int * data = (int*)asByteArray();
        for (int i = 0; i < sizeInBytes()/4; i++){
            std::cout << "word " << i << ": " << data[i] << std::endl;
        }
        */
        //std::string result( stream.str() );
        //std::cout << "Hex: " << result << std::endl;
    }
    size_t sizeInBytes() {
        return sizeof(m_fcn_args);
    }
    char *asByteArray() {
        return reinterpret_cast<char*>(&m_fcn_args);
    }

protected:
    struct {
        int m_optype;
        unsigned int m_Aoffset, m_Boffset, m_Coffset, m_Xoffset, m_M, m_K, m_N,
                m_Lda, m_Ldb, m_Ldc, m_Ldx;
        int m_postScaleVal;
        short m_PReLUVal;
        short s_dummy;
        int dummy[2];
    } m_fcn_args;
};
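
// Note on the packed PReLU word (based on how Mat::matMultWithScaleAndPRelu() below
// consumes it): the low 6 bits hold a right-shift amount and the upper bits hold the
// scale, i.e. a negative entry becomes roughly (entry >> (val & 0x3f)) * (val >> 6).
// prelu_scale = 1 with prelu_alpha = 0 therefore leaves negative values unchanged,
// while prelu_scale = 0 zeroes them (plain ReLU).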

// Matrix descriptor with data itself stored in caller's space
template<typename T>
class Mat {
private:
    unsigned int m_Rows, m_Cols, m_Ld, m_buf_sz;
    bool m_ownmem;
    T *m_Addr;
public:
    const static size_t GEMX_CMP_WIDTH = 11;
    Mat() = delete;
    ~Mat() {
        if (m_ownmem && m_Addr) {
            free(m_Addr);
        }
    }
    Mat(unsigned int p_Rows, unsigned int p_Cols, unsigned int p_Ld) :
            m_Rows(p_Rows), m_Cols(p_Cols), m_Ld(p_Ld), m_ownmem(true) {
        m_buf_sz = sizeof(T) * p_Rows * p_Ld;
        posix_memalign((void**) &m_Addr, 4096, m_buf_sz);
        //m_Addr = (T*)aligned_alloc ( 4096, sizeof(T) * p_Rows * p_Cols);
    }

    Mat(unsigned int p_Rows, unsigned int p_Cols, unsigned int p_Ld, T *p_Addr) :
            m_Rows(p_Rows), m_Cols(p_Cols), m_Ld(p_Ld), m_ownmem(false), m_Addr(p_Addr) {
        m_buf_sz = sizeof(T) * p_Rows * p_Ld;
    }

    Mat& operator=(const Mat& p_Src) {
        assert(p_Src.m_Rows == m_Rows);
        assert(p_Src.m_Cols == m_Cols);
        for (unsigned int row = 0; row < m_Rows; ++row) {
            for (unsigned int col = 0; col < m_Ld; ++col) {
                // data is stored flat with leading dimension m_Ld
                m_Addr[row * m_Ld + col] = p_Src.m_Addr[row * p_Src.m_Ld + col];
            }
        }
        return *this;
    }

    unsigned int buf_sz() {
        return m_buf_sz;
    }
    T*& data() {
        return m_Addr;
    }

    inline T &getVal(unsigned int p_Row, unsigned int p_Col) {
        return m_Addr[p_Row * ld() + p_Col];
    }
    inline unsigned int rows() {
        return m_Rows;
    }
    inline unsigned int cols() {
        return m_Cols;
    }
    inline unsigned int ld() {
        return m_Ld;
    }

    void init(unsigned int p_Rows, unsigned int p_Cols, unsigned int p_Ld,
            T *p_Addr) {
        m_Rows = p_Rows;
        m_Cols = p_Cols;
        m_Ld = p_Ld;
        m_Addr = p_Addr;
    }

    void fillModRange(T p_Min, T p_Max) {
        T l_val = p_Min;
        for (unsigned int row = 0; row < m_Rows; ++row) {
            for (unsigned int col = 0; col < ld(); ++col) {
                getVal(row, col) = l_val++;
                if ( l_val > p_Max ) l_val = p_Min;
            }
        }
    }

    void fillMod(T p_Max, T p_First = 0) {
        T l_val = p_First;
        for (unsigned int row = 0; row < m_Rows; ++row) {
            for (unsigned int col = 0; col < ld(); ++col) {
                getVal(row, col) = l_val;
                l_val++;
                l_val %= p_Max;
            }
        }
    }

    void multiply(Mat & p_A, Mat & p_B) {
        assert(p_A.rows() == rows());
        assert(p_A.cols() == p_B.rows());
        assert(p_B.cols() == cols());
        for (unsigned int row = 0; row < rows(); ++row) {
            for (unsigned int col = 0; col < cols(); ++col) {
                T l_val = 0;
                for (unsigned int k = 0; k < p_A.cols(); ++k) {
                    l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
                }
                //cout << " DEBUG multiply setting row=" << row << " col=" << col << endl;
                getVal(row, col) = l_val;
            }
        }
    }

    void multiplyAddScale(Mat & p_A, Mat & p_B, Mat<int> & p_X, int32_t p_postScale) {
        assert(p_A.rows() == rows());
        assert(p_A.cols() == p_B.rows());
        assert(p_B.cols() == cols());
        assert(p_X.rows() == rows());
        assert(p_X.cols() == cols());
        for (unsigned int row = 0; row < rows(); ++row) {
            for (unsigned int col = 0; col < cols(); ++col) {
                int64_t l_val = 0;
                for (unsigned int k = 0; k < p_A.cols(); ++k) {
                    l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
                }
                l_val += p_X.getVal(row, col);
                l_val = (l_val >> (p_postScale & 0x00ff)) * (p_postScale >> 8);
                T l_entry = (T)(l_val);
                getVal(row, col) = l_entry;
            }
        }
    }

    void matMultWithScaleAndPRelu(Mat & p_A, Mat & p_B, Mat<int> & p_X, int32_t p_postScale, int16_t p_PReluVal) {
        cout << "A rows: " << p_A.rows() << " this rows: " << rows() << endl;
        assert(p_A.rows() == rows());
        assert(p_A.cols() == p_B.rows());
        assert(p_B.cols() == cols());
        assert(p_X.rows() == rows());
        assert(p_X.cols() == cols());
        for (unsigned int row = 0; row < rows(); ++row) {
            for (unsigned int col = 0; col < cols(); ++col) {
                int64_t l_val = 0;
                for (unsigned int k = 0; k < p_A.cols(); ++k) {
                    l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
                    // if ((row==2) && (col == 0)) {
                    //     if (p_B.getVal(k, col) != 0) {
                    //         std::cout << " A[2," << std::dec << k << "]= " << p_A.getVal(row, k) << std::hex << " 0x" << p_A.getVal(row, k);
                    //         std::cout << " B[" << std::dec << k << ",0]= " << p_B.getVal(k,col) << std::hex << " 0x" << p_B.getVal(k,col);
                    //         std::cout << " A*B+C = " << std::dec << l_val << std::hex << " 0x" << l_val << "\n";
                    //     }
                    // }
                }

                // if ((row == 2) && (col == 0)) {
                //     std::bitset<64> l_bVal{l_val};
                //     std::cout << "C[2,0]= " << l_bVal << "\n";
                // }
                l_val += p_X.getVal(row,col);
                unsigned int l_psShift = p_postScale & 0x00ff;
                unsigned int l_psVal = p_postScale >> 8;
                l_val = (l_val >> l_psShift) * l_psVal;
                T l_entry = (T)(l_val);
                if (l_entry < 0) {
                    l_entry = (l_entry >> (p_PReluVal & 0x003f)) * (T)(p_PReluVal >> 6);
                }
                getVal(row, col) = l_entry;
            }
        }
    }
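
    // The overload below takes a scalar bias instead of a bias matrix and, judging
    // from the 0x8000 / 0x7fff constants, additionally saturates the post-scaled
    // accumulator to the 16-bit range before the PReLU step.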

    void matMultWithScaleAndPRelu(Mat & p_A, Mat & p_B, int32_t p_bias, int32_t p_postScale, int16_t p_PReluVal) {
        assert(p_A.rows() == rows());
        assert(p_A.cols() == p_B.rows());
        assert(p_B.cols() == cols());
        for (unsigned int row = 0; row < rows(); ++row) {
            for (unsigned int col = 0; col < cols(); ++col) {
                int64_t l_val = 0;
                for (unsigned int k = 0; k < p_A.cols(); ++k) {
                    l_val += p_A.getVal(row, k) * p_B.getVal(k, col);
                }

                l_val += p_bias;
                l_val = (l_val >> (p_postScale & 0x00ff)) * (p_postScale >> 8);
                //handle saturation
                if (l_val & 0x100000000) { //negative number
                    if ((l_val & 0x0ffff0000) != 0xffff0000) { //underflow
                        l_val = 0x8000;
                    }
                } else {
                    if ((l_val & 0x0ffff0000) != 0) {
                        l_val = 0x7fff;
                    }
                }
                T l_entry = (T)(l_val);
                if (l_entry < 0) {
                    l_entry = (l_entry * (T)(p_PReluVal >> 6)) >> (p_PReluVal & 0x003f);
                }
                getVal(row, col) = l_entry;
            }
        }
    }

    void multiplyGf(Mat & p_A, Mat & p_B, unsigned int p_EdgeWidth) {
        assert(p_A.rows() == rows());
        assert(p_A.cols() == p_B.rows());
        assert(p_B.cols() == cols());
        cout << " DEBUG multiplyGf rows=" << rows() << " cols=" << cols() << "\n";
        for (unsigned int rowBlock = 0; rowBlock < rows() / p_EdgeWidth; ++rowBlock) {
            for (unsigned int colBlock = 0; colBlock < cols() / p_EdgeWidth; ++colBlock) {
                for (unsigned int row = 0; row < rows(); ++row) {
                    for (unsigned int col = 0; col < cols(); ++col) {
                        T l_val = 0;
                        for (unsigned int k = 0; k < p_A.cols(); ++k) {
                            l_val += p_A.getVal(k + rowBlock * p_EdgeWidth,
                                    col + colBlock * p_EdgeWidth)
                                    * p_B.getVal(k + rowBlock * p_EdgeWidth,
                                            col + colBlock * p_EdgeWidth);
                        }
                        getVal(row + rowBlock * p_EdgeWidth,
                                col + colBlock * p_EdgeWidth) = l_val;
                        // requires an operator<< for Mat, which this header does not declare
                        //cout << "DEBUG multiplyGf after k-loop " << *this << "\n";
                    }
                }
            }
        }
    }

    // Matrix A is in GvA format (also dimensions are wider and shorter)
    // The p_rowEdgeWidth just indicates the compute array intake edge to allow for matrix dimension adjustment
    void multiplyGemvGf(Mat & p_A, Mat & p_B, unsigned int p_rowEdgeWidth) {
        assert(p_A.rows() * p_rowEdgeWidth == rows());
        assert(p_A.cols() == p_B.rows() * p_rowEdgeWidth);
        assert(p_B.cols() == cols());
        cout << " DEBUG multiplyGvA format rows=" << rows() << " cols=" << cols() << "\n";
        // Rows here are mblocks, cols are within the mblock
        for (unsigned int row = 0; row < p_A.rows(); ++row) { // A is already in block format
            for (unsigned int col = 0; col < p_A.cols(); ++col) {
                unsigned int k = col / p_rowEdgeWidth;
                unsigned int w = col % p_rowEdgeWidth;
                T l_a = p_A.getVal(row, col);
                T l_b = p_B.getVal(k, 0);
                getVal(w + row * p_rowEdgeWidth, 0) += l_a * l_b;
                //cout << " += a * b = " << l_a << " * " << l_b << "\n";
            }
            //cout << " DEBUG multiplyGemvGf after k-loop " << *this << "\n";
        }
    }

#if 0
    void multiplySpmv(SpMat<T, TspD, Tsp> & p_A, Mat & p_B) {
        T l_val = 0;
        assert(p_A.rows() == rows());
        assert(p_A.cols() == p_B.rows());
        assert(p_B.cols() == cols());
        vector<MtxRow> l_rows = p_A.getNnzVector();
        for (MtxRow &l_row : l_rows) {
            unsigned int row = l_row.getRow(),
                         col = l_row.getCol();
            double l_val = l_row.getVal();
            getVal(row, 0) += l_val * p_B.getVal(col, 0);
            //cout << "DEBUG multiplySpmv row=" << row << " col=" << col << " "
            //     << l_val << " * " << p_B.getVal(col, 0)
            //     << " was added to " << getVal(row, 0) << "\n";
        }
    }
#endif

    void transpose(Mat & p_A) {
        for (unsigned int row = 0; row < rows(); ++row) {
            for (unsigned int col = 0; col < cols(); ++col) {
                getVal(row, col) = p_A.getVal(col, row);
            }
        }
        swap(m_Rows, m_Cols);
    }
    void transposeGva(Mat & p_A, unsigned int p_rowEdgeWidth,
            unsigned int p_colEdgeWidth) {
        unsigned int l_pos = 0;
        for (unsigned int rowBlock = 0; rowBlock < p_A.rows() / p_rowEdgeWidth;
                ++rowBlock) {
            for (unsigned int colBlock = 0;
                    colBlock < p_A.cols() / p_colEdgeWidth; ++colBlock) {
                for (unsigned int col = 0; col < p_colEdgeWidth; ++col) {
                    for (unsigned int row = 0; row < p_rowEdgeWidth; ++row) {
                        getVal(l_pos / cols(), l_pos % cols()) = p_A.getVal(
                                row + rowBlock * p_rowEdgeWidth,
                                col + colBlock * p_colEdgeWidth);
                        l_pos++;
                    }
                    //cout << "DEBUG transposeGva step " << *this << "\n";
                }
            }
        }
        swap(m_Rows, m_Cols);
    }

    void print(ostream& os) {
        os << m_Rows << "x" << m_Cols << " Ld=" << m_Ld << "\n";
        unsigned int l_cols = cols(); // print the normal matrix; use ld() to print the parent matrix (within Ld)
        for (unsigned int row = 0; row < rows(); ++row) {
            for (unsigned int col = 0; col < l_cols; ++col) {
                os << int(getVal(row, col)) << " ";
            }
            os << "\n";
        }
    }

    bool cmp(float p_TolRel, float p_TolAbs, Mat &p_Ref) {
        bool ok = true;
        unsigned int l_verbose = 1; // 0 none, 1 if not exactly equal, 2 if passed tolerance, 3 show all
        unsigned int l_numExactMatches = 0, l_numMismatches = 0;
        for (unsigned int row = 0; row < rows(); ++row) {
            for (unsigned int col = 0; col < cols(); ++col) {
                string l_Prefix = " row " + to_string(row) + " col "
                        + to_string(col);
                T v = getVal(row, col);
                T vRef = p_Ref.getVal(row, col);
                bool l_exactMatch = false;
                bool l_ok = cmpVal(p_TolRel, p_TolAbs, vRef, v, l_Prefix,
                        l_exactMatch, l_verbose);
                ok = ok && l_ok;
                if (l_exactMatch) {
                    l_numExactMatches++;
                }
                if (!l_ok) {
                    l_numMismatches++;
                }
            }
        }
        unsigned int l_total = rows() * cols();
        unsigned int l_withinTolerance = l_total - l_numExactMatches - l_numMismatches;
        cout << " Compared " << l_total << " values:" << " exact match "
                << l_numExactMatches << " within tolerance "
                << l_withinTolerance << " mismatch " << l_numMismatches << "\n";
        return (ok);
    }

    bool cmpVal(float p_TolRel, float p_TolAbs, T vRef, T v,
            std::string p_Prefix, bool &p_exactMatch, unsigned int p_Verbose) {
        float l_diffAbs = abs(v - vRef);
        float l_diffRel = l_diffAbs;
        if (vRef != 0) {
            l_diffRel /= abs(vRef);
        }
        p_exactMatch = (vRef == v);
        bool l_status = p_exactMatch || (l_diffRel <= p_TolRel)
                || (l_diffAbs <= p_TolAbs);
        if ((p_Verbose >= 3) || ((p_Verbose >= 2) && !p_exactMatch)
                || ((p_Verbose >= 1) && !l_status)) {
            std::cout << p_Prefix << " ValRef " << std::left
                    << std::setw(GEMX_CMP_WIDTH) << vRef << " Val " << std::left
                    << std::setw(GEMX_CMP_WIDTH) << v << " DifRel "
                    << std::left << std::setw(GEMX_CMP_WIDTH) << l_diffRel
                    << " DifAbs " << std::left << std::setw(GEMX_CMP_WIDTH)
                    << l_diffAbs << " Status " << l_status << "\n";
        }
        return (l_status);
    }

};
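
// Illustrative only: a minimal sketch (not part of the original API) of how Mat can
// build a golden reference for a post-scaled GEMM and compare it against a result
// computed elsewhere, e.g. by the FPGA kernel. The buffers, sizes and tolerances in
// this sketch are placeholders.
#if 0
inline void exampleGoldenGemmCheck(short *a, short *b, int *bias, short *c,
        unsigned int m, unsigned int k, unsigned int n, int32_t postScaleVal) {
    Mat<short> A(m, k, k, a), B(k, n, n, b), C(m, n, n, c);
    Mat<int>   X(m, n, n, bias);
    Mat<short> Cref(m, n, n);                       // owns its page-aligned buffer
    Cref.multiplyAddScale(A, B, X, postScaleVal);   // (A*B + X), then >> shift, * scale
    C.cmp(0.0f, 0.0f, Cref);                        // compare result against the reference
}
#endif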

//Base address will be the instruction memory region
class XCL_FPGA {
public:
    XCL_FPGA() = delete;
    XCL_FPGA(const string & xclbin, const string & kernelName) {
        loadXclbin(xclbin, kernelName);
    }

    ~XCL_FPGA() {
    }

    void loadXclbin(string p_XclbinFile, string p_KernelName) {
        // https://gitenterprise.xilinx.com/rkeryell/heterogeneous_examples/blob/master/vector_add/SDAccel-Boost.Compute/vector_add.cpp

        // Create the OpenCL context to attach resources on the device
        m_Context = move(boost::compute::system::default_context());
        // Create the OpenCL command queue to control the device
        //m_CommandQueue = move(boost::compute::system::default_queue());
        //boost::compute::command_queue queue(boost::compute::system::default_context(), boost::compute::system::default_device(), CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE);
        boost::compute::command_queue queue(
                boost::compute::system::default_context(),
                boost::compute::system::default_device(),
                CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
        m_CommandQueue = move(queue);
        // Construct an OpenCL program from the precompiled kernel file
        m_Program = move(
                boost::compute::program::create_with_binary_file(p_XclbinFile,
                        m_Context));
        m_Program.build();

        m_Kernel = move(boost::compute::kernel(m_Program, p_KernelName));
    }

    boost::compute::buffer createBuf(void *ptr, size_t sz_bytes) {
        //decltype of cl_mem_ext_ptr_t.flags
        //unsigned l_k2bank[] = {GEMX_fpgaDdrBanks};
        //std::cout << "l_k2bank: " << l_k2bank[0] << std::endl;

        cl_mem_ext_ptr_t l_bufExt;
        //l_bufExt.obj = NULL;
        l_bufExt.param = 0;
        l_bufExt.flags = GEMX_fpgaDdrBanks;
        l_bufExt.obj = ptr;
        // Buffers
        return boost::compute::buffer(m_Context, sz_bytes,
                CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_EXT_PTR_XILINX,
                &l_bufExt);
    }

    bool copyToFpga(const boost::compute::buffer & buf, bool sync_send) {
        boost::compute::event l_event;
        //cout << "copyToFPGA" << endl;
        // Send the input data to the accelerator
        l_event = m_CommandQueue.enqueue_migrate_memory_objects(1, &(buf.get()), 0);

        if (sync_send) {
            l_event.wait();
        } else {
            m_waitInput.insert(l_event);
        }
        return true;
    }

    ///////////////////////////////////////////////////////////////////////////
    boost::compute::buffer copyToFpga(void * buf, size_t sz_bytes,
            bool sync_send = false) {
        boost::compute::buffer cl_buf = createBuf(buf, sz_bytes);
        copyToFpga(cl_buf, sync_send);
        return cl_buf;
    }

    void copyFromFpga(const boost::compute::buffer & buf, bool sync_exec = true) {
        //cout << "copyFromFPGA" << endl;
        XTimer t;
        boost::compute::event l_readEvents =
                m_CommandQueue.enqueue_migrate_memory_objects(1, &(buf.get()),
                        CL_MIGRATE_MEM_OBJECT_HOST, m_waitOutput);
        //l_readEvents.wait();
        if ( sync_exec ) {
            l_readEvents.wait();
            m_waitOutput.clear();
        } else {
            m_waitOutput.insert(l_readEvents);
        }
#ifdef GEMX_PERF_DBG
        cout << "copyFromFpga: " << t.elapsed() << endl;
#endif
    }

    void execKernel(const boost::compute::buffer & instr_buf, bool sync_exec = false) {
        boost::compute::extents<1> offset { 0 };
        boost::compute::extents<1> global { 1 };
        // Use only 1 CU
        boost::compute::extents<1> local { 1 };
        // Launch kernels
        m_Kernel.set_args(instr_buf, instr_buf);

        XTimer t;
        boost::compute::event l_event = m_CommandQueue.enqueue_nd_range_kernel(
                m_Kernel, offset, global, local, m_waitInput);

        if ( 1 ) {
            l_event.wait();
        } else {
            m_waitOutput.insert(l_event);
        }
        m_waitInput.clear();

#ifdef GEMX_PERF_DBG
        cout << "execKernel: " << t.elapsed() << endl;
#endif
    }

    void wait()
    {
        for (size_t i = 0; i < m_waitOutput.size(); i++) {
            m_waitOutput[i].wait();
        }
        m_waitInput.clear();
        m_waitOutput.clear();
    }

private:
    boost::compute::program m_Program;
    boost::compute::kernel m_Kernel;
    boost::compute::context m_Context;
    boost::compute::command_queue m_CommandQueue;
    boost::compute::wait_list m_waitInput, m_waitOutput;
};
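
// Data-movement summary for XCL_FPGA (as implemented above): createBuf() wraps a
// page-aligned host pointer with CL_MEM_USE_HOST_PTR | CL_MEM_EXT_PTR_XILINX so the
// buffer lands in the DDR bank selected by GEMX_fpgaDdrBanks (expected to be defined
// by the build), copyToFpga()/copyFromFpga() move it with
// enqueue_migrate_memory_objects, and the wait_lists chain asynchronous transfers
// to the kernel launch issued on the out-of-order queue.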

template<typename HType>
class GEMMHost {
public:
    GEMMHost() = delete;
    ~GEMMHost() {
    }
    GEMMHost(const GEMMHost<HType> &) = delete;
    GEMMHost(const string & xclbin, const string & kernelName) {
        _fpga = shared_ptr<XCL_FPGA>(new XCL_FPGA(xclbin, kernelName));
        void *aligned_mem = nullptr;
        int l_rc = posix_memalign(&aligned_mem, PAGE_SIZE, INSTR_BUF_SIZE);
        assert(l_rc == 0);
        // buffers come from posix_memalign, so release them with free()
        _instrBuf = shared_ptr<char>((char*) aligned_mem, free);
        memset(_instrBuf.get(), 0, INSTR_BUF_SIZE);
        _instr_offset = 0;
        _cl_instr_buf = _fpga->copyToFpga(_instrBuf.get(), INSTR_BUF_SIZE, true);
        xclGetMemObjDeviceAddress(_cl_instr_buf.get(),
                boost::compute::system::default_device().get(),
                sizeof(unsigned long long), &_ddrDeviceBaseAddr);

        l_rc = posix_memalign(&aligned_mem, PAGE_SIZE, KERN_DBG_BUF_SIZE);
        assert(l_rc == 0);
        _kernDbgBuf = shared_ptr<char>((char*) aligned_mem, free);
        _cl_kern_dbg_buf = _fpga->copyToFpga(_kernDbgBuf.get(),
                KERN_DBG_BUF_SIZE, true);
    }

    bool AddGEMMOp(const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift) {
        return AddGEMMOp (A, B, C, bias, m, k, n, k, n, n, n, postScale, postShift);
    }

    bool AddGEMMOp(const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, unsigned int lda, unsigned int ldb, unsigned int ldc, unsigned int ldx, int postScale, int postShift) {
        XTimer t;
        if (_hostMat.find(A) == _hostMat.end()
                || _hostMat.find(B) == _hostMat.end()
                || _hostMat.find(C) == _hostMat.end()
                || _hostMat.find(bias) == _hostMat.end()) {
            cerr << "Matrix not found!" << endl;
            return false;
        }
        unsigned long long A_off = 0, B_off = 0, C_off = 0, X_off = 0;

        xclGetMemObjDeviceAddress(_devHandle[A].get(),
                boost::compute::system::default_device().get(),
                sizeof(unsigned long long), &A_off);
        xclGetMemObjDeviceAddress(_devHandle[B].get(),
                boost::compute::system::default_device().get(),
                sizeof(unsigned long long), &B_off);
        xclGetMemObjDeviceAddress(_devHandle[C].get(),
                boost::compute::system::default_device().get(),
                sizeof(unsigned long long), &C_off);

        if ( _devHandle.find(bias) != _devHandle.end()) {
            xclGetMemObjDeviceAddress(_devHandle[bias].get(),
                    boost::compute::system::default_device().get(),
                    sizeof(unsigned long long), &X_off);
            assert(X_off > _ddrDeviceBaseAddr);
            X_off -= _ddrDeviceBaseAddr;
        }

        // cout << "A_dev_addr: " << A_off << " B_dev_addr: " << B_off << " C_dev_addr: " << C_off << " X_dev_addr: " << X_off << endl;
        assert(A_off > _ddrDeviceBaseAddr);
        assert(B_off > _ddrDeviceBaseAddr);
        assert(C_off > _ddrDeviceBaseAddr);
        A_off -= _ddrDeviceBaseAddr;
        B_off -= _ddrDeviceBaseAddr;
        C_off -= _ddrDeviceBaseAddr;

        assert(A_off % PAGE_SIZE == 0);
        assert(B_off % PAGE_SIZE == 0);
        assert(C_off % PAGE_SIZE == 0);
        assert(X_off % PAGE_SIZE == 0);

        A_off /= PAGE_SIZE;
        B_off /= PAGE_SIZE;
        C_off /= PAGE_SIZE;
        X_off /= PAGE_SIZE;

        GemmArgs gargs(A_off, B_off, C_off, X_off, m,
                k, n, lda, ldb, ldc, ldx, postScale, postShift);
        AddInstr ( &gargs);
        return true;
    }

    bool AddMat(const HType & handle, void * mat_ptr, unsigned long long buf_sz) {
        if (_hostMat.find(handle) == _hostMat.end()) {
            _hostMat[handle] = mat_ptr;
            _hostMatSz[handle] = buf_sz;
            return true;
        }
        else if (_hostMatSz[handle] != buf_sz ) {
            _hostMat[handle] = mat_ptr;
            _hostMatSz[handle] = buf_sz;
            _devHandle.erase(handle);
            cout << "Erasing devhandle!" << endl;
            return true;
        }
        //cout << "Matrix " << handle << " already added!" << endl;
        return false;
    }

    /*
    bool AddMat(const HType & handle, short *p_Addr, unsigned int p_Rows,
            unsigned int p_Cols, unsigned int p_Ld) {
        return AddMat(handle,
                shared_ptr<ShortMat>(
                        new ShortMat(p_Rows, p_Cols, p_Ld, p_Addr)));
    }

    bool AddMat(const HType & handle, int *p_Addr, unsigned int p_Rows,
            unsigned int p_Cols, unsigned int p_Ld) {
        return AddMat(handle,
                shared_ptr<IntMat>(
                        new IntMat(p_Rows, p_Cols, p_Ld, p_Addr)));
    }
    */

    /*
    shared_ptr<BaseMat> AllocMat(const HType & handle, size_t n_rows,
            size_t n_cols, size_t ld, bool sendToFPGA = false, bool sync_send =
                    false) {
        if (_hostMat.find(handle) != _hostMat.end()) {
            cout << "Matrix " << handle << " already created!" << endl;
            return _hostMat[handle];
        }
        cout << "Matrix " << handle << " " << n_rows << " " << n_cols << " "
                << ld << endl;
        void* aligned_buffer = nullptr;
        // assert ( !posix_memalign ( &aligned_buffer, PAGE_SIZE, sizeof(T) * n_rows * n_cols ));
        assert(
                !posix_memalign(&aligned_buffer, PAGE_SIZE,
                        * n_rows * ld));
        shared_ptr<Mat<DType> > mat_ptr = shared_ptr<Mat<DType> >(
                new Mat<DType>(n_rows, n_cols, ld, (DType*) aligned_buffer));
        assert(AddMat(handle, mat_ptr));

        if (sendToFPGA)
            SendToFPGA(handle, sync_send);
        return mat_ptr;
    }
    */

    void * GetMat(const HType & handle,
            bool queryFPGA = false, bool sync_get = true) {
        void * ret_ptr = nullptr;
        if (_hostMat.find(handle) != _hostMat.end()) {
            if (queryFPGA)
                GetFromFPGA(handle, sync_get);
            ret_ptr = _hostMat[handle];
        } else {
            cout << "GetMat: Matrix not found!" << endl;
            assert(0);
        }
        return ret_ptr;
    }

    void Execute() {
        XTimer t;
        _fpga->copyToFpga(_cl_instr_buf, false);
        _fpga->execKernel(_cl_instr_buf);
        memset(_instrBuf.get(), 0, PAGE_SIZE);
        _instr_offset = 0;
#ifdef GEMX_PERF_DBG
        cout << "Execute: " << t.elapsed() << endl;
#endif
    }

    void Wait() {
        _fpga->wait();
    }

    /*
    void SendToFPGA(const HType & handle, int * p_addr, int n_row, int n_col,
            int ld, bool sync_send = false) {
        SendToFPGA(handle,
                shared_ptr<Mat >(
                        new Mat<DType>(n_row, n_col, ld, p_addr)), sync_send);
    }
    */

    void SendToFPGA(const HType & handle, void * mat_ptr, unsigned long long buf_sz,
            bool sync_send = false) {
        AddMat(handle, mat_ptr, buf_sz);
        SendToFPGA(handle, sync_send);
    }

    void SendToFPGA(const HType & handle, bool sync_send = false) {
        XTimer t;
        assert(_hostMat.find(handle) != _hostMat.end());

        //shared_ptr < Mat<T> > mat = _hostMat[handle];
        if (_devHandle.find(handle) != _devHandle.end()) {
            _fpga->copyToFpga(_devHandle[handle], sync_send);
        } else {
            _devHandle[handle] = _fpga->copyToFpga(_hostMat[handle], _hostMatSz[handle], sync_send);
        }
#ifdef GEMX_PERF_DBG
        cout << "SendToFPGA: " << t.elapsed() << endl;
#endif
    }

    void GetFromFPGA(const HType & handle, bool sync_get) {
        XTimer t;
        assert(_devHandle.find(handle) != _devHandle.end());
        _fpga->copyFromFpga(_devHandle[handle], sync_get);
#ifdef GEMX_PERF_DBG
        cout << "GetFromFPGA: " << t.elapsed() << endl;
#endif
    }

protected:

    void AddInstr ( kArgs * args )
    {
        char * instr = args->asByteArray();
        char * curr_pos = &_instrBuf.get()[_instr_offset];
        memcpy(curr_pos, instr, args->sizeInBytes());
        _instr_offset += args->sizeInBytes();
    }

    static const unsigned int PAGE_SIZE = 4096;
    static const unsigned int INSTR_BUF_SIZE = PAGE_SIZE;
    static const unsigned int KERN_DBG_BUF_SIZE = PAGE_SIZE;

    unsigned long long _ddrDeviceBaseAddr;
    shared_ptr<char> _instrBuf, _kernDbgBuf;
    boost::compute::buffer _cl_instr_buf, _cl_kern_dbg_buf;
    unsigned int _instr_offset;
    unordered_map<HType, void* > _hostMat;
    unordered_map<HType, unsigned long long > _hostMatSz;
    unordered_map<HType, boost::compute::buffer> _devHandle;
    shared_ptr<XCL_FPGA> _fpga;
};
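
// Illustrative only: a minimal sketch (not part of the original API) of the host-side
// flow this class implements -- register page-aligned host buffers, queue a GEMM
// instruction, run the kernel, then read back the result. The xclbin path, kernel
// name, handles and sizes below are placeholders.
#if 0
inline void exampleGemmHostFlow(short *A, short *B, short *C, int *X,
        unsigned int m, unsigned int k, unsigned int n) {
    GEMMHost<short*> l_host("gemx.xclbin", "gemxKernel_0");
    l_host.SendToFPGA(A, A, sizeof(short) * m * k);   // handle, host pointer, bytes
    l_host.SendToFPGA(B, B, sizeof(short) * k * n);
    l_host.SendToFPGA(C, C, sizeof(short) * m * n);
    l_host.SendToFPGA((short*) X, X, sizeof(int) * m * n);
    l_host.AddGEMMOp(A, B, C, (short*) X, m, k, n, /*postScale*/ 1, /*postShift*/ 0);
    l_host.Execute();
    l_host.GetMat(C, true, true);                     // migrate C back to the host buffer
    l_host.Wait();
}
#endif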

template<typename HType>
class FCNHost : public GEMMHost <HType>
{
public:
    FCNHost() = delete;
    virtual ~FCNHost() {}
    FCNHost ( const FCNHost<HType>&) = delete;
    FCNHost(const string & xclbin, const string & kernelName ) : GEMMHost<HType> ( xclbin, kernelName)
    {
    }

    bool AddFCNOp ( const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift, short PReLUScale, short PReLUAlpha)
    {
        return AddFCNOp ( A, B, C, bias, m, k, n, k, n, n, n, postScale, postShift, PReLUScale, PReLUAlpha);
    }

    bool AddFCNOp ( const HType & A, const HType & B, const HType &C, const HType & bias, unsigned int m, unsigned int k, unsigned int n, unsigned int lda, unsigned int ldb, unsigned int ldc, unsigned int ldx, int postScale, int postShift, short PReLUScale, short PReLUAlpha)
    {
        XTimer t;
        if (this->_hostMat.find(A) == this->_hostMat.end()
                || this->_hostMat.find(B) == this->_hostMat.end()
                || this->_hostMat.find(C) == this->_hostMat.end()
                || this->_hostMat.find(bias) == this->_hostMat.end()) {
            cerr << "Matrix not found!" << endl;
            return false;
        }

        if ( m < MIN_M ) {
            cerr << "m dimension (" << m << ") is less than minimum supported size " << MIN_M << endl;
            return false;
        }
        if ( !isPowerOf2(m) ) {
            cerr << "m dimension (" << m << ") isn't a power of 2" << endl;
            return false;
        }

        if ( k < MIN_K ) {
            cerr << "k dimension (" << k << ") is less than minimum supported size " << MIN_K << endl;
            return false;
        }
        if ( !isPowerOf2(k) ) {
            cerr << "k dimension (" << k << ") isn't a power of 2" << endl;
            return false;
        }

        if ( n < MIN_N ) {
            cerr << "n dimension (" << n << ") is less than minimum supported size " << MIN_N << endl;
            return false;
        }
        if ( !isPowerOf2(n) ) {
            cerr << "n dimension (" << n << ") isn't a power of 2" << endl;
            return false;
        }
        if ( lda < MIN_M ) {
            cerr << "lda dimension (" << lda << ") is less than minimum supported size " << MIN_M << endl;
            return false;
        }
        if ( !isPowerOf2(lda) ) {
            cerr << "lda dimension (" << lda << ") isn't a power of 2" << endl;
            return false;
        }
        if ( ldb < MIN_N ) {
            cerr << "ldb dimension (" << ldb << ") is less than minimum supported size " << MIN_N << endl;
            return false;
        }
        if ( !isPowerOf2(ldb) ) {
            cerr << "ldb dimension (" << ldb << ") isn't a power of 2" << endl;
            return false;
        }
        if ( ldc < MIN_N ) {
            cerr << "ldc dimension (" << ldc << ") is less than minimum supported size " << MIN_N << endl;
            return false;
        }
        if ( !isPowerOf2(ldc) ) {
            cerr << "ldc dimension (" << ldc << ") isn't a power of 2" << endl;
            return false;
        }

        //if ( ldx != ldc )
        unsigned long long A_off = 0, B_off = 0, C_off = 0, X_off = 0;

        xclGetMemObjDeviceAddress(this->_devHandle[A].get(),
                boost::compute::system::default_device().get(),
                sizeof(unsigned long long), &A_off);
        xclGetMemObjDeviceAddress(this->_devHandle[B].get(),
                boost::compute::system::default_device().get(),
                sizeof(unsigned long long), &B_off);
        xclGetMemObjDeviceAddress(this->_devHandle[C].get(),
                boost::compute::system::default_device().get(),
                sizeof(unsigned long long), &C_off);

        if ( this->_devHandle.find(bias) != this->_devHandle.end()) {
            xclGetMemObjDeviceAddress(this->_devHandle[bias].get(),
                    boost::compute::system::default_device().get(),
                    sizeof(unsigned long long), &X_off);
            assert(X_off > this->_ddrDeviceBaseAddr);
            X_off -= this->_ddrDeviceBaseAddr;
        }

        //cout << "A_dev_addr: " << A_off << " B_dev_addr: " << B_off << " C_dev_addr: " << C_off << endl;
        assert(A_off > this->_ddrDeviceBaseAddr);
        assert(B_off > this->_ddrDeviceBaseAddr);
        assert(C_off > this->_ddrDeviceBaseAddr);
        A_off -= this->_ddrDeviceBaseAddr;
        B_off -= this->_ddrDeviceBaseAddr;
        C_off -= this->_ddrDeviceBaseAddr;

        assert(A_off % this->PAGE_SIZE == 0);
        assert(B_off % this->PAGE_SIZE == 0);
        assert(C_off % this->PAGE_SIZE == 0);
        assert(X_off % this->PAGE_SIZE == 0);

        A_off /= this->PAGE_SIZE;
        B_off /= this->PAGE_SIZE;
        C_off /= this->PAGE_SIZE;
        X_off /= this->PAGE_SIZE;

        FcnArgs args(A_off, B_off, C_off, X_off, m,
                k, n, lda, ldb, ldc, ldx, postScale, postShift, PReLUScale, PReLUAlpha);
        this->AddInstr ( &args);
#ifdef GEMX_PERF_DBG
        cout << "AddFCNOp: " << t.elapsed() << endl;
#endif
        return true;
    }

protected:
    const int MIN_M = 256;
    const int MIN_K = 256;
    const int MIN_N = 32;

    bool isPowerOf2( int n )
    {
        return ( (n & (n-1)) == 0 );
    }

};
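
// Illustrative only: AddFCNOp() is the FCN (fully connected + PReLU) counterpart of
// AddGEMMOp(); the extra PReLUScale/PReLUAlpha pair is packed by FcnArgs as described
// above. A hypothetical call queuing a 512x512x512 op with bias and identity PReLU,
// assuming the handles were already registered with SendToFPGA():
//
//   l_fcnHost.AddFCNOp(A, B, C, bias, 512, 512, 512,
//                      /*postScale*/ 1, /*postShift*/ 0,
//                      /*PReLUScale*/ 1, /*PReLUAlpha*/ 0);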

} // namespace gemx

extern "C" {
gemx::FCNHost<short*> * MakeFCNHost(char *xclbin, char * kernName);
void DestroyFCNHost(gemx::FCNHost<short*> * ptr);

void SendToFPGAShrt(gemx::FCNHost<short*> * gh, short *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAInt(gemx::FCNHost<short*> * gh, int *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAShrt_dbg(gemx::FCNHost<short*> * gh, char * name, short *A, int m, int n, bool sync_send);
void SendToFPGAInt_dbg(gemx::FCNHost<short*> * gh, char * name, int *A, int m, int n, bool sync_send);

void SendToFPGA(gemx::FCNHost<short*> * gh, void * A, unsigned long long buf_sz, bool sync_send);

void* GetFromFPGA(gemx::FCNHost<short*> * gh, short *A, bool sync_get);

void Wait ( gemx::FCNHost<short*> * gh );
bool AddFCNOp(gemx::FCNHost<short*> * gh, void * A, void * B, void *C, void * bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift, short PReLUScale, short PReLUAlpha );
void Execute (gemx::FCNHost<short*> * gh);

gemx::GEMMHost<short*> * MakeGEMMHost(char *xclbin, char * kernName);
void DestroyGEMMHost(gemx::GEMMHost<short*> * ptr);

void SendToFPGAShrt_GEMM(gemx::GEMMHost<short*> * gh, short *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAInt_GEMM(gemx::GEMMHost<short*> * gh, int *A, unsigned long long num_elem, bool sync_send);
void SendToFPGAShrt_dbg_GEMM(gemx::GEMMHost<short*> * gh, char * name, short *A, int m, int n, bool sync_send);
void SendToFPGAInt_dbg_GEMM(gemx::GEMMHost<short*> * gh, char * name, int *A, int m, int n, bool sync_send);

void SendToFPGA_GEMM(gemx::GEMMHost<short*> * gh, void * A, unsigned long long buf_sz, bool sync_send);

void* GetFromFPGA_GEMM(gemx::GEMMHost<short*> * gh, short *A, bool sync_get);

void Wait_GEMM ( gemx::GEMMHost<short*> * gh );
//void AddGEMMOp(gemx::FCNHost<short*> * gh, void * A, void * B, void *C, unsigned int m, unsigned int k, unsigned int n);

bool AddGEMMOp(gemx::GEMMHost<short*> * gh, void * A, void * B, void *C, void * bias, unsigned int m, unsigned int k, unsigned int n, int postScale, int postShift);

void Execute_GEMM (gemx::GEMMHost<short*> * gh);
}

#endif /* SRC_GEMX_HOST_H_ */