hqjenny-centrifuge/workloads/bm/vadd_tl/accel.c

117 lines
4.2 KiB
C

#include "accel.h"
int vadd(int * a, int * b, int* c, int length) {
// For pointer type
#pragma HLS INTERFACE m_axi port=a offset=slave bundle=gmem0 num_write_outstanding=16 num_read_outstanding=16 max_write_burst_length=16 max_read_burst_length= 16 depth=16 latency=125
#pragma HLS INTERFACE m_axi port=b offset=slave bundle=gmem0 num_write_outstanding=16 num_read_outstanding=16 max_write_burst_length=16 max_read_burst_length= 16 depth=16 latency=125
// Slave is for AXI4Lite, with burst mode disabled
#pragma HLS INTERFACE m_axi port=c offset=slave bundle=gmem0 num_write_outstanding=16 num_read_outstanding=16 max_write_burst_length=16 max_read_burst_length= 16 depth=16 latency=125
#pragma HLS INTERFACE s_axilite port=a bundle=control
#pragma HLS INTERFACE s_axilite port=b bundle=control
#pragma HLS INTERFACE s_axilite port=c bundle=control
#pragma HLS INTERFACE s_axilite port=length bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
//#pragma HLS DATAFLOW
int upper = (length >> 3) << 3;
int i = 0;
for (i = 0; i < upper; i += 8) {
// To prevent burst mode
c[i+0] = a[i+0] +b[i+0];
c[i+1] = a[i+1] +b[i+1];
c[i+2] = a[i+2] +b[i+2];
c[i+3] = a[i+3] +b[i+3];
c[i+4] = a[i+4] +b[i+4];
c[i+5] = a[i+5] +b[i+5];
c[i+6] = a[i+6] +b[i+6];
c[i+7] = a[i+7] +b[i+7];
}
for (i = upper; i < length; i++) {
c[i] = a[i] +b[i];
}
return 0;
}
//Including to use ap_uint<> datatype
//#include <ap_int.h>
/*
#define BUFFER_SIZE 128
#define DATAWIDTH 512
#define VECTOR_SIZE (DATAWIDTH / 32) // vector size is 16 (512/32 = 16)
//typedef ap_uint<DATAWIDTH> uint512_dt;
Vector Addition Kernel Implementation using uint512_dt datatype
Arguments:
in1 (input) --> Input Vector1
in2 (input) --> Input Vector2
out (output) --> Output Vector
size (input) --> Size of Vector in Integer
*/
/*extern "C" {
void vadd(
const uint512_dt *in1, // Read-Only Vector 1
const uint512_dt *in2, // Read-Only Vector 2
uint512_dt *out, // Output Result
int size // Size in integer
)
{
#pragma HLS INTERFACE m_axi port=in1 offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi port=in2 offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi port=out offset=slave bundle=gmem
#pragma HLS INTERFACE s_axilite port=in1 bundle=control
#pragma HLS INTERFACE s_axilite port=in2 bundle=control
#pragma HLS INTERFACE s_axilite port=out bundle=control
#pragma HLS INTERFACE s_axilite port=size bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control
uint512_dt v1_local[BUFFER_SIZE]; // Local memory to store vector1
uint512_dt result_local[BUFFER_SIZE];// Local Memory to store result
// Input vector size for interger vectors. However kernel is directly
// accessing 512bit data (total 16 elements). So total number of read
// from global memory is calculated here:
int size_in16 = (size-1) / VECTOR_SIZE + 1;
//Per iteration of this loop perform BUFFER_SIZE vector addition
for(int i = 0; i < size_in16; i += BUFFER_SIZE)
{
#pragma HLS LOOP_TRIPCOUNT min=8 max=8
int chunk_size = BUFFER_SIZE;
//boundary checks
if ((i + BUFFER_SIZE) > size_in16)
chunk_size = size_in16 - i;
//burst read first vector from global memory to local memory
v1_rd: for (int j = 0 ; j < chunk_size; j++){
#pragma HLS pipeline
#pragma HLS LOOP_TRIPCOUNT min=128 max=128
v1_local[j] = in1 [i + j];
}
//burst read second vector and perform vector addition
v2_rd_add: for (int j = 0 ; j < chunk_size; j++){
#pragma HLS pipeline
#pragma HLS LOOP_TRIPCOUNT min=128 max=128
uint512_dt tmpV1 = v1_local[j];
uint512_dt tmpV2 = in2[i+j];
result_local[j] = tmpV1 + tmpV2; // Vector Addition Operation
}
//burst write the result
out_write: for (int j = 0 ; j < chunk_size; j++){
#pragma HLS pipeline
#pragma HLS LOOP_TRIPCOUNT min=128 max=128
out[i+j] = result_local[j];
}
}
}
}*/