// hqjenny-centrifuge/workloads/bm/vadd_tl/accel.c

#include "accel.h"

/*
 * vadd - element-wise vector addition kernel: c[i] = a[i] + b[i].
 *
 * a, b:   input vectors; elements 0 .. length-1 are read.
 * c:      output vector; elements 0 .. length-1 are written.
 * length: number of elements to process. When length <= 0 neither loop
 *         executes, so nothing is read or written.
 *
 * Always returns 0.
 */
int vadd(int * a, int * b, int* c, int length) {

// Pointer arguments: a, b and c are each mapped to an AXI4 master (m_axi)
// port, all sharing the bundle "gmem0".
// offset=slave: the base address of each pointer is supplied through the
// AXI4-Lite (s_axilite) register interface declared further below.
#pragma HLS INTERFACE m_axi port=a offset=slave bundle=gmem0 num_write_outstanding=16 num_read_outstanding=16 max_write_burst_length=16 max_read_burst_length=    16 depth=16 latency=125

#pragma HLS INTERFACE m_axi port=b offset=slave bundle=gmem0 num_write_outstanding=16 num_read_outstanding=16 max_write_burst_length=16 max_read_burst_length=    16 depth=16 latency=125

#pragma HLS INTERFACE m_axi port=c offset=slave bundle=gmem0  num_write_outstanding=16 num_read_outstanding=16 max_write_burst_length=16 max_read_burst_length=    16 depth=16 latency=125


// AXI4-Lite register interface (bundle "control") carrying the pointer
// offsets, the scalar argument and the return value.
#pragma HLS INTERFACE s_axilite port=a bundle=control
#pragma HLS INTERFACE s_axilite port=b bundle=control
#pragma HLS INTERFACE s_axilite port=c bundle=control
#pragma HLS INTERFACE s_axilite port=length bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control

//#pragma HLS DATAFLOW
    // Largest multiple of 8 that does not exceed length. Masking off the low
    // three bits (instead of shifting right then left) avoids left-shifting a
    // negative value, which is undefined behavior in C when length < 0.
    int upper = length & ~7;
    int i;

    // Main loop, manually unrolled by 8. Per the original author's note the
    // accesses are written out individually to prevent burst mode, so the
    // unrolled form is kept as-is.
    for (i = 0; i < upper; i += 8) {
        c[i+0] = a[i+0] + b[i+0];
        c[i+1] = a[i+1] + b[i+1];
        c[i+2] = a[i+2] + b[i+2];
        c[i+3] = a[i+3] + b[i+3];

        c[i+4] = a[i+4] + b[i+4];
        c[i+5] = a[i+5] + b[i+5];
        c[i+6] = a[i+6] + b[i+6];
        c[i+7] = a[i+7] + b[i+7];
    }

    // Tail: the remaining (length % 8) elements, one at a time.
    for (i = upper; i < length; i++) {
        c[i] = a[i] + b[i];
    }
    return 0;
}

//Including to use ap_uint<> datatype
//#include <ap_int.h>
/*
#define BUFFER_SIZE 128
#define DATAWIDTH   512
#define VECTOR_SIZE (DATAWIDTH / 32) // vector size is 16 (512/32 = 16)
//typedef ap_uint<DATAWIDTH> uint512_dt;

    Vector Addition Kernel Implementation using uint512_dt datatype
    Arguments:
        in1   (input)     --> Input Vector1
        in2   (input)     --> Input Vector2
        out   (output)    --> Output Vector
        size  (input)     --> Size of Vector in Integer
   */
/*extern "C" {
void vadd(
        const uint512_dt *in1, // Read-Only Vector 1
        const uint512_dt *in2, // Read-Only Vector 2
        uint512_dt *out,       // Output Result
        int size               // Size in integer
        )
{
#pragma HLS INTERFACE m_axi port=in1  offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi port=in2  offset=slave bundle=gmem
#pragma HLS INTERFACE m_axi port=out offset=slave bundle=gmem
#pragma HLS INTERFACE s_axilite port=in1  bundle=control
#pragma HLS INTERFACE s_axilite port=in2  bundle=control
#pragma HLS INTERFACE s_axilite port=out bundle=control
#pragma HLS INTERFACE s_axilite port=size bundle=control
#pragma HLS INTERFACE s_axilite port=return bundle=control

    uint512_dt v1_local[BUFFER_SIZE];    // Local memory to store vector1
    uint512_dt result_local[BUFFER_SIZE];// Local Memory to store result

    // The input size is given in integers, but the kernel accesses 512-bit
    // words (16 integers each). The total number of 512-bit reads from
    // global memory is therefore calculated here:
    int size_in16 = (size-1) / VECTOR_SIZE + 1;

    //Per iteration of this loop perform BUFFER_SIZE vector addition
    for(int i = 0; i < size_in16;  i += BUFFER_SIZE)
    {
        #pragma HLS LOOP_TRIPCOUNT min=8 max=8
        int chunk_size = BUFFER_SIZE;

        //boundary checks
        if ((i + BUFFER_SIZE) > size_in16)
            chunk_size = size_in16 - i;

        //burst read first vector from global memory to local memory
        v1_rd: for (int j = 0 ; j <  chunk_size; j++){
        #pragma HLS pipeline
        #pragma HLS LOOP_TRIPCOUNT min=128 max=128
            v1_local[j] = in1 [i + j];
        }

        //burst read second vector and perform vector addition
        v2_rd_add: for (int j = 0 ; j < chunk_size; j++){
        #pragma HLS pipeline
        #pragma HLS LOOP_TRIPCOUNT min=128 max=128
            uint512_dt tmpV1     = v1_local[j];
            uint512_dt tmpV2     = in2[i+j];
            result_local[j] = tmpV1 + tmpV2; // Vector Addition Operation
        }

        //burst write the result
        out_write: for (int j = 0 ; j < chunk_size; j++){
        #pragma HLS pipeline
        #pragma HLS LOOP_TRIPCOUNT min=128 max=128
            out[i+j] = result_local[j];
       }
    }
}
}*/