diff --git a/apps/maprobe/Makefile b/apps/maprobe/Makefile
index 2bcc3590..0328ed17 100644
--- a/apps/maprobe/Makefile
+++ b/apps/maprobe/Makefile
@@ -1,3 +1,3 @@
 NAME = maprobe
-SRCS = common.c bitutils.c latency-test.c main.c
+SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c main.c
 include $(AM_HOME)/Makefile.app
diff --git a/apps/maprobe/bandwidth-test.c b/apps/maprobe/bandwidth-test.c
new file mode 100644
index 00000000..de80096b
--- /dev/null
+++ b/apps/maprobe/bandwidth-test.c
@@ -0,0 +1,93 @@
+#include "maprobe.h"
+
+void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
+{
+    // printf("stride %d linear access latency test\n", step);
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
+
+    // _perf_print_timer();
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
+            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
+        }
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
+            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
+        );
+    }
+    _perf_g_total_samples += total_access;
+}
+
+void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
+{
+    // printf("stride %d linear access latency test\n", step);
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
+
+    // _perf_print_timer();
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
+            __asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
+        }
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
+            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
+        );
+    }
+    _perf_g_total_samples += total_access;
+}
+
+void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
+{
+    // printf("stride %d linear access latency test\n", step);
+    // printf("range (B), read latency, iters, samples, cycles\n");
+    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
+
+    // _perf_print_timer();
+    _perf_start_timer();
+    for (int i = 0; i < iter; i++) {
+        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
+            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
+        }
+    }
+    _perf_end_timer();
+    // _perf_print_timer();
+    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
+    if (to_csv) {
+        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
+    } else {
+        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
+            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, 8
+        );
+    }
+    _perf_g_total_samples += total_access;
+}
\ No newline at end of file
diff --git a/apps/maprobe/include/maprobe.h b/apps/maprobe/include/maprobe.h
index 8bffe0f0..d3918dbd 100644
--- a/apps/maprobe/include/maprobe.h
+++ b/apps/maprobe/include/maprobe.h
@@ -6,6 +6,7 @@
 #include
 #include
 #include "bitutils.h"
+#include "resultmat.h"
 
 // config
 // #define PERF_SIM // probe run in simulatior, diaable perf counters
@@ -72,6 +73,8 @@ extern void test_linear_access_latency(uint64_t size, uint64_t step, int iter, i
 extern void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
 extern void test_same_address_load_latency(int iter, int to_csv);
 extern void test_read_after_write_latency(int iter, int to_csv);
+
+// bandwidth test
 extern void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
 extern void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
 extern void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
diff --git a/apps/maprobe/include/resultmat.h b/apps/maprobe/include/resultmat.h
new file mode 100644
index 00000000..629aa49b
--- /dev/null
+++ b/apps/maprobe/include/resultmat.h
@@ -0,0 +1,42 @@
+#ifndef PROBE_RESULT_MATRIX_H
+#define PROBE_RESULT_MATRIX_H
+
+#include
+
+struct result_matrix_meta {
+    char* name;
+    char* row_name;
+    char* column_name;
+    int row_size;
+    int column_size;
+    void* result_array;
+    void* column_array;
+    void* row_array;
+};
+
+void print_float_result_matrix(struct result_matrix_meta* meta);
+void matrix_print_example();
+
+#define FOR(v,end) for (int v = 0; v < end; v++)
+#define CONCAT(a,b) a##b
+#define TOSTR(a) #a
+/*
+ * Declares <matrix_name>_matrix_meta plus its backing result/column/row
+ * arrays and points the meta fields at them. The expansion contains
+ * assignment statements, so it must be used inside a function body.
+ */
+#define DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize) \
+    struct result_matrix_meta CONCAT(matrix_name,_matrix_meta); \
+    float CONCAT(matrix_name,_result_array)[rowsize][columnsize] = {0}; \
+    int CONCAT(matrix_name,_column_array)[columnsize] = {0}; \
+    int CONCAT(matrix_name,_row_array)[rowsize] = {0}; \
+    CONCAT(matrix_name,_matrix_meta).name = TOSTR(matrix_name); \
+    CONCAT(matrix_name,_matrix_meta).column_name = TOSTR(columnname); \
+    CONCAT(matrix_name,_matrix_meta).row_name = TOSTR(rowname); \
+    CONCAT(matrix_name,_matrix_meta).column_size = columnsize; \
+    CONCAT(matrix_name,_matrix_meta).row_size = rowsize; \
+    CONCAT(matrix_name,_matrix_meta).result_array = CONCAT(matrix_name,_result_array); \
+    CONCAT(matrix_name,_matrix_meta).column_array = CONCAT(matrix_name,_column_array); \
+    CONCAT(matrix_name,_matrix_meta).row_array = CONCAT(matrix_name,_row_array);
+
+#endif
\ No newline at end of file
diff --git a/apps/maprobe/latency-test.c b/apps/maprobe/latency-test.c
index 814fd0a3..f79f01a4 100644
--- a/apps/maprobe/latency-test.c
+++ b/apps/maprobe/latency-test.c
@@ -1,9 +1,5 @@
 #include "maprobe.h"
 
-// inline uint64_t get_next_linear_address(uint64_t current_addr, uint64_t step) {
-//     return current_addr + step;
-// }
-
 inline uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) {
     return (rand() % (end_addr - base_addr) + base_addr) / align * align;
 }
@@ -224,98 +220,6 @@ void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_c
     test_linear_access_latency_batch8(size, step, iter, to_csv);
 }
 
-void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
-{
-    // printf("stride %d linear access latency test\n", step);
-    // printf("range (B), read latency, iters, samples, cycles\n");
-    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
-
-    // _perf_print_timer();
-    _perf_start_timer();
-    for (int i = 0; i < iter; i++) {
-        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
-            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
-        }
-    }
-    _perf_end_timer();
-    // _perf_print_timer();
-    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
-    if (to_csv) {
-        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
-    } else {
-        printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
-            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
-        );
-    }
-    _perf_g_total_samples += total_access;
-}
-
-void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
-{
-    // printf("stride %d linear access latency test\n", step);
-    // printf("range (B), read latency, iters, samples, cycles\n");
-    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
-
-    // _perf_print_timer();
-    _perf_start_timer();
-    for (int i = 0; i < iter; i++) {
-        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
-            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
-            __asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
-        }
-    }
-    _perf_end_timer();
-    // _perf_print_timer();
-    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
-    if (to_csv) {
-        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
-    } else {
-        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
-            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
-        );
-    }
-    _perf_g_total_samples += total_access;
-}
-
-void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
-{
-    // printf("stride %d linear access latency test\n", step);
-    // printf("range (B), read latency, iters, samples, cycles\n");
-    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
-
-    // _perf_print_timer();
-    _perf_start_timer();
-    for (int i = 0; i < iter; i++) {
-        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
-            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
-        }
-    }
-    _perf_end_timer();
-    // _perf_print_timer();
-    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
-    if (to_csv) {
-        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
-    } else {
-        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
-            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, 8
-        );
-    }
-    _perf_g_total_samples += total_access;
-}
-
 void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv)
 {
     // printf("align %d random access (cache line) latency test, %s\n",
diff --git a/apps/maprobe/main.c b/apps/maprobe/main.c
index 86444ec8..4fa95297 100644
--- a/apps/maprobe/main.c
+++ b/apps/maprobe/main.c
@@ -199,6 +199,7 @@ void legacy_latency_throughput_test()
 
 int main()
 {
+    matrix_print_example();
     latency_test_example();
     typical_latency_test();
     // pointer_tracing_graph();
diff --git a/apps/maprobe/resultmat.c b/apps/maprobe/resultmat.c
new file mode 100644
index 00000000..a264f182
--- /dev/null
+++ b/apps/maprobe/resultmat.c
@@ -0,0 +1,64 @@
+#include "resultmat.h"
+
+/*
+ * Print a float result matrix as comma/tab separated text between
+ * "matrix start" and "matrix end" banner lines.
+ *
+ * meta->result_array is read as row_size * column_size floats in row-major
+ * order; meta->column_array and meta->row_array, when non-NULL, are read as
+ * int labels and printed as a header row / leading column.
+ */
+void print_float_result_matrix(struct result_matrix_meta* meta)
+{
+    assert(meta);
+    printf("---------- %s matrix start ----------\n", meta->name);
+    printf("%s (row) \\ %s (column)\n", meta->row_name, meta->column_name);
+    if (meta->column_array) {
+        if (meta->row_array) {
+            printf("\\ , \t");
+        }
+        for (int c = 0; c < meta->column_size; c++) {
+            printf(" %d,\t", *((int*)meta->column_array + c));
+        }
+        printf("\n");
+    }
+    for (int r = 0; r < meta->row_size; r++) {
+        if (meta->row_array) {
+            printf("%3d,\t", *((int*)meta->row_array + r));
+        }
+        for (int c = 0; c < meta->column_size; c++) {
+            printf("%f,\t", *((float*)meta->result_array + r * meta->column_size + c));
+        }
+        printf("\n");
+    }
+    printf("---------- %s matrix end ----------\n", meta->name);
+}
+
+/* Example: label a 5x10 matrix, fill it with rand() values, and print it. */
+void matrix_print_example()
+{
+    DEFINE_FLOAT_RESULT_MATRIX(test,testrow,5,testcol,10);
+    // ({
+    //     struct result_matrix_meta test_matrix_meta;
+    //     float test_result_array[5][10] = {0};
+    //     int test_column_array[10] = {0};
+    //     int test_row_array[5] = {0};
+    //     test_matrix_meta.name = "test";
+    //     test_matrix_meta.column_name = "testcol";
+    //     test_matrix_meta.row_name = "testrow";
+    //     test_matrix_meta.column_size = 10;
+    //     test_matrix_meta.row_size = 5;
+    //     test_matrix_meta.result_array = test_result_array;
+    //     test_matrix_meta.column_array = test_column_array;
+    //     test_matrix_meta.row_array = test_row_array;
+    // })
+
+    FOR(x,5) { test_row_array[x] = x; }
+    FOR(x,10) { test_column_array[x] = x; }
+    FOR(x,5) {
+        FOR(y,10) {
+            test_result_array[x][y] = rand();
+        }
+    }
+    print_float_result_matrix(&test_matrix_meta);
+}
\ No newline at end of file