maprobe: support matrix print
This commit is contained in:
parent
ac66935e15
commit
b27f2968b4
|
@ -1,3 +1,3 @@
|
|||
NAME = maprobe
|
||||
SRCS = common.c bitutils.c latency-test.c main.c
|
||||
SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c main.c
|
||||
include $(AM_HOME)/Makefile.app
|
||||
|
|
|
@ -0,0 +1,93 @@
|
|||
#include "maprobe.h"
|
||||
|
||||
void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
|
||||
{
|
||||
// printf("stride %d linear access latency test\n", step);
|
||||
// printf("range (B), read latency, iters, samples, cycles\n");
|
||||
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
|
||||
|
||||
// _perf_print_timer();
|
||||
_perf_start_timer();
|
||||
for (int i = 0; i < iter; i++) {
|
||||
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
|
||||
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
}
|
||||
}
|
||||
_perf_end_timer();
|
||||
// _perf_print_timer();
|
||||
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
|
||||
if (to_csv) {
|
||||
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
|
||||
} else {
|
||||
printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
|
||||
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
|
||||
);
|
||||
}
|
||||
_perf_g_total_samples += total_access;
|
||||
}
|
||||
|
||||
void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
|
||||
{
|
||||
// printf("stride %d linear access latency test\n", step);
|
||||
// printf("range (B), read latency, iters, samples, cycles\n");
|
||||
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
|
||||
|
||||
// _perf_print_timer();
|
||||
_perf_start_timer();
|
||||
for (int i = 0; i < iter; i++) {
|
||||
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
|
||||
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
}
|
||||
}
|
||||
_perf_end_timer();
|
||||
// _perf_print_timer();
|
||||
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
|
||||
if (to_csv) {
|
||||
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
|
||||
} else {
|
||||
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
|
||||
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
|
||||
);
|
||||
}
|
||||
_perf_g_total_samples += total_access;
|
||||
}
|
||||
|
||||
void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
|
||||
{
|
||||
// printf("stride %d linear access latency test\n", step);
|
||||
// printf("range (B), read latency, iters, samples, cycles\n");
|
||||
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
|
||||
|
||||
// _perf_print_timer();
|
||||
_perf_start_timer();
|
||||
for (int i = 0; i < iter; i++) {
|
||||
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
|
||||
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
}
|
||||
}
|
||||
_perf_end_timer();
|
||||
// _perf_print_timer();
|
||||
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
|
||||
if (to_csv) {
|
||||
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
|
||||
} else {
|
||||
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
|
||||
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, 8
|
||||
);
|
||||
}
|
||||
_perf_g_total_samples += total_access;
|
||||
}
|
|
@ -6,6 +6,7 @@
|
|||
#include <klib.h>
|
||||
#include <csr.h>
|
||||
#include "bitutils.h"
|
||||
#include "resultmat.h"
|
||||
|
||||
// config
|
||||
// #define PERF_SIM // probe runs in simulator, disable perf counters
|
||||
|
@ -72,6 +73,8 @@ extern void test_linear_access_latency(uint64_t size, uint64_t step, int iter, i
|
|||
extern void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
|
||||
extern void test_same_address_load_latency(int iter, int to_csv);
|
||||
extern void test_read_after_write_latency(int iter, int to_csv);
|
||||
|
||||
// bandwidth test
|
||||
extern void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
|
||||
extern void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
|
||||
extern void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
#ifndef PROBE_RESULT_MATRIX_H
|
||||
#define PROBE_RESULT_MATRIX_H
|
||||
|
||||
#include <klib.h>
|
||||
|
||||
// Descriptor of a 2-D result table, consumed by print_float_result_matrix().
struct result_matrix_meta {
    char *name;          // table title, printed in the start banner
    char *row_name;      // label describing the row axis
    char *column_name;   // label describing the column axis
    int row_size;        // number of rows
    int column_size;     // number of columns
    void *result_array;  // cells; the float printer reads it as row-major float data
    void *column_array;  // column header values, read as int; NULL omits the header line
    void *row_array;     // row header values, read as int; NULL omits the row labels
};
|
||||
|
||||
void print_float_result_matrix(struct result_matrix_meta* meta);
|
||||
void matrix_print_example();
|
||||
|
||||
// FOR(v, end): loop header that declares int v and runs it over 0 .. end-1.
// `end` is parenthesized so that arguments containing operators with lower
// precedence than `<` (e.g. `n & 3`, `a ? b : c`) are compared as a whole.
// Note that `end` is re-evaluated on every iteration.
#define FOR(v,end) for (int v = 0; v < (end); v++)
// CONCAT(a, b): paste the two tokens together (no expansion of a or b here).
#define CONCAT(a,b) a##b
// TOSTR(a): turn the token sequence a into a string literal.
#define TOSTR(a) #a
|
||||
// DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize)
//
// Declares, in the current scope, four objects whose names are derived from
// matrix_name:
//   <matrix_name>_result_array  float[rowsize][columnsize], zero-initialized
//   <matrix_name>_row_array     int[rowsize], zero-initialized
//   <matrix_name>_column_array  int[columnsize], zero-initialized
//   <matrix_name>_matrix_meta   struct result_matrix_meta pointing at the three
//                               arrays, with name / row_name / column_name set
//                               to the stringified macro arguments
// The expansion consists of declarations, so it must be used where
// declarations are allowed and cannot be wrapped in do { } while (0).
#define DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize) \
    float CONCAT(matrix_name,_result_array)[rowsize][columnsize] = {0}; \
    int CONCAT(matrix_name,_row_array)[rowsize] = {0}; \
    int CONCAT(matrix_name,_column_array)[columnsize] = {0}; \
    struct result_matrix_meta CONCAT(matrix_name,_matrix_meta) = { \
        .name = TOSTR(matrix_name), \
        .row_name = TOSTR(rowname), \
        .column_name = TOSTR(columnname), \
        .row_size = rowsize, \
        .column_size = columnsize, \
        .result_array = CONCAT(matrix_name,_result_array), \
        .column_array = CONCAT(matrix_name,_column_array), \
        .row_array = CONCAT(matrix_name,_row_array), \
    };
|
||||
|
||||
#endif
|
|
@ -1,9 +1,5 @@
|
|||
#include "maprobe.h"
|
||||
|
||||
// inline uint64_t get_next_linear_address(uint64_t current_addr, uint64_t step) {
|
||||
// return current_addr + step;
|
||||
// }
|
||||
|
||||
// Pick a pseudo-random address from [base_addr, end_addr) and round it down
// to a multiple of align.
//
// base_addr - lower bound of the range (inclusive)
// end_addr  - upper bound of the range (exclusive); must differ from
//             base_addr, otherwise the modulo divides by zero
// align     - alignment in bytes; must be non-zero
//
// Returns the rounded-down address. Because rounding happens after the offset
// is added, the result can fall below base_addr when base_addr itself is not a
// multiple of align.
//
// Defined without `inline`: a bare `inline` definition provides no external
// definition in C99/C11, so any call the compiler chooses not to inline would
// fail to link. The optimizer can still inline this within the file.
uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) {
    uint64_t span = end_addr - base_addr;
    uint64_t candidate = rand() % span + base_addr;
    return candidate / align * align;
}
|
||||
|
@ -224,98 +220,6 @@ void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_c
|
|||
test_linear_access_latency_batch8(size, step, iter, to_csv);
|
||||
}
|
||||
|
||||
void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
|
||||
{
|
||||
// printf("stride %d linear access latency test\n", step);
|
||||
// printf("range (B), read latency, iters, samples, cycles\n");
|
||||
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
|
||||
|
||||
// _perf_print_timer();
|
||||
_perf_start_timer();
|
||||
for (int i = 0; i < iter; i++) {
|
||||
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
|
||||
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
}
|
||||
}
|
||||
_perf_end_timer();
|
||||
// _perf_print_timer();
|
||||
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
|
||||
if (to_csv) {
|
||||
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
|
||||
} else {
|
||||
printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
|
||||
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
|
||||
);
|
||||
}
|
||||
_perf_g_total_samples += total_access;
|
||||
}
|
||||
|
||||
void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
|
||||
{
|
||||
// printf("stride %d linear access latency test\n", step);
|
||||
// printf("range (B), read latency, iters, samples, cycles\n");
|
||||
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
|
||||
|
||||
// _perf_print_timer();
|
||||
_perf_start_timer();
|
||||
for (int i = 0; i < iter; i++) {
|
||||
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
|
||||
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
__asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
}
|
||||
}
|
||||
_perf_end_timer();
|
||||
// _perf_print_timer();
|
||||
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
|
||||
if (to_csv) {
|
||||
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
|
||||
} else {
|
||||
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
|
||||
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
|
||||
);
|
||||
}
|
||||
_perf_g_total_samples += total_access;
|
||||
}
|
||||
|
||||
void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
|
||||
{
|
||||
// printf("stride %d linear access latency test\n", step);
|
||||
// printf("range (B), read latency, iters, samples, cycles\n");
|
||||
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
|
||||
|
||||
// _perf_print_timer();
|
||||
_perf_start_timer();
|
||||
for (int i = 0; i < iter; i++) {
|
||||
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
|
||||
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
|
||||
}
|
||||
}
|
||||
_perf_end_timer();
|
||||
// _perf_print_timer();
|
||||
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
|
||||
if (to_csv) {
|
||||
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
|
||||
} else {
|
||||
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
|
||||
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, 8
|
||||
);
|
||||
}
|
||||
_perf_g_total_samples += total_access;
|
||||
}
|
||||
|
||||
void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv)
|
||||
{
|
||||
// printf("align %d random access (cache line) latency test, %s\n",
|
||||
|
|
|
@ -199,6 +199,7 @@ void legacy_latency_throughput_test()
|
|||
|
||||
int main()
|
||||
{
|
||||
matrix_print_example();
|
||||
latency_test_example();
|
||||
typical_latency_test();
|
||||
// pointer_tracing_graph();
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
#include "resultmat.h"
|
||||
|
||||
void print_float_result_matrix(struct result_matrix_meta* meta)
|
||||
{
|
||||
assert(meta);
|
||||
printf("---------- %s matrix start ----------\n", meta->name);
|
||||
printf("%s (row) \\ %s (column)\n", meta->row_name, meta->column_name);
|
||||
if (meta->column_array) {
|
||||
if (meta->row_array) {
|
||||
printf("\\ , \t");
|
||||
}
|
||||
for (int c = 0; c < meta->column_size; c++) {
|
||||
printf(" %d,\t", *((int*)meta->column_array + c));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
for (int r = 0; r < meta->row_size; r++) {
|
||||
if (meta->row_array) {
|
||||
printf("%3d,\t", *((int*)meta->row_array + r));
|
||||
}
|
||||
for (int c = 0; c < meta->column_size; c++) {
|
||||
printf("%f,\t", *((float*)meta->result_array + r * meta->column_size + c));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("---------- %s matrix end ----------\n");
|
||||
}
|
||||
|
||||
void matrix_print_example()
|
||||
{
|
||||
DEFINE_FLOAT_RESULT_MATRIX(test,testrow,5,testcol,10);
|
||||
// ({
|
||||
// struct result_matrix_meta test_matrix_meta;
|
||||
// float test_result_array[5][10] = {0};
|
||||
// int test_column_array[10] = {0};
|
||||
// int testrow_array[5] = {0};
|
||||
// test_matrix_meta.name = "test";
|
||||
// test_matrix_meta.column_name = "testcol";
|
||||
// test_matrix_meta.row_name = "testrow";
|
||||
// test_matrix_meta.column_size = 10;
|
||||
// test_matrix_meta.row_size = 5;
|
||||
// test_matrix_meta.result_array = test_result_array;
|
||||
// test_matrix_meta.column_array = test_column_array;
|
||||
// test_matrix_meta.row_array = test_row_array;
|
||||
// })
|
||||
|
||||
FOR(x,5) { test_row_array[x] = x; }
|
||||
FOR(x,10) { test_column_array[x] = x; }
|
||||
FOR(x,5) {
|
||||
FOR(y,10) {
|
||||
test_result_array[x][y] = rand();
|
||||
}
|
||||
}
|
||||
print_float_result_matrix(&test_matrix_meta);
|
||||
}
|
Loading…
Reference in New Issue