maprobe: support matrix print

This commit is contained in:
William Wang 2023-03-07 16:29:44 +08:00
parent ac66935e15
commit b27f2968b4
7 changed files with 190 additions and 97 deletions

View File

@ -1,3 +1,3 @@
NAME = maprobe
SRCS = common.c bitutils.c latency-test.c main.c
SRCS = common.c bitutils.c resultmat.c latency-test.c bandwidth-test.c main.c
include $(AM_HOME)/Makefile.app

View File

@ -0,0 +1,93 @@
#include "maprobe.h"
/*
 * Time a linear sweep of 8-byte loads and report cycles per load and
 * bytes per cycle.
 *
 * size   - number of bytes swept, starting at _PERF_TEST_ADDR_BASE; must be
 *          at least _PERF_CACHELINE_SIZE_BYTE (asserted).
 * iter   - how many times the whole range is swept inside the timed region.
 * to_csv - non-zero prints one CSV row, zero prints a human-readable line.
 *
 * Side effects: prints one line and adds the number of loads issued to
 * _perf_g_total_samples.
 */
void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
{
// CSV columns printed below:
// range (B), cycles per load, iters, samples, cycles
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
// Advance one cacheline-sized step at a time and issue eight 8-byte loads
// at offsets 0..56 from that address.
// NOTE(review): this touches every byte of the step only if
// _PERF_CACHELINE_SIZE_BYTE is 64 - confirm.
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
// The loaded value is discarded; a0 is only named as a clobber so the
// compiler keeps nothing live in it across the loads.
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
// Eight loads per cacheline-sized step, repeated iter times.
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
// Throughput is (loads * 8 bytes) / cycles; the trailing 8 is the byte
// distance between consecutive loads.
printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
}
/*
 * Time a linear sweep of 8-byte stores and report cycles per store and
 * bytes per cycle.
 *
 * size   - number of bytes swept, starting at _PERF_TEST_ADDR_BASE; must be
 *          at least _PERF_CACHELINE_SIZE_BYTE (asserted).
 * iter   - how many times the whole range is swept inside the timed region.
 * to_csv - non-zero prints one CSV row, zero prints a human-readable line.
 *
 * Side effects: overwrites the swept memory range, prints one line and adds
 * the number of stores issued to _perf_g_total_samples.
 */
void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
{
// CSV columns printed below:
// range (B), cycles per store, iters, samples, cycles
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
// Advance one cacheline-sized step at a time and issue eight 8-byte stores
// at offsets 0..56 from that address.
// NOTE(review): this covers every byte of the step only if
// _PERF_CACHELINE_SIZE_BYTE is 64 - confirm.
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
// The stored data is whatever a0 happens to hold; this code never sets it.
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
// Eight stores per cacheline-sized step, repeated iter times.
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
// Throughput is (stores * 8 bytes) / cycles; the trailing 8 is the byte
// distance between consecutive stores.
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
}
/*
 * Time a sweep that issues a single 8-byte store per cacheline-sized step
 * and report cycles per store plus two throughput figures.
 *
 * size   - number of bytes swept, starting at _PERF_TEST_ADDR_BASE; must be
 *          at least _PERF_CACHELINE_SIZE_BYTE (asserted).
 * iter   - how many times the whole range is swept inside the timed region.
 * to_csv - non-zero prints one CSV row (range (B), cycles per store, iters,
 *          samples, cycles), zero prints a human-readable line.
 *
 * The human-readable line reports:
 *   - throughput: bytes actually written (8 per store) per cycle;
 *   - "L1-L2": one full cacheline per store per cycle.
 *     NOTE(review): presumably models the line traffic each store causes
 *     below L1 ("wcb" looks like write-combining buffer) - confirm.
 *
 * Side effects: overwrites memory in the swept range, prints one line and
 * adds the number of stores issued to _perf_g_total_samples.
 */
void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
{
    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
            // The stored data is whatever a0 happens to hold; only the
            // store itself matters for the measurement.
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();

    // One store per cacheline-sized step, repeated iter times.
    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
    } else {
        // Report the stride the loop really uses (one cacheline-sized step
        // between consecutive stores), not the 8-byte access width.
        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, (int)_PERF_CACHELINE_SIZE_BYTE
        );
    }
    _perf_g_total_samples += total_access;
}

View File

@ -6,6 +6,7 @@
#include <klib.h>
#include <csr.h>
#include "bitutils.h"
#include "resultmat.h"
// config
// #define PERF_SIM // probe runs in simulator, disable perf counters
@ -72,6 +73,8 @@ extern void test_linear_access_latency(uint64_t size, uint64_t step, int iter, i
extern void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
extern void test_same_address_load_latency(int iter, int to_csv);
extern void test_read_after_write_latency(int iter, int to_csv);
// bandwidth test
extern void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
extern void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
extern void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);

View File

@ -0,0 +1,37 @@
#ifndef PROBE_RESULT_MATRIX_H
#define PROBE_RESULT_MATRIX_H
#include <klib.h>
/*
 * Descriptor for a labelled 2-D table of float results, consumed by
 * print_float_result_matrix(). It only points at the data; it does not
 * own any of the arrays.
 */
struct result_matrix_meta {
    char *name;          /* title printed in the start banner */
    char *row_name;      /* label describing what the rows mean */
    char *column_name;   /* label describing what the columns mean */
    int row_size;        /* number of rows */
    int column_size;     /* number of columns */
    void *result_array;  /* row_size x column_size floats, row-major */
    void *column_array;  /* column_size int labels, or NULL for none */
    void *row_array;     /* row_size int labels, or NULL for none */
};
// Print the table described by `meta` (must not be NULL): a start banner,
// the row/column names, an optional line of int column labels, then one line
// per row with an optional int row label followed by the float values.
void print_float_result_matrix(struct result_matrix_meta* meta);
// Demo: build a small matrix with DEFINE_FLOAT_RESULT_MATRIX, fill it with
// rand() values and print it.
void matrix_print_example();
/*
 * FOR(v, end): loop with a fresh int `v` running from 0 up to (but not
 * including) `end`. `end` is parenthesized so that expressions with
 * low-precedence operators (e.g. `n & mask`, `a ? b : c`) are compared as a
 * whole; it is re-evaluated on every iteration.
 */
#define FOR(v,end) for (int v = 0; v < (end); v++)
/* CONCAT(a, b): paste two tokens into one identifier (no extra expansion pass). */
#define CONCAT(a,b) a##b
/* TOSTR(a): turn the argument's spelling into a string literal. */
#define TOSTR(a) #a
/*
 * DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize)
 *
 * Declares, in the current scope, everything needed for one result matrix:
 *   <matrix_name>_result_array  float[rowsize][columnsize], zeroed
 *   <matrix_name>_column_array  int[columnsize], zeroed
 *   <matrix_name>_row_array     int[rowsize], zeroed
 *   <matrix_name>_matrix_meta   struct result_matrix_meta pointing at the
 *                               three arrays, with name/row_name/column_name
 *                               set to the spelling of the macro arguments.
 *
 * The sizes must be integer constant expressions because the arrays carry
 * an initializer. The expansion ends with its own ';'.
 */
#define DEFINE_FLOAT_RESULT_MATRIX(matrix_name, rowname, rowsize, columnname, columnsize) \
    float CONCAT(matrix_name,_result_array)[rowsize][columnsize] = {0}; \
    int CONCAT(matrix_name,_column_array)[columnsize] = {0}; \
    int CONCAT(matrix_name,_row_array)[rowsize] = {0}; \
    struct result_matrix_meta CONCAT(matrix_name,_matrix_meta) = { \
        .name = TOSTR(matrix_name), \
        .row_name = TOSTR(rowname), \
        .column_name = TOSTR(columnname), \
        .row_size = rowsize, \
        .column_size = columnsize, \
        .result_array = CONCAT(matrix_name,_result_array), \
        .column_array = CONCAT(matrix_name,_column_array), \
        .row_array = CONCAT(matrix_name,_row_array) \
    };
#endif

View File

@ -1,9 +1,5 @@
#include "maprobe.h"
// inline uint64_t get_next_linear_address(uint64_t current_addr, uint64_t step) {
// return current_addr + step;
// }
/*
 * Pick a pseudo-random address in [base_addr, end_addr) using rand() and
 * round it down to a multiple of `align`.
 *
 * Requirements (not checked here): end_addr > base_addr and align != 0,
 * otherwise the modulo / division below divides by zero.
 * Because of the round-down, the result can be below base_addr when
 * base_addr itself is not a multiple of align.
 *
 * Deliberately not declared plain `inline`: in C99/C11 an `inline`
 * definition without `static` or `extern` does not provide an external
 * definition, so a call the compiler chooses not to inline fails to link.
 */
uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) {
    uint64_t span = end_addr - base_addr;
    uint64_t candidate = rand() % span + base_addr;
    return candidate / align * align;
}
@ -224,98 +220,6 @@ void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_c
test_linear_access_latency_batch8(size, step, iter, to_csv);
}
/*
 * Time a linear sweep of 8-byte loads and report cycles per load and
 * bytes per cycle.
 *
 * size   - number of bytes swept, starting at _PERF_TEST_ADDR_BASE; must be
 *          at least _PERF_CACHELINE_SIZE_BYTE (asserted).
 * iter   - how many times the whole range is swept inside the timed region.
 * to_csv - non-zero prints one CSV row, zero prints a human-readable line.
 *
 * Side effects: prints one line and adds the number of loads issued to
 * _perf_g_total_samples.
 */
void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
{
// CSV columns printed below:
// range (B), cycles per load, iters, samples, cycles
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
// Advance one cacheline-sized step at a time and issue eight 8-byte loads
// at offsets 0..56 from that address.
// NOTE(review): this touches every byte of the step only if
// _PERF_CACHELINE_SIZE_BYTE is 64 - confirm.
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
// The loaded value is discarded; a0 is only named as a clobber so the
// compiler keeps nothing live in it across the loads.
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
// Eight loads per cacheline-sized step, repeated iter times.
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
// Throughput is (loads * 8 bytes) / cycles; the trailing 8 is the byte
// distance between consecutive loads.
printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
}
/*
 * Time a linear sweep of 8-byte stores and report cycles per store and
 * bytes per cycle.
 *
 * size   - number of bytes swept, starting at _PERF_TEST_ADDR_BASE; must be
 *          at least _PERF_CACHELINE_SIZE_BYTE (asserted).
 * iter   - how many times the whole range is swept inside the timed region.
 * to_csv - non-zero prints one CSV row, zero prints a human-readable line.
 *
 * Side effects: overwrites the swept memory range, prints one line and adds
 * the number of stores issued to _perf_g_total_samples.
 */
void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
{
// CSV columns printed below:
// range (B), cycles per store, iters, samples, cycles
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
// Advance one cacheline-sized step at a time and issue eight 8-byte stores
// at offsets 0..56 from that address.
// NOTE(review): this covers every byte of the step only if
// _PERF_CACHELINE_SIZE_BYTE is 64 - confirm.
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
// The stored data is whatever a0 happens to hold; this code never sets it.
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
// Eight stores per cacheline-sized step, repeated iter times.
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
// Throughput is (stores * 8 bytes) / cycles; the trailing 8 is the byte
// distance between consecutive stores.
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
}
/*
 * Time a sweep that issues a single 8-byte store per cacheline-sized step
 * and report cycles per store plus two throughput figures.
 *
 * size   - number of bytes swept, starting at _PERF_TEST_ADDR_BASE; must be
 *          at least _PERF_CACHELINE_SIZE_BYTE (asserted).
 * iter   - how many times the whole range is swept inside the timed region.
 * to_csv - non-zero prints one CSV row (range (B), cycles per store, iters,
 *          samples, cycles), zero prints a human-readable line.
 *
 * The human-readable line reports:
 *   - throughput: bytes actually written (8 per store) per cycle;
 *   - "L1-L2": one full cacheline per store per cycle.
 *     NOTE(review): presumably models the line traffic each store causes
 *     below L1 ("wcb" looks like write-combining buffer) - confirm.
 *
 * Side effects: overwrites memory in the swept range, prints one line and
 * adds the number of stores issued to _perf_g_total_samples.
 */
void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
{
    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
            // The stored data is whatever a0 happens to hold; only the
            // store itself matters for the measurement.
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();

    // One store per cacheline-sized step, repeated iter times.
    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
    } else {
        // Report the stride the loop really uses (one cacheline-sized step
        // between consecutive stores), not the 8-byte access width.
        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
            size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, (int)_PERF_CACHELINE_SIZE_BYTE
        );
    }
    _perf_g_total_samples += total_access;
}
void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv)
{
// printf("align %d random access (cache line) latency test, %s\n",

View File

@ -199,6 +199,7 @@ void legacy_latency_throughput_test()
int main()
{
matrix_print_example();
latency_test_example();
typical_latency_test();
// pointer_tracing_graph();

55
apps/maprobe/resultmat.c Normal file
View File

@ -0,0 +1,55 @@
#include "resultmat.h"
/*
 * Print a float result matrix as comma/tab separated text.
 *
 * Output layout:
 *   - a start banner carrying meta->name;
 *   - a line naming what rows and columns stand for;
 *   - if meta->column_array is non-NULL, a header line of int column labels
 *     (preceded by a "\ ," corner cell when row labels are present too);
 *   - one line per row: the int row label if meta->row_array is non-NULL,
 *     then column_size float values read row-major from meta->result_array;
 *   - an end banner carrying meta->name.
 *
 * meta must not be NULL (asserted). result_array is dereferenced without a
 * NULL check whenever row_size and column_size are both positive.
 */
void print_float_result_matrix(struct result_matrix_meta* meta)
{
    assert(meta);

    const int *column_labels = meta->column_array;
    const int *row_labels = meta->row_array;
    const float *cells = meta->result_array;

    printf("---------- %s matrix start ----------\n", meta->name);
    printf("%s (row) \\ %s (column)\n", meta->row_name, meta->column_name);

    if (column_labels) {
        if (row_labels) {
            // Corner cell so the column labels line up with the data columns.
            printf("\\ , \t");
        }
        for (int c = 0; c < meta->column_size; c++) {
            printf(" %d,\t", column_labels[c]);
        }
        printf("\n");
    }

    for (int r = 0; r < meta->row_size; r++) {
        if (row_labels) {
            printf("%3d,\t", row_labels[r]);
        }
        for (int c = 0; c < meta->column_size; c++) {
            printf("%f,\t", cells[r * meta->column_size + c]);
        }
        printf("\n");
    }

    // The end banner's %s previously had no matching argument (undefined
    // behaviour); pass the matrix name as the start banner does.
    printf("---------- %s matrix end ----------\n", meta->name);
}
void matrix_print_example()
{
DEFINE_FLOAT_RESULT_MATRIX(test,testrow,5,testcol,10);
// ({
// struct result_matrix_meta test_matrix_meta;
// float test_result_array[5][10] = {0};
// int test_column_array[10] = {0};
// int testrow_array[5] = {0};
// test_matrix_meta.name = "test";
// test_matrix_meta.column_name = "testcol";
// test_matrix_meta.row_name = "testrow";
// test_matrix_meta.column_size = 10;
// test_matrix_meta.row_size = 5;
// test_matrix_meta.result_array = test_result_array;
// test_matrix_meta.column_array = test_column_array;
// test_matrix_meta.row_array = test_row_array;
// })
FOR(x,5) { test_row_array[x] = x; }
FOR(x,10) { test_column_array[x] = x; }
FOR(x,5) {
FOR(y,10) {
test_result_array[x][y] = rand();
}
}
print_float_result_matrix(&test_matrix_meta);
}