Support KModel V4
This commit is contained in:
parent
49d25c8f4a
commit
6c201580a0
File diff suppressed because it is too large
Load Diff
|
@ -22,6 +22,11 @@ ENDIF ()
|
|||
# definitions in macros
|
||||
add_definitions(-DCONFIG_LOG_LEVEL=LOG_VERBOSE -DCONFIG_LOG_ENABLE -DCONFIG_LOG_COLORS -DLOG_KERNEL -D__riscv64 -DLV_CONF_INCLUDE_SIMPLE)
|
||||
|
||||
# xtl options
|
||||
add_definitions(-DTCB_SPAN_NO_EXCEPTIONS -DTCB_SPAN_NO_CONTRACT_CHECKING)
|
||||
# nncase options
|
||||
add_definitions(-DNNCASE_TARGET=k210)
|
||||
|
||||
if (NOT SDK_ROOT)
|
||||
get_filename_component(_SDK_ROOT ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
|
||||
global_set(SDK_ROOT ${_SDK_ROOT})
|
||||
|
|
|
@ -40,6 +40,7 @@ if (BUILDING_SDK)
|
|||
-Wno-error=unused-but-set-variable
|
||||
-Wno-error=unused-variable
|
||||
-Wno-error=deprecated-declarations
|
||||
-Wno-multichar
|
||||
-Wextra
|
||||
-Werror=frame-larger-than=32768
|
||||
-Wno-unused-parameter
|
||||
|
|
|
@ -103,7 +103,7 @@ SECTIONS
|
|||
{
|
||||
PROVIDE_HIDDEN (__init_array_start = .);
|
||||
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
|
||||
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
|
||||
*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)
|
||||
PROVIDE_HIDDEN (__init_array_end = .);
|
||||
} >ram AT>ram :ram_ro
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
# Collect every header, C/C++ source and assembly file below this directory.
# Each pattern is listed exactly once.
FILE(GLOB_RECURSE LIB_SRC
    "${CMAKE_CURRENT_LIST_DIR}/*.h"
    "${CMAKE_CURRENT_LIST_DIR}/*.hpp"
    "${CMAKE_CURRENT_LIST_DIR}/*.c"
    "${CMAKE_CURRENT_LIST_DIR}/*.cpp"
    "${CMAKE_CURRENT_LIST_DIR}/*.s"
    "${CMAKE_CURRENT_LIST_DIR}/*.S"
    )
|
||||
|
@ -16,7 +16,8 @@ FILE(GLOB_RECURSE ASSEMBLY_FILES
|
|||
"${CMAKE_CURRENT_LIST_DIR}/*.S"
|
||||
)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_LIST_DIR}/drivers/include ${CMAKE_CURRENT_LIST_DIR}/bsp/include)
|
||||
include_directories(${SDK_ROOT}/third_party/xtl/include)
|
||||
include_directories(${CMAKE_CURRENT_LIST_DIR}/drivers/include ${CMAKE_CURRENT_LIST_DIR}/bsp/include ${CMAKE_CURRENT_LIST_DIR}/nncase/include)
|
||||
|
||||
SET_PROPERTY(SOURCE ${ASSEMBLY_FILES} PROPERTY LANGUAGE C)
|
||||
SET_SOURCE_FILES_PROPERTIES(${ASSEMBLY_FILES} PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp -D __riscv64")
|
||||
|
|
|
@ -663,18 +663,31 @@ typedef void (*kpu_done_callback_t)(void *userdata);
|
|||
|
||||
typedef struct
|
||||
{
|
||||
const uint8_t *model_buffer;
|
||||
uint8_t *main_buffer;
|
||||
uint32_t output_count;
|
||||
const kpu_model_output_t *outputs;
|
||||
const kpu_model_layer_header_t *layer_headers;
|
||||
const uint8_t *body_start;
|
||||
uint32_t layers_length;
|
||||
volatile uint32_t current_layer;
|
||||
const uint8_t *volatile current_body;
|
||||
dmac_channel_number_t dma_ch;
|
||||
kpu_done_callback_t done_callback;
|
||||
void *userdata;
|
||||
int is_nncase;
|
||||
|
||||
union
|
||||
{
|
||||
struct
|
||||
{
|
||||
const uint8_t *model_buffer;
|
||||
uint8_t *main_buffer;
|
||||
uint32_t output_count;
|
||||
const kpu_model_output_t *outputs;
|
||||
const kpu_model_layer_header_t *layer_headers;
|
||||
const uint8_t *body_start;
|
||||
uint32_t layers_length;
|
||||
volatile uint32_t current_layer;
|
||||
const uint8_t *volatile current_body;
|
||||
dmac_channel_number_t dma_ch;
|
||||
kpu_done_callback_t done_callback;
|
||||
void *userdata;
|
||||
};
|
||||
|
||||
struct
|
||||
{
|
||||
void* nncase_ctx;
|
||||
};
|
||||
};
|
||||
} kpu_model_context_t;
|
||||
|
||||
typedef struct
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include "dmac.h"
|
||||
#include "kpu.h"
|
||||
#include "printf.h"
|
||||
#include "nncase.h"
|
||||
|
||||
#define LAYER_BURST_SIZE 12
|
||||
|
||||
|
@ -1361,6 +1362,7 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
|||
|
||||
if(header->version == 3 && header->arch == 0)
|
||||
{
|
||||
ctx->is_nncase = 0;
|
||||
ctx->model_buffer = buffer;
|
||||
ctx->output_count = header->output_count;
|
||||
ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
|
||||
|
@ -1370,6 +1372,9 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
|||
ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
|
||||
if(!ctx->main_buffer)
|
||||
return -1;
|
||||
} else if(header->version == 'KMDL')
|
||||
{
|
||||
return nncase_load_kmodel(ctx, buffer);
|
||||
} else
|
||||
{
|
||||
return -1;
|
||||
|
@ -1380,6 +1385,9 @@ int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
|||
|
||||
int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
|
||||
{
|
||||
if(ctx->is_nncase)
|
||||
return nncase_get_output(ctx, index, data, size);
|
||||
|
||||
if(index >= ctx->output_count)
|
||||
return -1;
|
||||
|
||||
|
@ -1391,6 +1399,9 @@ int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, siz
|
|||
|
||||
/**
 * Releases the resources held by a model context.
 *
 * For nncase models the work is delegated to nncase_model_free. Otherwise the
 * main buffer is freed and the pointer cleared, so a second call is harmless.
 *
 * @param ctx Model context previously filled in by kpu_load_kmodel.
 */
void kpu_model_free(kpu_model_context_t *ctx)
{
    if(ctx->is_nncase)
    {
        /* A void function must not `return` an expression in ISO C, so call
         * first and return separately. */
        nncase_model_free(ctx);
        return;
    }

    free(ctx->main_buffer);
    ctx->main_buffer = NULL;
}
|
||||
|
@ -1595,6 +1606,9 @@ static void ai_step_not_isr(void *userdata)
|
|||
|
||||
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
|
||||
{
|
||||
if(ctx->is_nncase)
|
||||
return nncase_run_kmodel(ctx, src, dma_ch, done_callback, userdata);
|
||||
|
||||
ctx->dma_ch = dma_ch;
|
||||
ctx->done_callback = done_callback;
|
||||
ctx->userdata = userdata;
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
---
|
||||
BasedOnStyle: WebKit
|
||||
BreakBeforeBraces: Allman
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: 'true'
|
||||
UseTab: Never
|
||||
PointerAlignment: Right
|
||||
|
||||
...
|
|
@ -0,0 +1,97 @@
|
|||
#pragma once
|
||||
#include <array>
|
||||
#include <optional>
|
||||
#include <stdint.h>
|
||||
|
||||
namespace nncase
{
/// Element types a tensor buffer can hold.
typedef enum _datatype
{
    dt_float32,
    dt_uint8
} datatype_t;

/// Padding applied before and after one axis.
struct padding
{
    int32_t before;
    int32_t after;

    /// Total padding added on the axis.
    int32_t sum() const noexcept { return before + after; }

    /// Padding with both sides value-initialised to zero.
    static padding zero() noexcept { return {}; }
};

/// A [min, max] pair of values of type T.
template <class T>
struct value_range
{
    T min;
    T max;
};

/// Reduction operators.
typedef enum _reduce_op
{
    reduce_mean,
    reduce_min,
    reduce_max
} reduce_op_t;

/// Element-wise binary operators.
typedef enum _binary_op
{
    binary_add,
    binary_sub,
    binary_mul,
    binary_div
} binary_op_t;

/// Quantization parameters: integer zero point and floating-point scale.
typedef struct _quant_param
{
    int32_t zero_point;
    float scale;
} quant_param_t;

/// Two parameter sets are equal when both fields compare exactly equal.
inline bool operator==(const quant_param_t &lhs, const quant_param_t &rhs) noexcept
{
    return lhs.zero_point == rhs.zero_point && lhs.scale == rhs.scale;
}

/// A multiplier paired with a shift amount.
struct fixed_mul
{
    float mul;
    int8_t shift;
};

/// Memory regions a memory_range can refer to.
typedef enum _memory_type
{
    mem_const,
    mem_main,
    mem_k210_kpu
} memory_type_t;

using runtime_shape_t = std::array<int, 4>;
using runtime_paddings_t = std::array<padding, 4>;

/// A single value of up to four bytes stored in raw form.
///
/// `type` is NOT set by the converting constructor; callers that rely on it
/// must assign it themselves.
struct scalar
{
    datatype_t type;
    std::array<uint8_t, 4> storage;

    scalar() = default;

    /// Stores a copy of `value` in the raw storage.
    ///
    /// Taking `const T &` (rather than a forwarding reference) keeps T a plain
    /// object type for lvalue arguments, so `as<T>()` is always well-formed,
    /// and lets the implicit copy constructor win when copying a `scalar`.
    template <class T>
    scalar(const T &value)
    {
        static_assert(sizeof(T) <= sizeof(storage), "value does not fit in scalar storage");
        as<T>() = value;
    }

    /// Views the storage as a mutable T.
    template <class T>
    T &as() noexcept { return *reinterpret_cast<T *>(storage.data()); }

    /// Views the storage as a read-only T.
    template <class T>
    const T &as() const noexcept { return *reinterpret_cast<const T *>(storage.data()); }
};

/// A typed region inside one of the runtime memory pools.
struct memory_range
{
    memory_type_t memory_type;
    datatype_t datatype;
    uint32_t start;
    uint32_t size;
};
}
|
|
@ -0,0 +1,257 @@
|
|||
#pragma once
|
||||
#include "../utils.h"
|
||||
#include <runtime_op_utility.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
namespace cpu
|
||||
{
|
||||
inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
|
||||
int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
|
||||
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
|
||||
{
|
||||
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
|
||||
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
|
||||
|
||||
for (int batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
int in_y_origin = (oy * stride_h) - padding_h.before;
|
||||
int in_x_origin = (ox * stride_w) - padding_w.before;
|
||||
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
|
||||
for (int oc = 0; oc < out_channels; oc++)
|
||||
{
|
||||
auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
|
||||
float value = bias[oc];
|
||||
|
||||
for (int ky = filter_y_start; ky < filter_y_end; ky++)
|
||||
{
|
||||
for (int kx = filter_xSstart; kx < filter_x_end; kx++)
|
||||
{
|
||||
int in_y = in_y_origin + dilation_h * ky;
|
||||
int in_x = in_x_origin + dilation_w * kx;
|
||||
|
||||
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
|
||||
auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
|
||||
|
||||
for (int ic = 0; ic < in_shape[3]; ic++)
|
||||
value += in_pix[ic] * w_pix[ic];
|
||||
}
|
||||
}
|
||||
|
||||
*output++ = details::apply_activation(value, fused_activation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void depthwise_conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
|
||||
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
|
||||
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
|
||||
{
|
||||
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
|
||||
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
|
||||
|
||||
for (int batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
int in_y_origin = (oy * stride_h) - padding_h.before;
|
||||
int in_x_origin = (ox * stride_w) - padding_w.before;
|
||||
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
|
||||
for (int oc = 0; oc < in_shape[3]; oc++)
|
||||
{
|
||||
auto w_oc = weights + (size_t)oc * filter_h * filter_w;
|
||||
float value = bias[oc];
|
||||
|
||||
for (int ky = filter_y_start; ky < filter_y_end; ky++)
|
||||
{
|
||||
for (int kx = filter_xSstart; kx < filter_x_end; kx++)
|
||||
{
|
||||
int in_y = in_y_origin + dilation_h * ky;
|
||||
int in_x = in_x_origin + dilation_w * kx;
|
||||
|
||||
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
|
||||
auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
|
||||
|
||||
value += in_pix[oc] * w_pix[0];
|
||||
}
|
||||
}
|
||||
|
||||
*output++ = details::apply_activation(value, fused_activation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class TBinaryOp, class TOutputOp>
|
||||
void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape,
|
||||
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
|
||||
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
|
||||
{
|
||||
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
|
||||
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
|
||||
|
||||
for (int batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
int in_y_origin = (oy * stride_h) - padding_h.before;
|
||||
int in_x_origin = (ox * stride_w) - padding_w.before;
|
||||
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
|
||||
for (int oc = 0; oc < in_shape[3]; oc++)
|
||||
{
|
||||
float value = init_value;
|
||||
int32_t kernel_count = 0;
|
||||
|
||||
for (int ky = filter_y_start; ky < filter_y_end; ky++)
|
||||
{
|
||||
for (int kx = filter_xSstart; kx < filter_x_end; kx++)
|
||||
{
|
||||
int in_y = in_y_origin + dilation_h * ky;
|
||||
int in_x = in_x_origin + dilation_w * kx;
|
||||
|
||||
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
|
||||
|
||||
value = binary_op(value, in_pix[oc]);
|
||||
kernel_count++;
|
||||
}
|
||||
}
|
||||
|
||||
*output++ = details::apply_activation(window_op(value, kernel_count), fused_activation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
|
||||
int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
|
||||
const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
|
||||
{
|
||||
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
|
||||
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
|
||||
|
||||
for (int batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
int in_y_origin = (oy * stride_h) - padding_h.before;
|
||||
int in_x_origin = (ox * stride_w) - padding_w.before;
|
||||
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
|
||||
for (int oc = 0; oc < out_channels; oc++)
|
||||
{
|
||||
auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
|
||||
int32_t value = bias[oc];
|
||||
|
||||
for (int ky = filter_y_start; ky < filter_y_end; ky++)
|
||||
{
|
||||
for (int kx = filter_xSstart; kx < filter_x_end; kx++)
|
||||
{
|
||||
int in_y = in_y_origin + dilation_h * ky;
|
||||
int in_x = in_x_origin + dilation_w * kx;
|
||||
|
||||
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
|
||||
auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
|
||||
|
||||
for (int ic = 0; ic < in_shape[3]; ic++)
|
||||
value += (in_pix[ic] - input_offset) * (w_pix[ic] - filter_offset);
|
||||
}
|
||||
}
|
||||
|
||||
value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
|
||||
*output++ = (uint8_t)std::clamp(value, 0, 255);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void quantized_depthwise_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
|
||||
int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
|
||||
const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
|
||||
{
|
||||
const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
|
||||
const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
|
||||
|
||||
for (int batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
int in_y_origin = (oy * stride_h) - padding_h.before;
|
||||
int in_x_origin = (ox * stride_w) - padding_w.before;
|
||||
int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
|
||||
for (int oc = 0; oc < in_shape[3]; oc++)
|
||||
{
|
||||
auto w_oc = weights + (size_t)oc * filter_h * filter_w;
|
||||
int32_t value = bias[oc];
|
||||
|
||||
for (int ky = filter_y_start; ky < filter_y_end; ky++)
|
||||
{
|
||||
for (int kx = filter_xSstart; kx < filter_x_end; kx++)
|
||||
{
|
||||
int in_y = in_y_origin + dilation_h * ky;
|
||||
int in_x = in_x_origin + dilation_w * kx;
|
||||
|
||||
auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
|
||||
auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
|
||||
|
||||
value += (in_pix[oc] - input_offset) * (w_pix[0] - filter_offset);
|
||||
}
|
||||
}
|
||||
|
||||
value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
|
||||
*output++ = (uint8_t)std::clamp(value, 0, 255);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,256 @@
|
|||
#pragma once
|
||||
#include "../utils.h"
|
||||
#include <runtime_op_utility.h>
|
||||
#include <targets/k210/k210_runtime_op_utility.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
namespace k210
|
||||
{
|
||||
inline void kpu_upload(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape)
|
||||
{
|
||||
if (in_shape[3] % 64 == 0)
|
||||
{
|
||||
std::copy(src, src + kernels::details::compute_size(in_shape), dest);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto layout = targets::k210::get_kpu_row_layout(in_shape[3]);
|
||||
auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]);
|
||||
|
||||
for (int32_t batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto batch_origin = dest + (size_t)batch * fmap_size;
|
||||
for (int32_t oc = 0; oc < in_shape[1]; oc++)
|
||||
{
|
||||
auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch;
|
||||
for (int32_t y = 0; y < in_shape[2]; y++)
|
||||
{
|
||||
auto y_origin = channel_origin + (size_t)y * layout.row_len * 64;
|
||||
std::copy(src, src + in_shape[3], y_origin);
|
||||
src += in_shape[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if NNCASE_TARGET_K210_SIMULATOR
|
||||
|
||||
inline void kpu_download(const uint8_t *src, uint8_t *dest, const runtime_shape_t &in_shape)
|
||||
{
|
||||
if (in_shape[3] % 64 == 0)
|
||||
{
|
||||
std::copy(src, src + kernels::details::compute_size(in_shape), dest);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto layout = targets::k210::get_kpu_row_layout(in_shape[3]);
|
||||
auto fmap_size = targets::k210::get_kpu_bytes(in_shape[3], in_shape[2], in_shape[1]);
|
||||
|
||||
for (int32_t batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto batch_origin = src + (size_t)batch * fmap_size;
|
||||
for (int32_t oc = 0; oc < in_shape[1]; oc++)
|
||||
{
|
||||
auto channel_origin = batch_origin + (size_t)oc / layout.groups * layout.row_len * in_shape[2] * 64 + (size_t)oc % layout.groups * layout.row_pitch;
|
||||
for (int32_t y = 0; y < in_shape[2]; y++)
|
||||
{
|
||||
auto y_origin = channel_origin + (size_t)y * layout.row_len * 64;
|
||||
for (int32_t x = 0; x < in_shape[3]; x++)
|
||||
*dest++ = y_origin[x];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <bool IsDepthwise, int32_t FilterSize>
|
||||
void kpu_conv2d(const uint8_t *input, int64_t *workspace, uint8_t *output, const uint8_t *weights, int32_t in_h, int32_t in_w, int32_t in_channels, int32_t out_channels, uint8_t pad_value, int32_t arg_x,
|
||||
int32_t shift_x, int32_t arg_w, int32_t shift_w, int64_t arg_add, const targets::k210::kpu_batchnorm_segment *batchnorm, const targets::k210::kpu_activation_table_t &activation)
|
||||
{
|
||||
const auto channel_size = size_t(in_h) * in_w;
|
||||
// conv
|
||||
{
|
||||
auto out_it = workspace;
|
||||
const auto pad = FilterSize == 1 ? 0 : 1;
|
||||
const auto groups = IsDepthwise ? out_channels : 1;
|
||||
const auto g_ic = IsDepthwise ? 1 : in_channels / groups;
|
||||
const auto g_oc = IsDepthwise ? 1 : out_channels;
|
||||
|
||||
for (int32_t og = 0; og < groups; og++)
|
||||
{
|
||||
const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * FilterSize * FilterSize;
|
||||
|
||||
for (int32_t oc = 0; oc < g_oc; oc++)
|
||||
{
|
||||
const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * FilterSize * FilterSize;
|
||||
|
||||
for (int32_t oy = 0; oy < in_h; oy++)
|
||||
{
|
||||
for (int32_t ox = 0; ox < in_w; ox++)
|
||||
{
|
||||
const int32_t in_y_origin = oy - pad;
|
||||
const int32_t in_x_origin = ox - pad;
|
||||
int64_t value = 0;
|
||||
int64_t sum_x = 0, sum_w = 0;
|
||||
|
||||
for (int32_t ic = 0; ic < g_ic; ic++)
|
||||
{
|
||||
const uint8_t *in_c_p = input + ((size_t)og * g_ic + ic) * in_h * in_w;
|
||||
const uint8_t *w_ic_p = w_oc_p + (size_t)ic * FilterSize * FilterSize;
|
||||
|
||||
for (int32_t ky = 0; ky < FilterSize; ky++)
|
||||
{
|
||||
for (int32_t kx = 0; kx < FilterSize; kx++)
|
||||
{
|
||||
const int32_t in_y = in_y_origin + ky;
|
||||
const int32_t in_x = in_x_origin + kx;
|
||||
|
||||
uint8_t x;
|
||||
if (in_x < 0 || in_x >= in_w
|
||||
|| in_y < 0 || in_y >= in_h)
|
||||
x = pad_value;
|
||||
else
|
||||
x = in_c_p[in_y * in_w + in_x];
|
||||
|
||||
uint8_t w = w_ic_p[ky * FilterSize + kx];
|
||||
|
||||
sum_x += x;
|
||||
sum_w += w;
|
||||
value += (int32_t)x * w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*out_it++ = value + (arg_x * sum_x >> shift_x) + (arg_w * sum_w >> shift_w) + arg_add * g_ic;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// bn act
|
||||
{
|
||||
auto src_it = workspace;
|
||||
auto out_it = output;
|
||||
for (int32_t oc = 0; oc < out_channels; oc++)
|
||||
{
|
||||
const auto &bn = batchnorm[oc];
|
||||
for (size_t i = 0; i < channel_size; i++)
|
||||
{
|
||||
auto value = (*src_it++ * bn.mul >> bn.shift) + bn.add;
|
||||
auto &seg = *std::find_if(activation.rbegin(), activation.rend(), [value](const targets::k210::kpu_activation_segment &seg) {
|
||||
return value > seg.start_x;
|
||||
});
|
||||
value = runtime::carry_shift((value - seg.start_x) * seg.mul, seg.shift);
|
||||
*out_it++ = (uint8_t)std::clamp(value, int64_t(0), int64_t(255));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void kpu_pool2d(const uint8_t *input, uint8_t *output, int32_t in_h, int32_t in_w, int32_t in_channels, targets::k210::kpu_pool_type_t pool_type)
|
||||
{
|
||||
using namespace targets::k210;
|
||||
|
||||
const auto filter = get_kpu_filter_size(pool_type);
|
||||
const auto stride = get_kpu_filter_stride(pool_type);
|
||||
const auto out_h = get_kpu_pool_output_size(in_h, pool_type);
|
||||
const auto out_w = get_kpu_pool_output_size(in_w, pool_type);
|
||||
|
||||
for (int32_t oc = 0; oc < in_channels; oc++)
|
||||
{
|
||||
auto in_c_p = input + (size_t)oc * in_h * in_w;
|
||||
|
||||
for (int32_t oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int32_t ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
const int32_t in_y_origin = oy * stride;
|
||||
const int32_t in_x_origin = ox * stride;
|
||||
int32_t value = 0;
|
||||
|
||||
switch (pool_type)
|
||||
{
|
||||
case kpu_pool_bypass:
|
||||
{
|
||||
const int32_t in_y = in_y_origin;
|
||||
const int32_t in_x = in_x_origin;
|
||||
|
||||
value = in_c_p[in_y * in_w + in_x];
|
||||
break;
|
||||
}
|
||||
case kpu_pool_max_2_s2:
|
||||
case kpu_pool_max_2_s1:
|
||||
case kpu_pool_max_4_s4:
|
||||
{
|
||||
for (int32_t ky = 0; ky < filter; ky++)
|
||||
{
|
||||
for (int32_t kx = 0; kx < filter; kx++)
|
||||
{
|
||||
const int32_t in_y = in_y_origin + ky;
|
||||
const int32_t in_x = in_x_origin + kx;
|
||||
int32_t in_v;
|
||||
|
||||
if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
|
||||
in_v = 0;
|
||||
else
|
||||
in_v = in_c_p[in_y * in_w + in_x];
|
||||
|
||||
value = std::max(value, in_v);
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case kpu_pool_mean_2_s2:
|
||||
case kpu_pool_mean_2_s1:
|
||||
case kpu_pool_mean_4_s4:
|
||||
{
|
||||
for (int32_t ky = 0; ky < filter; ky++)
|
||||
{
|
||||
for (int32_t kx = 0; kx < filter; kx++)
|
||||
{
|
||||
const int32_t in_y = std::clamp(in_y_origin + ky, 0, in_h - 1);
|
||||
const int32_t in_x = std::clamp(in_x_origin + kx, 0, in_w - 1);
|
||||
const int32_t in_v = in_c_p[in_y * in_w + in_x];
|
||||
|
||||
value += in_v;
|
||||
}
|
||||
}
|
||||
|
||||
value /= filter * filter;
|
||||
break;
|
||||
}
|
||||
case kpu_pool_left_top_2_s2:
|
||||
case kpu_pool_left_top_4_s4:
|
||||
case kpu_pool_right_top_2_s2:
|
||||
{
|
||||
auto k_off = get_kpu_select_pool_offset(pool_type);
|
||||
const int32_t in_y = in_y_origin + k_off[0];
|
||||
const int32_t in_x = in_x_origin + k_off[1];
|
||||
int32_t in_v;
|
||||
|
||||
if (in_y < 0 || in_y >= in_h || in_x < 0 || in_x >= in_w)
|
||||
in_v = 0;
|
||||
else
|
||||
in_v = in_c_p[in_y * in_w + in_x];
|
||||
|
||||
value = in_v;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
*output++ = (uint8_t)value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,422 @@
|
|||
#pragma once
|
||||
#include "../utils.h"
|
||||
#include <cmath>
|
||||
#include <runtime_op_utility.h>
|
||||
#include <xtl/xspan.hpp>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
namespace neutral
|
||||
{
|
||||
template <class TOp>
|
||||
void binary(const float *input_a, const float *input_b, float *output, const runtime_shape_t &in_a_shape,
|
||||
const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, const value_range<float> &fused_activation, TOp &&op)
|
||||
{
|
||||
for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
|
||||
{
|
||||
for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
|
||||
{
|
||||
for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
|
||||
{
|
||||
for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
|
||||
{
|
||||
runtime_shape_t in_off = { d0, d1, d2, d3 };
|
||||
const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
|
||||
const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
|
||||
const auto a = input_a[offset(in_a_shape, in_a_off)];
|
||||
const auto b = input_b[offset(in_b_shape, in_b_off)];
|
||||
|
||||
output[offset(out_shape, in_off)] = kernels::details::apply_activation(op(a, b), fused_activation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class TRange, class TPtrGetter = details::default_ptr_getter<uint8_t, TRange>>
|
||||
inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
|
||||
{
|
||||
for (size_t oc = 0; oc < outer_size; oc++)
|
||||
{
|
||||
for (size_t i = 0; i < inputs.size(); i++)
|
||||
{
|
||||
auto size = inner_size * concat_dims[i];
|
||||
auto src = getter(inputs[i]) + oc * size;
|
||||
std::copy(src, src + size, output);
|
||||
output += size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
|
||||
int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
|
||||
const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
|
||||
{
|
||||
const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
|
||||
const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
|
||||
const auto g_ic = in_shape[1] / groups;
|
||||
const auto g_oc = out_channels / groups;
|
||||
|
||||
for (int32_t batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int32_t og = 0; og < groups; og++)
|
||||
{
|
||||
const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
|
||||
const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
|
||||
|
||||
for (int32_t oc = 0; oc < g_oc; oc++)
|
||||
{
|
||||
const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
|
||||
|
||||
for (int32_t oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int32_t ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
|
||||
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
|
||||
const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
|
||||
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
float value = bias[oc];
|
||||
|
||||
for (int32_t ic = 0; ic < g_ic; ic++)
|
||||
{
|
||||
const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
|
||||
const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
|
||||
|
||||
for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
|
||||
{
|
||||
for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
|
||||
{
|
||||
const int32_t in_y = in_y_origin + dilation_h * ky;
|
||||
const int32_t in_x = in_x_origin + dilation_w * kx;
|
||||
|
||||
const float in_v = in_c_p[in_y * in_shape[3] + in_x];
|
||||
const float w = w_ic_p[ky * filter_w + kx];
|
||||
|
||||
value += in_v * w;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*output++ = details::apply_activation(value, fused_activation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class TQ>
|
||||
void dequantize(const TQ *input, float *output, size_t count, const quant_param_t ¶m)
|
||||
{
|
||||
float div = 1.f / param.scale;
|
||||
|
||||
for (size_t i = 0; i < count; i++)
|
||||
{
|
||||
output[i] = (input[i] - param.zero_point) * div;
|
||||
}
|
||||
}
|
||||
|
||||
inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
|
||||
{
|
||||
for (size_t oy = 0; oy < a_rows; oy++)
|
||||
{
|
||||
for (size_t ox = 0; ox < b_cols; ox++)
|
||||
{
|
||||
float value = bias[ox];
|
||||
for (size_t i = 0; i < a_cols; i++)
|
||||
{
|
||||
const auto a = input_a[oy * a_cols + i];
|
||||
const auto b = input_b[i * b_cols + ox];
|
||||
value += a * b;
|
||||
}
|
||||
|
||||
output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void pad(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_paddings_t &paddings, T pad_value)
|
||||
{
|
||||
runtime_shape_t out_shape = { in_shape[0] + paddings[0].sum(),
|
||||
in_shape[1] + paddings[1].sum(),
|
||||
in_shape[2] + paddings[2].sum(),
|
||||
in_shape[3] + paddings[3].sum() };
|
||||
|
||||
for (int d0 = 0; d0 < out_shape[0]; d0++)
|
||||
{
|
||||
auto d0_origin = -paddings[0].before;
|
||||
auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int d1 = 0; d1 < out_shape[1]; d1++)
|
||||
{
|
||||
auto d1_origin = -paddings[1].before;
|
||||
auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int d2 = 0; d2 < out_shape[2]; d2++)
|
||||
{
|
||||
auto d2_origin = -paddings[2].before;
|
||||
auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3];
|
||||
|
||||
for (int d3 = 0; d3 < out_shape[3]; d3++)
|
||||
{
|
||||
auto d3_origin = -paddings[3].before;
|
||||
|
||||
if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after
|
||||
|| d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after
|
||||
|| d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after
|
||||
|| d3 < paddings[3].before || d1 >= out_shape[3] - paddings[3].after)
|
||||
*output++ = pad_value;
|
||||
else
|
||||
*output++ = in2[d3_origin + d3];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class TQ>
|
||||
void quantize(const float *input, TQ *output, size_t count, const quant_param_t ¶m)
|
||||
{
|
||||
for (size_t i = 0; i < count; i++)
|
||||
{
|
||||
int32_t tmp = (int32_t)roundf(input[i] * param.scale + param.zero_point);
|
||||
output[i] = std::clamp(tmp, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
|
||||
}
|
||||
}
|
||||
|
||||
template <class TReducer>
|
||||
void reduce(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, const runtime_shape_t &reduced_shape, TReducer &&reducer)
|
||||
{
|
||||
std::fill(output, output + kernels::details::compute_size(reduced_shape), init_value);
|
||||
|
||||
for (int32_t d0 = 0; d0 < in_shape[0]; d0++)
|
||||
{
|
||||
for (int32_t d1 = 0; d1 < in_shape[1]; d1++)
|
||||
{
|
||||
for (int32_t d2 = 0; d2 < in_shape[2]; d2++)
|
||||
{
|
||||
for (int32_t d3 = 0; d3 < in_shape[3]; d3++)
|
||||
{
|
||||
runtime_shape_t in_off = { d0, d1, d2, d3 };
|
||||
auto out_off = kernels::details::get_reduced_offset(in_off, reduced_shape);
|
||||
const auto a = input[offset(in_shape, in_off)];
|
||||
auto &b = output[offset(reduced_shape, out_off)];
|
||||
b = reducer(b, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Applies `op` element-wise: `output[i] = op(input[i])` for `count` elements,
/// in ascending index order.
template <class TOp>
void unary(const float *input, float *output, size_t count, TOp &&op)
{
    const float *const last = input + count;
    while (input != last)
    {
        *output = op(*input);
        ++input;
        ++output;
    }
}
|
||||
|
||||
template <class TBinaryOp, class TOutputOp>
|
||||
void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, int32_t filter_h, int32_t filter_w,
|
||||
int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
|
||||
const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
|
||||
{
|
||||
const auto out_h = kernels::details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
|
||||
const auto out_w = kernels::details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
|
||||
runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };
|
||||
|
||||
for (int32_t batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
for (int32_t oc = 0; oc < in_shape[1]; oc++)
|
||||
{
|
||||
for (int32_t oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
for (int32_t ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
|
||||
const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
|
||||
const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
|
||||
const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
|
||||
const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
|
||||
const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
|
||||
float value = init_value;
|
||||
int32_t kernel_count = 0;
|
||||
|
||||
for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
|
||||
{
|
||||
for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
|
||||
{
|
||||
const int32_t in_y = in_y_origin + dilation_h * ky;
|
||||
const int32_t in_x = in_x_origin + dilation_w * kx;
|
||||
|
||||
const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];
|
||||
|
||||
value = binary_op(value, in_v);
|
||||
kernel_count++;
|
||||
}
|
||||
}
|
||||
|
||||
output[offset(out_shape, { batch, oc, oy, ox })] = kernels::details::apply_activation(window_op(value, kernel_count), fused_activation);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void resize_nearest_neighbor(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w)
|
||||
{
|
||||
auto height_scale = (float)in_shape[2] / out_h;
|
||||
auto width_scale = (float)in_shape[3] / out_w;
|
||||
|
||||
for (int batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto in_batch = input + batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oc = 0; oc < in_shape[1]; oc++)
|
||||
{
|
||||
auto in_c = in_batch + oc * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
auto in_y = std::min((int32_t)floorf(oy * height_scale), in_shape[2] - 1);
|
||||
auto in_row = in_c + in_y * in_shape[3];
|
||||
|
||||
for (int ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
auto in_x = std::min((int32_t)floorf(ox * width_scale), in_shape[3] - 1);
|
||||
*output++ = in_row[in_x];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void resize_bilinear(const float *input, float *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
|
||||
{
|
||||
auto height_scale = (float)in_shape[2] / out_h;
|
||||
auto width_scale = (float)in_shape[3] / out_w;
|
||||
if (align_corners && out_h > 1)
|
||||
height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
|
||||
if (align_corners && out_w > 1)
|
||||
width_scale = (float)(in_shape[3] - 1) / (out_w - 1);
|
||||
|
||||
auto destIdx = 0;
|
||||
for (int batch = 0; batch < in_shape[0]; batch++)
|
||||
{
|
||||
auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oc = 0; oc < in_shape[1]; oc++)
|
||||
{
|
||||
auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];
|
||||
|
||||
for (int oy = 0; oy < out_h; oy++)
|
||||
{
|
||||
auto in_y = oy * height_scale;
|
||||
auto in_y0 = (int)floorf(in_y);
|
||||
auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
|
||||
|
||||
for (int ox = 0; ox < out_w; ox++)
|
||||
{
|
||||
auto in_x = ox * width_scale;
|
||||
auto in_x0 = (int)floorf(in_x);
|
||||
auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);
|
||||
|
||||
auto v0 = in_c[in_y0 * in_shape[3] + in_x0];
|
||||
auto v1 = in_c[in_y1 * in_shape[3] + in_x0];
|
||||
auto v2 = in_c[in_y0 * in_shape[3] + in_x1];
|
||||
auto v3 = in_c[in_y1 * in_shape[3] + in_x1];
|
||||
|
||||
auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0));
|
||||
auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0));
|
||||
auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
|
||||
auto a3 = (in_y - in_y0) * (in_x - in_x0);
|
||||
|
||||
output[destIdx++] = v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Row-wise softmax with temperature `beta`.
///
/// Treats the buffers as `outer_size` rows of `inner_size` floats and computes
/// `exp((x - max) * beta) / sum` per row. Subtracting the row maximum keeps
/// the exponentials from overflowing.
///
/// @param input      Source buffer (outer_size * inner_size floats).
/// @param output     Destination buffer of the same size.
/// @param beta       Scale applied to the shifted inputs.
/// @param outer_size Number of rows; non-positive values do nothing.
/// @param inner_size Row length; zero does nothing.
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
{
    // An empty row has no maximum: max_element would return the end iterator
    // and dereferencing it is undefined, so bail out early.
    if (outer_size <= 0 || inner_size == 0)
        return;

    // Signed counter matching the type of outer_size (no sign-compare wrap).
    for (int32_t batch = 0; batch < outer_size; batch++)
    {
        const float *src = input + (size_t)batch * inner_size;
        float *dest = output + (size_t)batch * inner_size;

        const float max = *std::max_element(src, src + inner_size);
        float sum = 0;

        for (size_t i = 0; i < inner_size; i++)
        {
            const float value = expf((src[i] - max) * beta);
            sum += value;
            dest[i] = value;
        }

        for (size_t i = 0; i < inner_size; i++)
            dest[i] /= sum;
    }
}
|
||||
|
||||
template <class T>
|
||||
void transpose(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &perm)
|
||||
{
|
||||
runtime_shape_t out_shape;
|
||||
for (size_t i = 0; i < 4; i++)
|
||||
out_shape[i] = in_shape[perm[i]];
|
||||
|
||||
runtime_shape_t i, o;
|
||||
for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
|
||||
{
|
||||
i[perm[3]] = o[3];
|
||||
for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
|
||||
{
|
||||
i[perm[2]] = o[2];
|
||||
for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
|
||||
{
|
||||
i[perm[1]] = o[1];
|
||||
for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
|
||||
{
|
||||
i[perm[0]] = o[0];
|
||||
output[offset(out_shape, o)] = input[offset(in_shape, i)];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void strided_slice(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_shape_t &begin, const runtime_shape_t &end, const runtime_shape_t &strides)
|
||||
{
|
||||
auto loop_cond = [](int32_t i, int32_t stop, int32_t stride) {
|
||||
return stride > 0 ? i < stop : i > stop;
|
||||
};
|
||||
|
||||
for (int32_t d0 = begin[0]; loop_cond(d0, end[0], strides[0]); d0 += strides[0])
|
||||
{
|
||||
auto d0_origin = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
|
||||
for (int d1 = begin[1]; loop_cond(d1, end[1], strides[1]); d1 += strides[1])
|
||||
{
|
||||
auto d1_origin = d0_origin + (size_t)d1 * in_shape[2] * in_shape[3];
|
||||
for (int32_t d2 = begin[2]; loop_cond(d2, end[2], strides[2]); d2 += strides[2])
|
||||
{
|
||||
auto d2_origin = d1_origin + (size_t)d2 * in_shape[3];
|
||||
for (int32_t d3 = begin[3]; loop_cond(d3, end[3], strides[3]); d3 += strides[3])
|
||||
*output++ = d2_origin[d3];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <datatypes.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace kernels
|
||||
{
|
||||
inline size_t offset(const runtime_shape_t &shape, const runtime_shape_t &index)
|
||||
{
|
||||
return (((size_t)index[0] * shape[1] + index[1]) * shape[2] + index[2]) * shape[3] + index[3];
|
||||
}
|
||||
|
||||
namespace details
|
||||
{
|
||||
inline int32_t get_windowed_output_size(int32_t size, int32_t filter, int32_t stride, int32_t dilation, const padding &padding)
|
||||
{
|
||||
auto effective_filter_size = (filter - 1) * dilation + 1;
|
||||
return (size + padding.before + padding.after - effective_filter_size + stride) / stride;
|
||||
}
|
||||
|
||||
inline size_t compute_size(const runtime_shape_t &shape)
|
||||
{
|
||||
return size_t(shape[0]) * shape[1] * shape[2] * shape[3];
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline T apply_activation(T value, value_range<T> activation)
|
||||
{
|
||||
return std::clamp(value, activation.min, activation.max);
|
||||
}
|
||||
|
||||
/// Maps an input index onto a reduced tensor: every component that is out of
/// range for `reduced_shape` becomes 0, the others are kept unchanged.
inline runtime_shape_t get_reduced_offset(const runtime_shape_t &in_offset, const runtime_shape_t &reduced_shape)
{
    runtime_shape_t reduced;
    for (size_t dim = 0; dim < in_offset.size(); dim++)
        reduced[dim] = in_offset[dim] < reduced_shape[dim] ? in_offset[dim] : 0;

    return reduced;
}
|
||||
|
||||
/// Functor that yields a `T *` from a `TRange` by implicit conversion.
template <class T, class TRange>
struct default_ptr_getter
{
    T *operator()(const TRange &range) const noexcept
    {
        T *pointer = range;
        return pointer;
    }
};
|
||||
|
||||
/// Sign-extends the low `Bits` bits of a 32-bit value.
///
/// If bit `Bits - 1` is set, all higher bits are filled with ones before the
/// value is reinterpreted as signed. For `Bits == 32` the value is only
/// reinterpreted.
template <int32_t Bits>
int32_t to_signed(uint32_t value)
{
    // Shifting by the full width of the type is undefined, so the full-width
    // case is excluded at compile time and the shift is never instantiated.
    if constexpr (Bits < 32)
    {
        const uint32_t sign_bit = uint32_t(1) << (Bits - 1);
        if ((value & sign_bit) != 0)
        {
            const uint32_t extension = ~uint32_t(0) << Bits;
            return (int32_t)(value | extension);
        }
    }

    return (int32_t)value;
}

/// Sign-extends the low `Bits` bits of a 64-bit value.
///
/// Same contract as the 32-bit overload; `Bits == 64` only reinterprets the
/// value (the previous code shifted by 64 in that case, which is undefined).
template <int32_t Bits>
int64_t to_signed(uint64_t value)
{
    if constexpr (Bits < 64)
    {
        const uint64_t sign_bit = uint64_t(1) << (Bits - 1);
        if ((value & sign_bit) != 0)
        {
            const uint64_t extension = ~uint64_t(0) << Bits;
            return (int64_t)(value | extension);
        }
    }

    return (int64_t)value;
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
/* Copyright 2018 Canaan Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifndef _NNCASE_H
|
||||
#define _NNCASE_H
|
||||
|
||||
#include "kpu.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer);
|
||||
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size);
|
||||
void nncase_model_free(kpu_model_context_t *ctx);
|
||||
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -0,0 +1,51 @@
|
|||
#pragma once
|
||||
#include <iostream>
|
||||
#include <xtl/xspan.hpp>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
class binary_writer
|
||||
{
|
||||
public:
|
||||
binary_writer(std::ostream &stream)
|
||||
: stream_(stream)
|
||||
{
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void write(T &&value)
|
||||
{
|
||||
stream_.write(reinterpret_cast<const char *>(&value), sizeof(value));
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void write_array(xtl::span<const T> value)
|
||||
{
|
||||
stream_.write(reinterpret_cast<const char *>(value.data()), value.size_bytes());
|
||||
}
|
||||
|
||||
std::streampos position() const
|
||||
{
|
||||
return stream_.tellp();
|
||||
}
|
||||
|
||||
void position(std::streampos pos)
|
||||
{
|
||||
stream_.seekp(pos);
|
||||
}
|
||||
|
||||
void align_position(size_t alignment)
|
||||
{
|
||||
auto pos = position();
|
||||
auto rem = pos % alignment;
|
||||
if (rem != 0)
|
||||
position(pos + std::streamoff(alignment - rem));
|
||||
}
|
||||
|
||||
private:
|
||||
std::ostream &stream_;
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
#pragma once
|
||||
#include "model.h"
|
||||
#include <chrono>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <xtl/xspan.hpp>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
class interpreter_base;
|
||||
typedef void (*run_callback_t)(void *userdata);
|
||||
typedef void (*error_callback_t)(const char *err, void *userdata);
|
||||
typedef void (*node_profile_callback_t)(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata);
|
||||
typedef void (interpreter_base::*interpreter_step_t)();
|
||||
|
||||
class interpreter_base
|
||||
{
|
||||
using clock_t = std::chrono::system_clock;
|
||||
|
||||
public:
|
||||
bool try_load_model(const uint8_t *buffer);
|
||||
|
||||
size_t inputs_size() const noexcept { return model_header_->inputs; }
|
||||
size_t outputs_size() const noexcept { return model_header_->outputs; }
|
||||
size_t nodes_size() const noexcept { return model_header_->nodes; }
|
||||
|
||||
const runtime_shape_t &input_shape_at(size_t index) const noexcept { return input_shapes_.at(index); }
|
||||
const memory_range &input_at(size_t index) const noexcept { return inputs_[index]; }
|
||||
const memory_range &output_at(size_t index) const noexcept { return outputs_[index]; }
|
||||
|
||||
template <class T>
|
||||
xtl::span<T> memory_at(const memory_range &range) const noexcept
|
||||
{
|
||||
auto span = memory_at(range);
|
||||
return { reinterpret_cast<T *>(span.data()), span.size() / sizeof(T) };
|
||||
}
|
||||
|
||||
std::chrono::nanoseconds total_duration() const noexcept { return total_duration_; }
|
||||
|
||||
void run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata);
|
||||
|
||||
protected:
|
||||
virtual bool initialize();
|
||||
virtual xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept;
|
||||
|
||||
private:
|
||||
void step();
|
||||
|
||||
private:
|
||||
const model_header *model_header_;
|
||||
std::unique_ptr<uint8_t[]> main_mem_;
|
||||
xtl::span<const memory_range> inputs_;
|
||||
xtl::span<const memory_range> outputs_;
|
||||
xtl::span<const runtime_shape_t> input_shapes_;
|
||||
xtl::span<const node_header> node_headers_;
|
||||
xtl::span<const uint8_t> constants_;
|
||||
const uint8_t *node_body_start_;
|
||||
error_callback_t on_error_;
|
||||
run_callback_t run_callback_;
|
||||
node_profile_callback_t node_profile_;
|
||||
void *userdata_;
|
||||
size_t cnt_node_;
|
||||
const uint8_t *cnt_node_body_;
|
||||
std::chrono::nanoseconds total_duration_;
|
||||
std::optional<clock_t::time_point> last_time_;
|
||||
runtime_opcode last_op_;
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
#pragma once
|
||||
#include "target_config.h"
|
||||
#include <datatypes.h>
|
||||
#include <runtime/runtime_op.h>
|
||||
#include <xtl/xspan.hpp>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
/// Result of dispatching one node via call_kernel().
/// NOTE(review): the meanings below are inferred from the enumerator names;
/// confirm against the call_kernel implementation.
enum kernel_call_result
{
    kcr_done, ///< presumably: kernel finished synchronously
    kcr_async, ///< presumably: kernel continues asynchronously
    kcr_error ///< presumably: kernel failed
};
|
||||
|
||||
kernel_call_result call_kernel(runtime_opcode opcode, xtl::span<const uint8_t> body, interpreter_t &interpreter, interpreter_step_t step);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
#pragma once
|
||||
#include "../datatypes.h"
|
||||
#include "runtime_op.h"
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
/// Target identifier stored in model_header::target (32-bit).
enum model_target : uint32_t
{
    MODEL_TARGET_CPU = 0,
    MODEL_TARGET_K210 = 1,
};
|
||||
|
||||
struct model_header
|
||||
{
|
||||
uint32_t identifier;
|
||||
uint32_t version;
|
||||
uint32_t flags;
|
||||
model_target target;
|
||||
uint32_t constants;
|
||||
uint32_t main_mem;
|
||||
uint32_t nodes;
|
||||
uint32_t inputs;
|
||||
uint32_t outputs;
|
||||
uint32_t reserved0;
|
||||
};
|
||||
|
||||
constexpr uint32_t MODEL_IDENTIFIER = 'KMDL';
|
||||
constexpr uint32_t MODEL_VERSION = 4;
|
||||
|
||||
struct node_header
|
||||
{
|
||||
runtime_opcode opcode;
|
||||
uint32_t body_size;
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
// Opcode table (X-macro list). The including file defines
// BEGINE_DEFINE_TARGET / DEFINE_RUNTIME_OP(target, id, name, value) /
// END_DEFINE_TARGET before including this file. The numeric values are
// explicit; do not renumber existing entries.

// Neutral
BEGINE_DEFINE_TARGET(neutral)
    DEFINE_RUNTIME_OP(neutral, binary, Binary, 0)
    DEFINE_RUNTIME_OP(neutral, concat, Concat, 1)
    DEFINE_RUNTIME_OP(neutral, conv2d, Conv2D, 2)
    DEFINE_RUNTIME_OP(neutral, dequantize, Dequantize, 3)
    DEFINE_RUNTIME_OP(neutral, matmul, MatMul, 4)
    DEFINE_RUNTIME_OP(neutral, pad, Pad, 5)
    DEFINE_RUNTIME_OP(neutral, quantize, Quantize, 6)
    DEFINE_RUNTIME_OP(neutral, reduce, Reduce, 7)
    DEFINE_RUNTIME_OP(neutral, reduce_window2d, ReduceWindow2D, 8)
    DEFINE_RUNTIME_OP(neutral, memory_copy, MemoryCopy, 9)
    DEFINE_RUNTIME_OP(neutral, resize_bilinear, ResizeBilinear, 10)
    DEFINE_RUNTIME_OP(neutral, resize_nearest_neighbor, ResizeNearestNeighbor, 11)
    DEFINE_RUNTIME_OP(neutral, softmax, Softmax, 12)
    DEFINE_RUNTIME_OP(neutral, transpose, Transpose, 13)
    DEFINE_RUNTIME_OP(neutral, strided_slice, StridedSlice, 14)
END_DEFINE_TARGET()

// CPU
BEGINE_DEFINE_TARGET(cpu)
    DEFINE_RUNTIME_OP(cpu, cpu_conv2d, CPU_CPUConv2D, 1001)
    DEFINE_RUNTIME_OP(cpu, cpu_depthwise_conv2d, CPU_CPUDepthwiseConv2D, 1002)
    DEFINE_RUNTIME_OP(cpu, cpu_reduce_window2d, CPU_CPUReduceWindow2D, 1003)
    DEFINE_RUNTIME_OP(cpu, cpu_quantized_conv2d, CPU_CPUQuantizedConv2D, 1004)
    DEFINE_RUNTIME_OP(cpu, cpu_quantized_depthwise_conv2d, CPU_CPUQuantizedDepthwiseConv2D, 1005)
END_DEFINE_TARGET()

// K210
BEGINE_DEFINE_TARGET(k210)
    DEFINE_RUNTIME_OP(k210, kpu_upload, K210_KPUUpload, 2001)
    DEFINE_RUNTIME_OP(k210, kpu_conv2d, K210_KPUConv2D, 2002)
END_DEFINE_TARGET()
|
|
@ -0,0 +1,37 @@
|
|||
#pragma once
|
||||
#include "../datatypes.h"
|
||||
#include <string_view>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
// First expansion of runtime_op.def: every DEFINE_RUNTIME_OP entry becomes an
// enumerator `rop_<id> = <value>`; the target begin/end markers expand to
// nothing.
#define BEGINE_DEFINE_TARGET(...)
#define DEFINE_RUNTIME_OP(target, id, name, value) rop_##id = value,
#define END_DEFINE_TARGET()

/// Opcode of a runtime node (32-bit), generated from runtime_op.def.
enum runtime_opcode : uint32_t
{
#include "runtime_op.def"
};

// Second expansion: every entry becomes a `case` label returning the
// stringified display name.
#undef DEFINE_RUNTIME_OP
#define DEFINE_RUNTIME_OP(target, id, name, value) \
    case rop_##id:                                 \
        return #name;

/// Display name of `opcode`, or an empty string_view for unknown values.
constexpr std::string_view node_opcode_names(runtime_opcode opcode)
{
    switch (opcode)
    {
#include "runtime_op.def"
    default:
        return {};
    }
}

// Keep the helper macros from leaking to includers.
#undef BEGINE_DEFINE_TARGET
#undef DEFINE_RUNTIME_OP
#undef END_DEFINE_TARGET
|
||||
}
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
#pragma once
|
||||
#include <xtl/xspan.hpp>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
class span_reader
|
||||
{
|
||||
public:
|
||||
span_reader(xtl::span<const uint8_t> span)
|
||||
: span_(span)
|
||||
{
|
||||
}
|
||||
|
||||
bool empty() const noexcept { return span_.empty(); }
|
||||
|
||||
template <class T>
|
||||
T read()
|
||||
{
|
||||
auto value = *reinterpret_cast<const T *>(span_.data());
|
||||
advance(sizeof(T));
|
||||
return value;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void read(T &value)
|
||||
{
|
||||
value = *reinterpret_cast<const T *>(span_.data());
|
||||
advance(sizeof(T));
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void read_span(xtl::span<const T> &span, size_t size)
|
||||
{
|
||||
span = { reinterpret_cast<const T *>(span_.data()), size };
|
||||
advance(sizeof(T) * size);
|
||||
}
|
||||
|
||||
template <class T, ptrdiff_t N>
|
||||
void read_span(xtl::span<const T, N> &span)
|
||||
{
|
||||
span = { reinterpret_cast<const T *>(span_.data()), N };
|
||||
advance(sizeof(T) * N);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
const T *peek() const noexcept
|
||||
{
|
||||
return reinterpret_cast<const T *>(span_.data());
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void get_array(const T *&value, size_t size)
|
||||
{
|
||||
value = peek<T>();
|
||||
advance(size * sizeof(T));
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void get_ref(const T *&value)
|
||||
{
|
||||
value = peek<T>();
|
||||
advance(sizeof(T));
|
||||
}
|
||||
|
||||
void skip(size_t count)
|
||||
{
|
||||
advance(count);
|
||||
}
|
||||
|
||||
private:
|
||||
void advance(size_t count)
|
||||
{
|
||||
span_ = span_.subspan(count);
|
||||
}
|
||||
|
||||
private:
|
||||
xtl::span<const uint8_t> span_;
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
#pragma once
|
||||
|
||||
#define NNCASE_CONCAT_3(a, b, c) a/b/c
|
||||
#define NNCASE_TARGET_HEADER_(target, name) <NNCASE_CONCAT_3(targets, target, name)>
|
||||
#define NNCASE_TARGET_HEADER(name) NNCASE_TARGET_HEADER_(NNCASE_TARGET, name)
|
||||
|
||||
#include NNCASE_TARGET_HEADER(interpreter.h)
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
using interpreter_t = nncase::targets::NNCASE_TARGET::interpreter;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
#pragma once
|
||||
#include <cassert>
|
||||
#include <datatypes.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace runtime
|
||||
{
|
||||
/// Size in bytes of one element of `type`.
///
/// @return 4 for dt_float32, 1 for dt_uint8. Any other type trips the
///         assertion; when assertions are compiled out it returns 0 instead
///         of an uninitialized value.
inline size_t get_bytes(datatype_t type)
{
    switch (type)
    {
    case dt_float32:
        return 4;
    case dt_uint8:
        return 1;
    default:
        assert(!"Not supported data type");
        return 0;
    }
}
|
||||
|
||||
/// Counts the zero bits above the highest set bit of `value`, looking only at
/// the low `Bits` bits. Returns `Bits` when none of those bits is set.
template <int32_t Bits, class T>
uint8_t count_leading_zeros(T value)
{
    // Scan downwards from the top bit until a set bit is found.
    int32_t bit = Bits - 1;
    while (bit >= 0 && (value & (1ULL << bit)) == 0)
        --bit;

    return (uint8_t)(Bits - 1 - bit);
}
|
||||
|
||||
/// Right-shifts `value` by `shift` bits with a carry from the last bit
/// shifted out.
///
/// When that bit is set, the shifted result is adjusted by +1 for
/// non-negative values and by -1 for negative ones. A shift of 0 returns the
/// value unchanged.
template <class T>
T carry_shift(T value, uint8_t shift)
{
    if (shift == 0)
        return value;

    // Leave the last bit to be dropped in position 0 so it can be inspected.
    value >>= shift - 1;
    const bool carry = (value & 0x1) != 0;
    const bool negative = value < 0;

    value >>= 1;
    if (carry)
        value += negative ? -1 : 1;

    return value;
}

/// Multiplies two 32-bit values in 64-bit precision, applies carry_shift()
/// to the product and truncates the result to 32 bits.
inline int32_t mul_and_carry_shift(int32_t value, int32_t mul, uint8_t shift)
{
    const int64_t product = (int64_t)value * mul;
    return (int32_t)carry_shift(product, shift);
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,193 @@
|
|||
#pragma once
|
||||
#include "../node_body.h"
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace cpu
|
||||
{
|
||||
struct cpu_conv2d_options
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
int32_t out_channels;
|
||||
padding padding_h;
|
||||
padding padding_w;
|
||||
int32_t filter_h;
|
||||
int32_t filter_w;
|
||||
int32_t stride_h;
|
||||
int32_t stride_w;
|
||||
int32_t dilation_h;
|
||||
int32_t dilation_w;
|
||||
value_range<float> fused_activation;
|
||||
xtl::span<const float> weights;
|
||||
xtl::span<const float> bias;
|
||||
|
||||
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(input);
|
||||
reader.read(output);
|
||||
reader.read(in_shape);
|
||||
reader.read(out_channels);
|
||||
reader.read(padding_h);
|
||||
reader.read(padding_w);
|
||||
reader.read(filter_h);
|
||||
reader.read(filter_w);
|
||||
reader.read(stride_h);
|
||||
reader.read(stride_w);
|
||||
reader.read(dilation_h);
|
||||
reader.read(dilation_w);
|
||||
reader.read(fused_activation);
|
||||
reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w);
|
||||
reader.read_span(bias, out_channels);
|
||||
}
|
||||
};
|
||||
|
||||
struct cpu_depthwise_conv2d_options
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
padding padding_h;
|
||||
padding padding_w;
|
||||
int32_t filter_h;
|
||||
int32_t filter_w;
|
||||
int32_t stride_h;
|
||||
int32_t stride_w;
|
||||
int32_t dilation_h;
|
||||
int32_t dilation_w;
|
||||
value_range<float> fused_activation;
|
||||
xtl::span<const float> weights;
|
||||
xtl::span<const float> bias;
|
||||
|
||||
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(input);
|
||||
reader.read(output);
|
||||
reader.read(in_shape);
|
||||
reader.read(padding_h);
|
||||
reader.read(padding_w);
|
||||
reader.read(filter_h);
|
||||
reader.read(filter_w);
|
||||
reader.read(stride_h);
|
||||
reader.read(stride_w);
|
||||
reader.read(dilation_h);
|
||||
reader.read(dilation_w);
|
||||
reader.read(fused_activation);
|
||||
reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w);
|
||||
reader.read_span(bias, in_shape[3]);
|
||||
}
|
||||
};
|
||||
|
||||
/// Node body of the CPU reduce_window2d op.
///
/// Derives from simple_node_body, so field order and types are presumably
/// the serialized layout — do not reorder (confirm against simple_node_body).
struct cpu_reduce_window2d_options : simple_node_body<cpu_reduce_window2d_options>
{
    memory_range input;
    memory_range output;
    reduce_op_t reduce_op;
    runtime_shape_t in_shape;
    padding padding_h;
    padding padding_w;
    int32_t filter_h;
    int32_t filter_w;
    int32_t stride_h;
    int32_t stride_w;
    int32_t dilation_h;
    int32_t dilation_w;
    float init_value;
    value_range<float> fused_activation;
};
|
||||
|
||||
struct cpu_quantized_conv2d_options
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
int32_t out_channels;
|
||||
padding padding_h;
|
||||
padding padding_w;
|
||||
int32_t filter_h;
|
||||
int32_t filter_w;
|
||||
int32_t stride_h;
|
||||
int32_t stride_w;
|
||||
int32_t dilation_h;
|
||||
int32_t dilation_w;
|
||||
int32_t input_offset;
|
||||
int32_t filter_offset;
|
||||
int32_t output_mul;
|
||||
int32_t output_shift;
|
||||
int32_t output_offset;
|
||||
xtl::span<const uint8_t> weights;
|
||||
xtl::span<const int32_t> bias;
|
||||
|
||||
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(input);
|
||||
reader.read(output);
|
||||
reader.read(in_shape);
|
||||
reader.read(out_channels);
|
||||
reader.read(padding_h);
|
||||
reader.read(padding_w);
|
||||
reader.read(filter_h);
|
||||
reader.read(filter_w);
|
||||
reader.read(stride_h);
|
||||
reader.read(stride_w);
|
||||
reader.read(dilation_h);
|
||||
reader.read(dilation_w);
|
||||
reader.read(input_offset);
|
||||
reader.read(filter_offset);
|
||||
reader.read(output_mul);
|
||||
reader.read(output_shift);
|
||||
reader.read(output_offset);
|
||||
reader.read_span(weights, (size_t)out_channels * in_shape[3] * filter_h * filter_w);
|
||||
reader.read_span(bias, out_channels);
|
||||
}
|
||||
};
|
||||
|
||||
struct cpu_quantized_depthwise_conv2d_options
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
padding padding_h;
|
||||
padding padding_w;
|
||||
int32_t filter_h;
|
||||
int32_t filter_w;
|
||||
int32_t stride_h;
|
||||
int32_t stride_w;
|
||||
int32_t dilation_h;
|
||||
int32_t dilation_w;
|
||||
int32_t input_offset;
|
||||
int32_t filter_offset;
|
||||
int32_t output_mul;
|
||||
int32_t output_shift;
|
||||
int32_t output_offset;
|
||||
xtl::span<const uint8_t> weights;
|
||||
xtl::span<const int32_t> bias;
|
||||
|
||||
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(input);
|
||||
reader.read(output);
|
||||
reader.read(in_shape);
|
||||
reader.read(padding_h);
|
||||
reader.read(padding_w);
|
||||
reader.read(filter_h);
|
||||
reader.read(filter_w);
|
||||
reader.read(stride_h);
|
||||
reader.read(stride_w);
|
||||
reader.read(dilation_h);
|
||||
reader.read(dilation_w);
|
||||
reader.read(input_offset);
|
||||
reader.read(filter_offset);
|
||||
reader.read(output_mul);
|
||||
reader.read(output_shift);
|
||||
reader.read(output_offset);
|
||||
reader.read_span(weights, (size_t)in_shape[3] * filter_h * filter_w);
|
||||
reader.read_span(bias, in_shape[3]);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
#pragma once
|
||||
#include <runtime/interpreter.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace cpu
|
||||
{
|
||||
// CPU-target interpreter: adds no members of its own and simply reuses
// runtime::interpreter_base.
class interpreter : public runtime::interpreter_base
|
||||
{
|
||||
public:
|
||||
// Inherit every interpreter_base constructor unchanged.
using interpreter_base::interpreter_base;
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
#pragma once
|
||||
#include "k210_sim_types.h"
|
||||
#include <runtime/interpreter.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace k210
|
||||
{
|
||||
// Pairs an interpreter with a step function. The k210 interpreter stores one
// of these on hardware builds.
// NOTE(review): presumably handed to asynchronous completion handlers so they
// can resume the interpreter - confirm against the kernel sources.
struct k210_interpreter_context
|
||||
{
|
||||
// Interpreter to continue.
runtime::interpreter_base *interpreter;
|
||||
// Step function to invoke on that interpreter.
runtime::interpreter_step_t step;
|
||||
};
|
||||
|
||||
// K210-target interpreter. It overrides memory_at; on hardware builds
// (NNCASE_TARGET_K210_SIMULATOR == 0) it additionally carries a DMA channel and
// a k210_interpreter_context, on simulator builds it owns a byte buffer instead.
class interpreter : public runtime::interpreter_base
|
||||
{
|
||||
public:
|
||||
// Keep the base-class memory_at overloads visible; the override declared
// further down would otherwise hide them.
using interpreter_base::memory_at;
|
||||
|
||||
// Defined out of line.
interpreter();
|
||||
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
|
||||
// Returns the DMA channel stored by the setter below.
dmac_channel_number_t dma_ch() const noexcept { return dma_ch_; }
|
||||
// Stores the DMA channel.
void dma_ch(dmac_channel_number_t dma_ch) noexcept { dma_ch_ = dma_ch; }
|
||||
// Mutable access to the per-interpreter context.
k210_interpreter_context &context() noexcept { return context_; }
|
||||
#endif
|
||||
|
||||
protected:
|
||||
// Target-specific resolution of a memory_range to a byte span (defined out of line).
xtl::span<uint8_t> memory_at(const memory_range &range) const noexcept override;
|
||||
|
||||
private:
|
||||
#if NNCASE_TARGET_K210_SIMULATOR
|
||||
// NOTE(review): presumably backs the simulated KPU memory - confirm in the
// constructor / memory_at definitions.
std::unique_ptr<uint8_t[]> kpu_mem_;
|
||||
#else
|
||||
// Not initialised in this header; written through dma_ch(dmac_channel_number_t).
dmac_channel_number_t dma_ch_;
|
||||
k210_interpreter_context context_;
|
||||
#endif
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
#pragma once
|
||||
#include "../node_body.h"
|
||||
#include "k210_runtime_op_utility.h"
|
||||
#include "k210_sim_types.h"
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace k210
|
||||
{
|
||||
// Node body of the kpu_upload op. Deriving from simple_node_body means the
// whole struct is passed to reader.read / writer.write as one object, so the
// field order and types are part of the model format.
struct kpu_upload_options : simple_node_body<kpu_upload_options>
|
||||
{
|
||||
// Source memory range.
memory_range input;
|
||||
// Destination memory range.
memory_range output;
|
||||
// Shape of the input data.
runtime_shape_t in_shape;
|
||||
};
|
||||
|
||||
struct kpu_conv2d_options
|
||||
{
|
||||
memory_range main_mem_output;
|
||||
int32_t batches;
|
||||
int32_t reserved0;
|
||||
kpu_layer_argument_t layer;
|
||||
xtl::span<const kpu_batchnorm_argument_t> batch_norm;
|
||||
const kpu_activate_table_t *activation;
|
||||
xtl::span<const uint8_t> weights;
|
||||
|
||||
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(main_mem_output);
|
||||
reader.read(batches);
|
||||
reader.read(reserved0);
|
||||
reader.read(layer);
|
||||
|
||||
auto ic = layer.image_channel_num.data.i_ch_num + 1;
|
||||
auto oc = layer.image_channel_num.data.o_ch_num + 1;
|
||||
auto filter = get_kpu_filter_size((kpu_filter_type_t)layer.kernel_pool_type_cfg.data.kernel_type);
|
||||
auto weights_size = layer.interrupt_enabe.data.depth_wise_layer
|
||||
? oc * filter * filter
|
||||
: ic * oc * filter * filter;
|
||||
|
||||
reader.skip(layer.kernel_pool_type_cfg.data.bwsx_base_addr);
|
||||
reader.read_span(batch_norm, oc);
|
||||
reader.skip(layer.kernel_calc_type_cfg.data.active_addr);
|
||||
reader.get_ref(activation);
|
||||
reader.skip(layer.kernel_load_cfg.data.para_start_addr);
|
||||
reader.read_span(weights, weights_size);
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)batch_norm.data();
|
||||
layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)activation;
|
||||
layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)weights.data();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
#pragma once
|
||||
#include "k210_sim_types.h"
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace k210
|
||||
{
|
||||
// Row layout chosen by get_kpu_row_layout for a given image width.
struct kpu_layout
|
||||
{
|
||||
// 4, 2 or 1 depending on the width bucket; get_kpu_rows caps the channels
// it packs per line at this value.
int32_t groups;
|
||||
// 1 for widths up to 32, otherwise ceil(width / 64).
int32_t row_len;
|
||||
// 16, 32 or 64 depending on the width bucket.
int32_t row_pitch;
|
||||
};
|
||||
|
||||
// Picks the row layout for an image of the given width.
//   width <= 16 : 4 groups, row_len 1, pitch 16
//   width <= 32 : 2 groups, row_len 1, pitch 32
//   otherwise   : 1 group,  row_len ceil(width / 64), pitch 64
inline kpu_layout get_kpu_row_layout(int32_t width)
{
    // Aggregate order is { groups, row_len, row_pitch }.
    if (width <= 16)
        return { 4, 1, 16 };

    if (width <= 32)
        return { 2, 1, 32 };

    return { 1, (width + 63) / 64, 64 };
}
|
||||
|
||||
// Returns the side length of a convolution kernel: 1 for kpu_filter_1x1,
// 3 for kpu_filter_3x3 and 0 for any other value.
inline int32_t get_kpu_filter_size(kpu_filter_type_t filter)
{
    if (filter == kpu_filter_1x1)
        return 1;

    if (filter == kpu_filter_3x3)
        return 3;

    return 0;
}
|
||||
|
||||
inline int get_kpu_rows(int32_t width, int32_t height, int32_t channels)
|
||||
{
|
||||
auto layout = get_kpu_row_layout(width);
|
||||
auto one_line_channels = std::min(channels, layout.groups);
|
||||
auto blocks = (channels + one_line_channels - 1) / one_line_channels;
|
||||
auto size = layout.row_len * height * blocks;
|
||||
return size;
|
||||
}
|
||||
|
||||
inline int get_kpu_bytes(int32_t width, int32_t height, int32_t channels)
|
||||
{
|
||||
return get_kpu_rows(width, height, channels) * 64;
|
||||
}
|
||||
|
||||
#if NNCASE_TARGET_K210_SIMULATOR
|
||||
|
||||
// Returns the window side length of a pooling mode: 1 for bypass, 2 for the
// "_2_" modes and 4 for the "_4_" modes.
//
// A value outside the enumerators returns 0, matching the convolution
// overload's fallback. The original had no return on that path, which is
// undefined behaviour.
inline int32_t get_kpu_filter_size(kpu_pool_type_t filter)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return 1;
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
    case kpu_pool_max_2_s1:
    case kpu_pool_mean_2_s1:
        return 2;
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return 4;
    }

    // No `default:` label, so the compiler can still warn when a new
    // enumerator is added without a case.
    return 0;
}
|
||||
|
||||
// Returns the stride of a pooling mode: 1 for bypass and the "_s1" modes,
// 2 for the "_s2" modes and 4 for the "_s4" modes.
//
// A value outside the enumerators returns 1. The original had no return on
// that path (undefined behaviour); 1 is chosen rather than 0 because
// get_kpu_pool_output_size divides by this result.
inline int32_t get_kpu_filter_stride(kpu_pool_type_t filter)
{
    switch (filter)
    {
    case kpu_pool_bypass:
        return 1;
    case kpu_pool_max_2_s2:
    case kpu_pool_mean_2_s2:
    case kpu_pool_left_top_2_s2:
    case kpu_pool_right_top_2_s2:
        return 2;
    case kpu_pool_max_2_s1:
    case kpu_pool_mean_2_s1:
        return 1;
    case kpu_pool_max_4_s4:
    case kpu_pool_mean_4_s4:
    case kpu_pool_left_top_4_s4:
        return 4;
    }

    // No `default:` label, so the compiler can still warn when a new
    // enumerator is added without a case.
    return 1;
}
|
||||
|
||||
inline int32_t get_kpu_pool_output_size(int32_t input, kpu_pool_type_t pool_type)
|
||||
{
|
||||
return input / get_kpu_filter_stride(pool_type);
|
||||
}
|
||||
|
||||
inline std::array<int32_t, 2> get_kpu_select_pool_offset(kpu_pool_type_t pool_type)
|
||||
{
|
||||
switch (pool_type)
|
||||
{
|
||||
case kpu_pool_left_top_2_s2:
|
||||
return { 0, 0 };
|
||||
case kpu_pool_right_top_2_s2:
|
||||
return { 0, 1 };
|
||||
case kpu_pool_left_top_4_s4:
|
||||
return { 0, 0 };
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,249 @@
|
|||
#pragma once
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
|
||||
#ifdef __riscv64
|
||||
#define NNCASE_TARGET_K210_SIMULATOR 0
|
||||
#include <kpu.h>
|
||||
#else
|
||||
#define NNCASE_TARGET_K210_SIMULATOR 1
|
||||
#endif
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace k210
|
||||
{
|
||||
#if NNCASE_TARGET_K210_SIMULATOR
|
||||
// Simulator-side definition of the KPU layer descriptor (hardware builds take
// it from <kpu.h> instead). It consists of twelve 64-bit registers; each one
// can be accessed raw through `reg` or field by field through `data`.
struct kpu_layer_argument_t
{
    // Layer flags, including the depthwise switch.
    union {
        uint64_t reg;
        struct
        {
            uint64_t int_en : 1;
            uint64_t ram_flag : 1;
            uint64_t full_add : 1;
            uint64_t depth_wise_layer : 1;
            uint64_t reserved : 60;
        } data;
    } interrupt_enabe;

    // Source and destination image addresses.
    union {
        uint64_t reg;
        struct
        {
            uint64_t image_src_addr : 15;
            uint64_t reserved0 : 17;
            uint64_t image_dst_addr : 15;
            uint64_t reserved1 : 17;
        } data;
    } image_addr;

    // Channel counts.
    union {
        uint64_t reg;
        struct
        {
            uint64_t i_ch_num : 10;
            uint64_t reserved0 : 22;
            uint64_t o_ch_num : 10;
            uint64_t reserved1 : 6;
            uint64_t o_ch_num_coef : 10;
            uint64_t reserved2 : 6;
        } data;
    } image_channel_num;

    // Input and output image extents.
    union {
        uint64_t reg;
        struct
        {
            uint64_t i_row_wid : 10;
            uint64_t i_col_high : 9;
            uint64_t reserved0 : 13;
            uint64_t o_row_wid : 10;
            uint64_t o_col_high : 9;
            uint64_t reserved1 : 13;
        } data;
    } image_size;

    // Kernel type, pooling type, padding and batch-norm table address.
    union {
        uint64_t reg;
        struct
        {
            uint64_t kernel_type : 3;
            uint64_t pad_type : 1;
            uint64_t pool_type : 4;
            uint64_t first_stride : 1;
            uint64_t bypass_conv : 1;
            uint64_t load_para : 1;
            uint64_t reserved0 : 5;
            uint64_t dma_burst_size : 8;
            uint64_t pad_value : 8;
            uint64_t bwsx_base_addr : 32;
        } data;
    } kernel_pool_type_cfg;

    // Weight loading parameters and weight address.
    union {
        uint64_t reg;
        struct
        {
            uint64_t load_coor : 1;
            uint64_t load_time : 6;
            uint64_t reserved0 : 8;
            uint64_t para_size : 17;
            uint64_t para_start_addr : 32;
        } data;
    } kernel_load_cfg;

    // Coefficient offsets.
    union {
        uint64_t reg;
        struct
        {
            uint64_t coef_column_offset : 4;
            uint64_t coef_row_offset : 12;
            uint64_t reserved0 : 48;
        } data;
    } kernel_offset;

    // Calculation layout and activation table address.
    union {
        uint64_t reg;
        struct
        {
            uint64_t channel_switch_addr : 15;
            uint64_t reserved : 1;
            uint64_t row_switch_addr : 4;
            uint64_t coef_size : 8;
            uint64_t coef_group : 3;
            uint64_t load_act : 1;
            uint64_t active_addr : 32;
        } data;
    } kernel_calc_type_cfg;

    // Write-back layout.
    union {
        uint64_t reg;
        struct
        {
            uint64_t wb_channel_switch_addr : 15;
            uint64_t reserved0 : 1;
            uint64_t wb_row_switch_addr : 4;
            uint64_t wb_group : 3;
            uint64_t reserved1 : 41;
        } data;
    } write_back_cfg;

    // Convolution shift and multiplier arguments.
    union {
        uint64_t reg;
        struct
        {
            uint64_t shr_w : 4;
            uint64_t shr_x : 4;
            uint64_t arg_w : 24;
            uint64_t arg_x : 24;
            uint64_t reserved0 : 8;
        } data;
    } conv_value;

    // Convolution additive argument.
    union {
        uint64_t reg;
        struct
        {
            uint64_t arg_add : 40;
            uint64_t reserved : 24;
        } data;
    } conv_value2;

    // DMA output parameters.
    union {
        uint64_t reg;
        struct
        {
            uint64_t send_data_out : 1;
            uint64_t reserved : 15;
            uint64_t channel_byte_num : 16;
            uint64_t dma_total_byte : 32;
        } data;
    } dma_parameter;
};
|
||||
|
||||
// Simulator-side definition of the KPU activation table (hardware builds take
// it from <kpu.h> instead): sixteen segment registers followed by two
// registers of eight result-bias bytes each.
struct kpu_activate_table_t
{
    // One entry per activation segment.
    union {
        uint64_t reg;
        struct
        {
            uint64_t shift_number : 8;
            uint64_t y_mul : 16;
            uint64_t x_start : 36;
        } data;
    } activate_para[16];

    // First group of eight result-bias bytes.
    union {
        uint64_t reg;
        struct
        {
            uint8_t result_bias[8];
        } data;
    } activate_para_bias0;

    // Second group of eight result-bias bytes.
    union {
        uint64_t reg;
        struct
        {
            uint8_t result_bias[8];
        } data;
    } activate_para_bias1;
};
|
||||
#endif
|
||||
|
||||
// One batch-norm entry: a single 64-bit register holding a 24-bit multiplier,
// a 32-bit addend and a 4-bit shift.
struct kpu_batchnorm_argument_t
{
    union {
        uint64_t reg;
        struct
        {
            uint64_t norm_mul : 24;
            uint64_t norm_add : 32;
            uint64_t norm_shift : 4;
        } data;
    } batchnorm;
};
|
||||
|
||||
// Convolution kernel shapes understood by get_kpu_filter_size.
enum _kpu_filter_type
{
    kpu_filter_1x1 = 0,
    kpu_filter_3x3 = 1
};

using kpu_filter_type_t = _kpu_filter_type;
|
||||
|
||||
// Pooling modes. The name encodes the operation, the window size and the
// stride, e.g. max_2_s2 has window 2 and stride 2 (see get_kpu_filter_size and
// get_kpu_filter_stride).
enum _kpu_pool_type
{
    kpu_pool_bypass = 0,
    kpu_pool_max_2_s2 = 1,
    kpu_pool_mean_2_s2 = 2,
    kpu_pool_max_4_s4 = 3,
    kpu_pool_mean_4_s4 = 4,
    kpu_pool_left_top_2_s2 = 5,
    kpu_pool_right_top_2_s2 = 6,
    kpu_pool_left_top_4_s4 = 7,
    kpu_pool_mean_2_s1 = 8,
    kpu_pool_max_2_s1 = 9
};

using kpu_pool_type_t = _kpu_pool_type;
|
||||
|
||||
// Batch-norm parameters as plain 32-bit integers (multiplier, shift, addend).
// NOTE(review): presumably the unpacked counterpart of
// kpu_batchnorm_argument_t - confirm at the use sites.
struct kpu_batchnorm_segment
|
||||
{
|
||||
int32_t mul;
|
||||
int32_t shift;
|
||||
int32_t add;
|
||||
};
|
||||
|
||||
// One segment of an activation table in plain integer form: a 64-bit start
// position plus multiplier, shift and addend.
struct kpu_activation_segment
|
||||
{
|
||||
int64_t start_x;
|
||||
int32_t mul;
|
||||
int32_t shift;
|
||||
int32_t add;
|
||||
};
|
||||
|
||||
// A complete activation table: exactly 16 segments.
using kpu_activation_table_t = std::array<kpu_activation_segment, 16>;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,258 @@
|
|||
#pragma once
|
||||
#include "../node_body.h"
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace neutral
|
||||
{
|
||||
// Node body of the neutral `binary` op. Deriving from simple_node_body means
// the whole struct is passed to reader.read / writer.write as one object, so
// the field order and types are part of the model format.
struct binary_options : public simple_node_body<binary_options>
|
||||
{
|
||||
// First operand.
memory_range input_a;
|
||||
// Second operand.
memory_range input_b;
|
||||
memory_range output;
|
||||
// Which element-wise operation to apply.
binary_op_t binary_op;
|
||||
runtime_shape_t in_a_shape;
|
||||
runtime_shape_t in_b_shape;
|
||||
runtime_shape_t out_shape;
|
||||
// NOTE(review): presumably the clamp range applied to the result - confirm in the kernel.
value_range<float> fused_activation;
|
||||
};
|
||||
|
||||
// Node body of the neutral `concat` op. It holds two variable-length spans,
// so it provides its own (de)serialisation instead of using simple_node_body.
struct concat_options
|
||||
{
|
||||
memory_range output;
|
||||
uint32_t inner_size;
|
||||
uint32_t outer_size;
|
||||
// Number of entries in both `inputs` and `dims`.
uint32_t inputs_count;
|
||||
xtl::span<const memory_range> inputs;
|
||||
xtl::span<const int32_t> dims;
|
||||
|
||||
// Reads the fixed fields, then `inputs_count` memory ranges followed by
// `inputs_count` dims. The order must match serialize().
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(output);
|
||||
reader.read(inner_size);
|
||||
reader.read(outer_size);
|
||||
reader.read(inputs_count);
|
||||
reader.read_span(inputs, inputs_count);
|
||||
reader.read_span(dims, inputs_count);
|
||||
}
|
||||
|
||||
// Writes the fields in the order deserialize() reads them. `inputs_count`
// is written as stored; it is not recomputed from the spans here.
void serialize(runtime::binary_writer &writer) const
|
||||
{
|
||||
writer.write(output);
|
||||
writer.write(inner_size);
|
||||
writer.write(outer_size);
|
||||
writer.write(inputs_count);
|
||||
writer.write_array(inputs);
|
||||
writer.write_array(dims);
|
||||
}
|
||||
};
|
||||
|
||||
struct conv2d_options
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
int32_t groups;
|
||||
int32_t out_channels;
|
||||
padding padding_h;
|
||||
padding padding_w;
|
||||
int32_t filter_h;
|
||||
int32_t filter_w;
|
||||
int32_t stride_h;
|
||||
int32_t stride_w;
|
||||
int32_t dilation_h;
|
||||
int32_t dilation_w;
|
||||
value_range<float> fused_activation;
|
||||
xtl::span<const float> weights;
|
||||
xtl::span<const float> bias;
|
||||
|
||||
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(input);
|
||||
reader.read(output);
|
||||
reader.read(in_shape);
|
||||
reader.read(groups);
|
||||
reader.read(out_channels);
|
||||
reader.read(padding_h);
|
||||
reader.read(padding_w);
|
||||
reader.read(filter_h);
|
||||
reader.read(filter_w);
|
||||
reader.read(stride_h);
|
||||
reader.read(stride_w);
|
||||
reader.read(dilation_h);
|
||||
reader.read(dilation_w);
|
||||
reader.read(fused_activation);
|
||||
reader.read_span(weights, (size_t)out_channels * in_shape[1] / groups * filter_h * filter_w);
|
||||
reader.read_span(bias, out_channels);
|
||||
}
|
||||
|
||||
void serialize(runtime::binary_writer &writer) const
|
||||
{
|
||||
writer.write(input);
|
||||
writer.write(output);
|
||||
writer.write(in_shape);
|
||||
writer.write(groups);
|
||||
writer.write(out_channels);
|
||||
writer.write(padding_h);
|
||||
writer.write(padding_w);
|
||||
writer.write(filter_h);
|
||||
writer.write(filter_w);
|
||||
writer.write(stride_h);
|
||||
writer.write(stride_w);
|
||||
writer.write(dilation_h);
|
||||
writer.write(dilation_w);
|
||||
writer.write(fused_activation);
|
||||
writer.write_array(weights);
|
||||
writer.write_array(bias);
|
||||
}
|
||||
};
|
||||
|
||||
// Node body of the neutral `dequantize` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct dequantize_options : public simple_node_body<dequantize_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
// Quantisation parameters used for the conversion.
quant_param_t quant_param;
|
||||
};
|
||||
|
||||
// Node body of the neutral `matmul` op. The bias is a variable-length span,
// so the body is read and written field by field.
struct matmul_options
|
||||
{
|
||||
memory_range input_a;
|
||||
memory_range input_b;
|
||||
memory_range output;
|
||||
int32_t a_rows;
|
||||
int32_t a_cols;
|
||||
int32_t b_cols;
|
||||
value_range<float> fused_activation;
|
||||
// Holds b_cols values (see deserialize).
xtl::span<const float> bias;
|
||||
|
||||
// Reads the fixed fields, then b_cols bias values. The order must match serialize().
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(input_a);
|
||||
reader.read(input_b);
|
||||
reader.read(output);
|
||||
reader.read(a_rows);
|
||||
reader.read(a_cols);
|
||||
reader.read(b_cols);
|
||||
reader.read(fused_activation);
|
||||
reader.read_span(bias, b_cols);
|
||||
}
|
||||
|
||||
// Writes the fields in the order deserialize() reads them.
void serialize(runtime::binary_writer &writer) const
|
||||
{
|
||||
writer.write(input_a);
|
||||
writer.write(input_b);
|
||||
writer.write(output);
|
||||
writer.write(a_rows);
|
||||
writer.write(a_cols);
|
||||
writer.write(b_cols);
|
||||
writer.write(fused_activation);
|
||||
writer.write_array(bias);
|
||||
}
|
||||
};
|
||||
|
||||
// Node body of the neutral `memory_copy` op: a source and a destination range.
// Passed as a whole to reader.read / writer.write through simple_node_body.
struct memory_copy_options : public simple_node_body<memory_copy_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
};
|
||||
|
||||
// Node body of the neutral `pad` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct pad_options : public simple_node_body<pad_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
// Padding amounts.
runtime_paddings_t paddings;
|
||||
// Value written into the padded area.
scalar pad_value;
|
||||
};
|
||||
|
||||
// Node body of the neutral `quantize` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct quantize_options : public simple_node_body<quantize_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
// Quantisation parameters used for the conversion.
quant_param_t quant_param;
|
||||
};
|
||||
|
||||
// Node body of the neutral `reduce` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct reduce_options : public simple_node_body<reduce_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
// Which reduction to perform.
reduce_op_t reduce_op;
|
||||
runtime_shape_t in_shape;
|
||||
runtime_shape_t out_shape;
|
||||
// Starting value of the reduction.
float init_value;
|
||||
};
|
||||
|
||||
// Node body of the neutral `reduce_window2d` op (windowed reduction described
// by a filter size, stride, dilation and padding per spatial axis). Passed as
// a whole to reader.read / writer.write through simple_node_body.
struct reduce_window2d_options : simple_node_body<reduce_window2d_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
// Which reduction to perform inside each window.
reduce_op_t reduce_op;
|
||||
runtime_shape_t in_shape;
|
||||
padding padding_h;
|
||||
padding padding_w;
|
||||
int32_t filter_h;
|
||||
int32_t filter_w;
|
||||
int32_t stride_h;
|
||||
int32_t stride_w;
|
||||
int32_t dilation_h;
|
||||
int32_t dilation_w;
|
||||
// Starting value of the reduction.
float init_value;
|
||||
value_range<float> fused_activation;
|
||||
};
|
||||
|
||||
// Node body of the neutral `resize_bilinear` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct resize_bilinear_options : public simple_node_body<resize_bilinear_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
// Target height.
int32_t out_h;
|
||||
// Target width.
int32_t out_w;
|
||||
bool align_corners;
|
||||
};
|
||||
|
||||
// Node body of the neutral `resize_nearest_neighbor` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct resize_nearest_neighbor_options : public simple_node_body<resize_nearest_neighbor_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
// Target height.
int32_t out_h;
|
||||
// Target width.
int32_t out_w;
|
||||
bool align_corners;
|
||||
};
|
||||
|
||||
// Node body of the neutral `softmax` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct softmax_options : public simple_node_body<softmax_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
int32_t inner_size;
|
||||
int32_t outer_size;
|
||||
// NOTE(review): presumably the scale applied to the logits before
// exponentiation - confirm in the kernel.
float beta;
|
||||
};
|
||||
|
||||
// Node body of the neutral `transpose` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct transpose_options : public simple_node_body<transpose_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
// Axis permutation, stored with the same type as a shape.
runtime_shape_t perm;
|
||||
};
|
||||
|
||||
// Node body of the neutral `strided_slice` op; passed as a whole to
// reader.read / writer.write through simple_node_body.
struct strided_slice_options : public simple_node_body<strided_slice_options>
|
||||
{
|
||||
memory_range input;
|
||||
memory_range output;
|
||||
runtime_shape_t in_shape;
|
||||
// Per-axis begin / end / stride, stored with the same type as a shape.
runtime_shape_t begin;
|
||||
runtime_shape_t end;
|
||||
runtime_shape_t strides;
|
||||
// Bit masks modifying how begin / end / axes are interpreted.
// NOTE(review): presumably TensorFlow StridedSlice semantics - confirm in the kernel.
int32_t begin_mask;
|
||||
int32_t end_mask;
|
||||
int32_t ellipsis_mask;
|
||||
int32_t new_axis_mask;
|
||||
int32_t shrink_axis_mask;
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
#pragma once
|
||||
#include "../runtime/binary_writer.h"
|
||||
#include "../runtime/span_reader.h"
|
||||
#include <datatypes.h>
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
// CRTP base for node bodies that need no custom (de)serialisation: the derived
// struct T is handed to the reader / writer as one object.
template <class T>
|
||||
struct simple_node_body
|
||||
{
|
||||
// Fills the derived object by passing it, as T, to reader.read.
void deserialize(runtime::span_reader &reader)
|
||||
{
|
||||
reader.read(static_cast<T &>(*this));
|
||||
}
|
||||
|
||||
// Emits the derived object by passing it, as T, to writer.write.
void serialize(runtime::binary_writer &writer) const
|
||||
{
|
||||
writer.write(static_cast<const T &>(*this));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,116 @@
|
|||
/* Copyright 2018 Canaan Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <nncase.h>
|
||||
#include <runtime/target_config.h>
|
||||
#include <stdio.h>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
|
||||
class nncase_context
|
||||
{
|
||||
public:
|
||||
int load_kmodel(const uint8_t *buffer)
|
||||
{
|
||||
return interpreter_.try_load_model(buffer) ? 0 : -1;
|
||||
}
|
||||
|
||||
int get_output(uint32_t index, uint8_t **data, size_t *size)
|
||||
{
|
||||
if (index >= interpreter_.outputs_size())
|
||||
return -1;
|
||||
|
||||
auto mem = interpreter_.memory_at<uint8_t>(interpreter_.output_at(index));
|
||||
*data = mem.data();
|
||||
*size = mem.size();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int run_kmodel(const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
|
||||
{
|
||||
done_callback_ = done_callback;
|
||||
userdata_ = userdata;
|
||||
interpreter_.dma_ch(dma_ch);
|
||||
|
||||
auto input = interpreter_.input_at(0);
|
||||
auto mem = interpreter_.memory_at<uint8_t>(input);
|
||||
std::copy(src, src + mem.size(), mem.begin());
|
||||
interpreter_.run(done_thunk, on_error_thunk, node_profile_thunk, this);
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
void on_done()
|
||||
{
|
||||
printf("Total: %fms\n", interpreter_.total_duration().count() / 1e6);
|
||||
|
||||
if (done_callback_)
|
||||
done_callback_(userdata_);
|
||||
}
|
||||
|
||||
static void done_thunk(void *userdata)
|
||||
{
|
||||
reinterpret_cast<nncase_context *>(userdata)->on_done();
|
||||
}
|
||||
|
||||
static void on_error_thunk(const char *err, void *userdata)
|
||||
{
|
||||
printf("Fatal: %s\n", err);
|
||||
}
|
||||
|
||||
static void node_profile_thunk(runtime_opcode op, std::chrono::nanoseconds duration, void *userdata)
|
||||
{
|
||||
printf("%s: %fms\n", node_opcode_names(op).data(), duration.count() / 1e6);
|
||||
}
|
||||
|
||||
private:
|
||||
interpreter_t interpreter_;
|
||||
kpu_done_callback_t done_callback_;
|
||||
void *userdata_;
|
||||
};
|
||||
|
||||
// Creates an nncase_context for `ctx` and loads the kmodel in `buffer` into it.
//
// Returns 0 on success and -1 when `ctx` or `buffer` is null, when the context
// cannot be allocated, or when the model is rejected.
//
// Whenever `ctx` and `buffer` are valid, ctx->is_nncase is set and
// ctx->nncase_ctx holds the new context (null if allocation failed), so
// nncase_model_free is safe to call afterwards, even after a failed load.
int nncase_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
    // Validate before allocating: the original allocated first and leaked the
    // context when `ctx` was null.
    if (!ctx || !buffer)
        return -1;

    auto nnctx = new (std::nothrow) nncase_context();
    ctx->is_nncase = 1;
    ctx->nncase_ctx = nnctx;

    // new (std::nothrow) reports failure by returning null.
    if (!nnctx)
        return -1;

    return nnctx->load_kmodel(buffer);
}
|
||||
|
||||
// Retrieves the address and size of output `index` of the model held by `ctx`.
//
// Returns 0 on success. Returns -1 when any pointer argument is null, when
// `ctx` holds no nncase context (never loaded, or already freed), or when
// `index` is out of range.
int nncase_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
    if (!ctx || !ctx->nncase_ctx || !data || !size)
        return -1;

    auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
    return nnctx->get_output(index, data, size);
}
|
||||
|
||||
// Destroys the nncase context held by `ctx` and clears the pointer so a
// repeated call is harmless. A null `ctx` is ignored.
void nncase_model_free(kpu_model_context_t *ctx)
{
    if (!ctx)
        return;

    // Deleting a null pointer is a no-op, so an unloaded ctx is fine here.
    auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
    delete nnctx;
    ctx->nncase_ctx = nullptr;
}
|
||||
|
||||
// Copies `src` into the model's first input and starts inference on the
// context held by `ctx`; `done_callback` (may be null) is invoked with
// `userdata` on completion.
//
// Returns -1 when `ctx` or `src` is null or `ctx` holds no nncase context,
// otherwise the result of nncase_context::run_kmodel.
int nncase_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    if (!ctx || !ctx->nncase_ctx || !src)
        return -1;

    auto nnctx = reinterpret_cast<nncase_context *>(ctx->nncase_ctx);
    return nnctx->run_kmodel(src, dma_ch, done_callback, userdata);
}
|
|
@ -0,0 +1,131 @@
|
|||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <runtime/interpreter.h>
|
||||
#include <runtime/kernel_registry.h>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
|
||||
bool interpreter_base::try_load_model(const uint8_t *buffer)
|
||||
{
|
||||
auto offset = buffer;
|
||||
model_header_ = reinterpret_cast<const model_header *>(buffer);
|
||||
|
||||
// Validate model
|
||||
if (model_header_->identifier != MODEL_IDENTIFIER || model_header_->version != MODEL_VERSION || (model_header_->target != MODEL_TARGET_CPU && model_header_->target != MODEL_TARGET_K210))
|
||||
return false;
|
||||
|
||||
// Allocate buffers
|
||||
main_mem_.reset(new (std::nothrow) uint8_t[model_header_->main_mem]);
|
||||
if (!main_mem_)
|
||||
return false;
|
||||
|
||||
offset += sizeof(model_header);
|
||||
inputs_ = { reinterpret_cast<const memory_range *>(offset), inputs_size() };
|
||||
offset += sizeof(memory_range) * inputs_size();
|
||||
input_shapes_ = { reinterpret_cast<const runtime_shape_t *>(offset), inputs_size() };
|
||||
offset += sizeof(runtime_shape_t) * inputs_size();
|
||||
outputs_ = { reinterpret_cast<const memory_range *>(offset), outputs_size() };
|
||||
offset += sizeof(memory_range) * outputs_size();
|
||||
constants_ = { offset, model_header_->constants };
|
||||
offset += constants_.size();
|
||||
node_headers_ = { reinterpret_cast<const node_header *>(offset), nodes_size() };
|
||||
offset += sizeof(node_header) * nodes_size();
|
||||
node_body_start_ = offset;
|
||||
|
||||
return initialize();
|
||||
}
|
||||
|
||||
// Called at the end of try_load_model; its result becomes the result of the
// load. The base implementation has nothing to set up and always succeeds.
bool interpreter_base::initialize()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Starts executing the loaded model from its first node.
//
// `callback` is invoked when all nodes have run, `on_error` when a kernel
// fails and `node_profile` after each node with its duration; all of them
// receive `userdata`.
void interpreter_base::run(run_callback_t callback, error_callback_t on_error, node_profile_callback_t node_profile, void *userdata)
{
    // Remember the caller's hooks.
    userdata_ = userdata;
    run_callback_ = callback;
    on_error_ = on_error;
    node_profile_ = node_profile;

    // Rewind to the first node and clear the timing state.
    cnt_node_body_ = node_body_start_;
    cnt_node_ = 0;
    last_time_.reset();
    total_duration_ = {};

    step();
}
|
||||
|
||||
void interpreter_base::step()
|
||||
{
|
||||
auto result = kcr_done;
|
||||
|
||||
while (result == kcr_done)
|
||||
{
|
||||
if (!last_time_)
|
||||
{
|
||||
last_time_ = clock_t::now();
|
||||
}
|
||||
else
|
||||
{
|
||||
auto now = clock_t::now();
|
||||
auto duration = now - *last_time_;
|
||||
total_duration_ += duration;
|
||||
last_time_ = now;
|
||||
|
||||
if (node_profile_)
|
||||
node_profile_(last_op_, duration, userdata_);
|
||||
}
|
||||
|
||||
if (cnt_node_ == nodes_size())
|
||||
{
|
||||
run_callback_(userdata_);
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto node_id = cnt_node_++;
|
||||
auto header = node_headers_[node_id];
|
||||
xtl::span<const uint8_t> body(cnt_node_body_, header.body_size);
|
||||
cnt_node_body_ += header.body_size;
|
||||
last_op_ = header.opcode;
|
||||
|
||||
result = call_kernel(header.opcode, body, static_cast<interpreter_t &>(*this), &interpreter_base::step);
|
||||
|
||||
if (result == kcr_error)
|
||||
{
|
||||
if (on_error_)
|
||||
{
|
||||
char buffer[256];
|
||||
auto name = node_opcode_names(header.opcode);
|
||||
if (!name.empty())
|
||||
std::sprintf(buffer, "error occurs in running kernel: %s", name.data());
|
||||
else
|
||||
std::sprintf(buffer, "Unknown opcode: (%d)", header.opcode);
|
||||
on_error_(buffer, userdata_);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Resolves `range` to a byte span inside the constants section (mem_const) or
// the main working memory (mem_main).
//
// Any other memory type trips an assert; when asserts are compiled out the
// returned span is based at address 0.
xtl::span<uint8_t> interpreter_base::memory_at(const memory_range &range) const noexcept
{
    uintptr_t base = 0;

    if (range.memory_type == mem_const)
    {
        base = reinterpret_cast<uintptr_t>(constants_.data());
    }
    else if (range.memory_type == mem_main)
    {
        base = reinterpret_cast<uintptr_t>(main_mem_.get());
    }
    else
    {
        assert(!"Invalid memory type");
    }

    auto first = reinterpret_cast<uint8_t *>(base + range.start);
    return { first, range.size };
}
|
|
@ -0,0 +1,55 @@
|
|||
#include <runtime/kernel_registry.h>
|
||||
#include <runtime/span_reader.h>
|
||||
#include <targets/cpu/cpu_ops_body.h>
|
||||
#include <targets/k210/k210_ops_body.h>
|
||||
#include <targets/neutral/neutral_ops_body.h>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
|
||||
// Forward-declares one kernel entry point per runtime op by expanding
// runtime_op.def with the three macros defined below.
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
// Opens `namespace <target> {` for the ops that follow.
#define BEGINE_DEFINE_TARGET(target) \
|
||||
namespace target \
|
||||
{
|
||||
|
||||
// Declares `kernel_call_result <id>(<id>_options &, interpreter_t &, interpreter_step_t);`.
// The `target`, `name` and `value` arguments are not used by this expansion.
#define DEFINE_RUNTIME_OP(target, id, name, value) \
|
||||
kernel_call_result id(id##_options &, interpreter_t &, interpreter_step_t);
|
||||
|
||||
// Closes the target namespace.
#define END_DEFINE_TARGET() }
|
||||
|
||||
#include <runtime/runtime_op.def>
|
||||
|
||||
// The macros are redefined with different bodies inside call_kernel.
#undef BEGINE_DEFINE_TARGET
|
||||
#undef DEFINE_RUNTIME_OP
|
||||
#undef END_DEFINE_TARGET
|
||||
}
|
||||
}
|
||||
|
||||
// Dispatches one node: deserialises `body` into the options struct that
// belongs to `opcode` and calls the matching kernel with it.
// Returns the kernel's result, or kcr_error when the opcode has no entry in
// runtime_op.def.
kernel_call_result runtime::call_kernel(runtime_opcode opcode, xtl::span<const uint8_t> body, interpreter_t &interpreter, interpreter_step_t step)
|
||||
{
|
||||
span_reader reader(body);
|
||||
|
||||
switch (opcode)
|
||||
{
|
||||
// The switch cases are generated from runtime_op.def: one `case rop_<id>`
// per op. The target begin/end markers expand to nothing here.
#define BEGINE_DEFINE_TARGET(...)
|
||||
#define DEFINE_RUNTIME_OP(target, id, name, value) \
|
||||
case rop_##id: \
|
||||
{ \
|
||||
nncase::targets::target::id##_options options; \
|
||||
options.deserialize(reader); \
|
||||
return nncase::targets::target::id(options, interpreter, step); \
|
||||
}
|
||||
#define END_DEFINE_TARGET()
|
||||
|
||||
#include <runtime/runtime_op.def>
|
||||
|
||||
#undef BEGINE_DEFINE_TARGET
|
||||
#undef DEFINE_RUNTIME_OP
|
||||
#undef END_DEFINE_TARGET
|
||||
// Opcode without a generated case.
default:
|
||||
return kcr_error;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
#include <kernels/cpu/cpu_kernels.h>
|
||||
#include <runtime/kernel_registry.h>
|
||||
#include <targets/cpu/cpu_ops_body.h>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace cpu
|
||||
{
|
||||
// Runs the float CPU conv2d kernel on the ranges named in `options`.
// The kernel completes before this returns, so `step` is not used and the
// result is always kcr_done.
kernel_call_result cpu_conv2d(cpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto src = interpreter.memory_at<float>(options.input);
    auto dest = interpreter.memory_at<float>(options.output);

    kernels::cpu::conv2d(
        src.data(), dest.data(),
        options.weights.data(), options.bias.data(),
        options.in_shape, options.out_channels,
        options.filter_h, options.filter_w,
        options.stride_h, options.stride_w,
        options.dilation_h, options.dilation_w,
        options.padding_h, options.padding_w,
        options.fused_activation);

    return kcr_done;
}
|
||||
|
||||
// Runs the float CPU depthwise conv2d kernel on the ranges named in `options`.
// The kernel completes before this returns, so `step` is not used and the
// result is always kcr_done.
kernel_call_result cpu_depthwise_conv2d(cpu_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto src = interpreter.memory_at<float>(options.input);
    auto dest = interpreter.memory_at<float>(options.output);

    kernels::cpu::depthwise_conv2d(
        src.data(), dest.data(),
        options.weights.data(), options.bias.data(),
        options.in_shape,
        options.filter_h, options.filter_w,
        options.stride_h, options.stride_w,
        options.dilation_h, options.dilation_w,
        options.padding_h, options.padding_w,
        options.fused_activation);

    return kcr_done;
}
|
||||
|
||||
// Runs the CPU windowed reduction selected by options.reduce_op.
// Supports mean, min and max; any other reduce op yields kcr_error without
// touching the output. `step` is not used.
runtime::kernel_call_result cpu_reduce_window2d(cpu_reduce_window2d_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step)
{
    auto src = interpreter.memory_at<float>(options.input);
    auto dest = interpreter.memory_at<float>(options.output);

    // `combine` folds two values; `finish` receives the folded value and a
    // second argument from the kernel (used as the divisor for the mean).
    auto run = [&](auto combine, auto finish) {
        kernels::cpu::reduce_window2d(
            src.data(), dest.data(), options.init_value, options.in_shape,
            options.filter_h, options.filter_w,
            options.stride_h, options.stride_w,
            options.dilation_h, options.dilation_w,
            options.padding_h, options.padding_w,
            options.fused_activation, combine, finish);
        return kcr_done;
    };

    switch (options.reduce_op)
    {
    case reduce_mean:
        return run([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; });
    case reduce_min:
        return run([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; });
    case reduce_max:
        return run([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; });
    default:
        return kcr_error;
    }
}
|
||||
|
||||
/// CPU-target quantized (uint8) convolution op.
///
/// Resolves the input and output ranges to uint8 buffers and forwards them to
/// kernels::cpu::quantized_conv2d along with the geometry and the
/// offset/multiplier/shift values held in `options`. Synchronous; `step` is
/// unused.
///
/// @return always kcr_done.
kernel_call_result cpu_quantized_conv2d(cpu_quantized_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *src = interpreter.memory_at<uint8_t>(options.input).data();
    auto *dst = interpreter.memory_at<uint8_t>(options.output).data();

    kernels::cpu::quantized_conv2d(src, dst, options.weights.data(), options.bias.data(),
        options.in_shape, options.out_channels,
        options.filter_h, options.filter_w,
        options.stride_h, options.stride_w,
        options.dilation_h, options.dilation_w,
        options.padding_h, options.padding_w,
        options.input_offset, options.filter_offset,
        options.output_mul, options.output_shift, options.output_offset);
    return kcr_done;
}
|
||||
|
||||
/// CPU-target quantized (uint8) depthwise convolution op.
///
/// Resolves the input and output ranges to uint8 buffers and forwards them to
/// kernels::cpu::quantized_depthwise_conv2d along with the geometry and the
/// offset/multiplier/shift values held in `options`. Synchronous; `step` is
/// unused.
///
/// @return always kcr_done.
kernel_call_result cpu_quantized_depthwise_conv2d(cpu_quantized_depthwise_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *src = interpreter.memory_at<uint8_t>(options.input).data();
    auto *dst = interpreter.memory_at<uint8_t>(options.output).data();

    kernels::cpu::quantized_depthwise_conv2d(src, dst, options.weights.data(), options.bias.data(),
        options.in_shape,
        options.filter_h, options.filter_w,
        options.stride_h, options.stride_w,
        options.dilation_h, options.dilation_w,
        options.padding_h, options.padding_w,
        options.input_offset, options.filter_offset,
        options.output_mul, options.output_shift, options.output_offset);
    return kcr_done;
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
#include <targets/k210/interpreter.h>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
using namespace nncase::targets::k210;
|
||||
|
||||
// Constructs the K210 interpreter.
// Simulator builds allocate a 2 * 1024 * 1024 byte buffer (kpu_mem_) that
// memory_at() uses as the base for mem_k210_kpu ranges.
// Hardware builds instead write initial values to the KPU registers and set
// the priority of IRQN_AI_INTERRUPT to 1.
interpreter::interpreter()
|
||||
#if NNCASE_TARGET_K210_SIMULATOR
|
||||
: kpu_mem_(std::make_unique<uint8_t[]>(2 * 1024 * 1024))
|
||||
#endif
|
||||
{
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
// 7 == 0b111: sets the low three bits of both interrupt registers.
// NOTE(review): presumably this clears and masks every KPU interrupt source - confirm against kpu.h.
kpu->interrupt_clear.reg = 7;
|
||||
kpu->interrupt_mask.reg = 7;
|
||||
// Value 10 in the low bits plus bit 4 set; field meanings are not visible here - TODO confirm.
kpu->fifo_threshold.reg = 10 | (1 << 4);
|
||||
kpu->eight_bit_mode.reg = 1;
|
||||
|
||||
plic_set_priority(IRQN_AI_INTERRUPT, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Resolves a memory range to a byte span.
///
/// Ranges whose memory_type is mem_k210_kpu are placed at `range.start` bytes
/// from the KPU base: the kpu_mem_ buffer in simulator builds, AI_IO_BASE_ADDR
/// otherwise. Every other memory type is delegated to
/// interpreter_base::memory_at.
///
/// @param range  range to resolve; `start` and `size` are used as byte counts.
/// @return span of `range.size` bytes.
xtl::span<uint8_t> interpreter::memory_at(const memory_range &range) const noexcept
{
    // Anything that is not KPU memory is handled by the base interpreter.
    if (range.memory_type != mem_k210_kpu)
        return interpreter_base::memory_at(range);

#if NNCASE_TARGET_K210_SIMULATOR
    const uintptr_t kpu_base = (uintptr_t)kpu_mem_.get();
#else
    const uintptr_t kpu_base = (uintptr_t)AI_IO_BASE_ADDR;
#endif

    return { reinterpret_cast<uint8_t *>(kpu_base + range.start), range.size };
}
|
|
@ -0,0 +1,179 @@
|
|||
#include <kernels/k210/k210_kernels.h>
|
||||
#include <runtime/kernel_registry.h>
|
||||
#include <targets/k210/k210_ops_body.h>
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
#include <dmac.h>
|
||||
#include <sysctl.h>
|
||||
#endif
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
using namespace nncase::targets::k210;
|
||||
|
||||
namespace
|
||||
{
|
||||
#if !NNCASE_TARGET_K210_SIMULATOR
|
||||
// Pushes one layer description into the KPU by writing the twelve register
// words of `layer` to kpu->layer_argument_fifo, one after another.
// The order of the writes below is part of the contract with the device and
// must not be changed.
void kpu_send_layer(const kpu_layer_argument_t &layer)
|
||||
{
|
||||
kpu->layer_argument_fifo = layer.interrupt_enabe.reg;
|
||||
kpu->layer_argument_fifo = layer.image_addr.reg;
|
||||
kpu->layer_argument_fifo = layer.image_channel_num.reg;
|
||||
kpu->layer_argument_fifo = layer.image_size.reg;
|
||||
kpu->layer_argument_fifo = layer.kernel_pool_type_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer.kernel_load_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer.kernel_offset.reg;
|
||||
kpu->layer_argument_fifo = layer.kernel_calc_type_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer.write_back_cfg.reg;
|
||||
kpu->layer_argument_fifo = layer.conv_value.reg;
|
||||
kpu->layer_argument_fifo = layer.conv_value2.reg;
|
||||
kpu->layer_argument_fifo = layer.dma_parameter.reg;
|
||||
}
|
||||
|
||||
// Starts a KPU layer whose completion is reported through the AI interrupt.
// Sets int_en on the layer, registers `callback`/`userdata` for
// IRQN_AI_INTERRUPT, enables that IRQ and then sends the layer to the KPU.
// Note that `layer` is modified (int_en is set to 1).
void kpu_conv2d_normal(kpu_layer_argument_t &layer, plic_irq_callback_t callback, void *userdata)
|
||||
{
|
||||
// Write 0b111 to interrupt_clear, then 0b110 to interrupt_mask (bit 0 left clear).
// NOTE(review): presumably bit 0 is the "calculation done" source being unmasked - confirm against kpu.h.
kpu->interrupt_clear.reg = 0b111;
|
||||
kpu->interrupt_mask.reg = 0b110;
|
||||
layer.interrupt_enabe.data.int_en = 1;
|
||||
plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
|
||||
plic_irq_enable(IRQN_AI_INTERRUPT);
|
||||
// The layer is sent last, after the interrupt plumbing is in place.
kpu_send_layer(layer);
|
||||
}
|
||||
|
||||
// Starts a KPU layer whose output is streamed to `dest` by DMA.
// Sets send_data_out on the layer, routes the AI RX request to `dma_ch`,
// registers `callback`/`userdata` as the DMA channel's IRQ handler, programs a
// single-mode transfer from kpu->fifo_data_out to `dest`, then sends the layer.
// Note that `layer` is modified (send_data_out is set to 1).
void kpu_conv2d_output(kpu_layer_argument_t &layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
|
||||
{
|
||||
// Both KPU interrupt registers get 0b111 here; completion is signalled via the DMA IRQ instead.
kpu->interrupt_clear.reg = 0b111;
|
||||
kpu->interrupt_mask.reg = 0b111;
|
||||
layer.dma_parameter.data.send_data_out = 1;
|
||||
sysctl_dma_select((sysctl_dma_channel_t)dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
|
||||
dmac_set_irq(dma_ch, callback, userdata, 1);
|
||||
// Source address is fixed (FIFO register), destination increments, 64-bit transfer width.
dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
|
||||
// Transfer count is (dma_total_byte + 8) / 8.
// NOTE(review): presumably dma_total_byte holds "bytes - 1", making this a round-up to 8-byte words - confirm.
DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
|
||||
// The layer is sent last, after the DMA transfer has been armed.
kpu_send_layer(layer);
|
||||
}
|
||||
|
||||
// Callback handed to plic_irq_register / dmac_set_irq by the KPU ops.
// Writes 0b111 to the KPU interrupt_clear and interrupt_mask registers, then
// resumes the interpreter by invoking the member function stored in the
// context (ctx.step) on ctx.interpreter.
// `userdata` must point to a k210_interpreter_context. Always returns 0.
int kpu_plic_thunk(void *userdata)
|
||||
{
|
||||
kpu->interrupt_clear.reg = 0b111;
|
||||
kpu->interrupt_mask.reg = 0b111;
|
||||
|
||||
auto &ctx = *reinterpret_cast<k210_interpreter_context *>(userdata);
|
||||
// ctx.step is a pointer-to-member; this calls it on the stored interpreter.
(ctx.interpreter->*ctx.step)();
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace k210
|
||||
{
|
||||
/// Transfers a uint8 tensor into KPU memory.
///
/// Resolves the input and output ranges to byte buffers and hands them, with
/// `options.in_shape`, to kernels::k210::kpu_upload. Synchronous; `step` is
/// unused.
///
/// @return always kcr_done.
kernel_call_result kpu_upload(kpu_upload_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *src = interpreter.memory_at<uint8_t>(options.input).data();
    auto *dst = interpreter.memory_at<uint8_t>(options.output).data();

    kernels::k210::kpu_upload(src, dst, options.in_shape);
    return kcr_done;
}
|
||||
|
||||
/// Runs one KPU convolution layer described by `options.layer`.
///
/// Simulator builds (NNCASE_TARGET_K210_SIMULATOR) emulate the layer on the
/// CPU: the input is downloaded from the emulated KPU memory, convolved,
/// pooled, uploaded back to KPU memory and, when
/// `options.main_mem_output.size` is non-zero, also copied to main memory.
/// The result is kcr_done, or kcr_error when no kernel instantiation matches
/// the layer's depthwise flag / filter size.
///
/// Hardware builds store `interpreter` and `step` in the interpreter context,
/// start the layer (with DMA output when `main_mem_output.size` is non-zero)
/// and return kcr_async; kpu_plic_thunk later resumes the interpreter.
kernel_call_result kpu_conv2d(kpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
#if NNCASE_TARGET_K210_SIMULATOR
    // Only the base pointers are needed, so the ranges are created with size 1.
    // Layer addresses are multiplied by 64 to obtain byte offsets.
    auto input = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_src_addr * 64, 1 });
    auto kpu_out = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_dst_addr * 64, 1 });

    // The layer registers hold every dimension as "value - 1".
    auto in_h = static_cast<int32_t>(options.layer.image_size.data.i_col_high + 1);
    auto in_w = static_cast<int32_t>(options.layer.image_size.data.i_row_wid + 1);
    auto in_ch = static_cast<int32_t>(options.layer.image_channel_num.data.i_ch_num + 1);
    runtime_shape_t in_shape { options.batches, in_ch, in_h, in_w };
    auto in_fmap_size = kernels::details::compute_size(in_shape);

    auto out_h = static_cast<int32_t>(options.layer.image_size.data.o_col_high + 1);
    auto out_w = static_cast<int32_t>(options.layer.image_size.data.o_row_wid + 1);
    auto out_ch = static_cast<int32_t>(options.layer.image_channel_num.data.o_ch_num + 1);
    // The convolution result (before pooling) keeps the input's spatial size.
    runtime_shape_t conv_out_shape { options.batches, out_ch, in_h, in_w };
    auto conv_out_fmap_size = kernels::details::compute_size(conv_out_shape);
    runtime_shape_t out_shape { options.batches, out_ch, out_h, out_w };
    auto out_fmap_size = kernels::details::compute_size(out_shape);

    auto input_tmp = std::make_unique<uint8_t[]>(in_fmap_size);
    auto workspace = std::make_unique<int64_t[]>(conv_out_fmap_size);
    auto conv_output_tmp = std::make_unique<uint8_t[]>(conv_out_fmap_size);
    auto output_tmp = std::make_unique<uint8_t[]>(out_fmap_size);

    kernels::k210::kpu_download(input.data(), input_tmp.get(), in_shape);
    auto is_depthwise = options.layer.interrupt_enabe.data.depth_wise_layer != 0;
    auto filter_size = get_kpu_filter_size((kpu_filter_type_t)options.layer.kernel_pool_type_cfg.data.kernel_type);
    auto pad_value = (uint8_t)options.layer.kernel_pool_type_cfg.data.pad_value;
    // Register fields are raw bit patterns; to_signed<N> sign-extends them.
    auto arg_x = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_x);
    auto shift_x = (int32_t)options.layer.conv_value.data.shr_x;
    auto arg_w = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_w);
    auto shift_w = (int32_t)options.layer.conv_value.data.shr_w;
    auto arg_add = kernels::details::to_signed<40>(options.layer.conv_value2.data.arg_add);

    // Decode the per-output-channel batch-norm parameters.
    auto batchnorm = std::make_unique<kpu_batchnorm_segment[]>(out_ch);
    for (size_t i = 0; i < static_cast<size_t>(out_ch); i++)
    {
        auto &src = options.batch_norm[i].batchnorm.data;
        auto &dest = batchnorm[i];
        dest.mul = (int32_t)kernels::details::to_signed<24>(src.norm_mul);
        dest.shift = (int32_t)src.norm_shift;
        dest.add = (int32_t)kernels::details::to_signed<32>(src.norm_add);
    }

    // Decode the 16-segment activation table. The per-segment bias bytes are
    // split over two registers (bias0 first, then bias1); the split point is
    // taken from bias0's own array extent so bias0 is never indexed past its
    // end. (Assumes result_bias is a fixed-size array member.)
    const auto &bias0 = options.activation->activate_para_bias0.data.result_bias;
    const auto &bias1 = options.activation->activate_para_bias1.data.result_bias;
    const size_t bias0_count = sizeof(bias0) / sizeof(bias0[0]);

    kpu_activation_table_t activation;
    for (size_t i = 0; i < 16; i++)
    {
        auto &src = options.activation->activate_para[i].data;
        auto &dest = activation[i];
        dest.start_x = kernels::details::to_signed<36>(src.x_start);
        dest.mul = (int32_t)kernels::details::to_signed<16>(src.y_mul);
        dest.shift = (int32_t)src.shift_number;

        if (i < bias0_count)
            dest.add = bias0[i];
        else
            dest.add = bias1[i - bias0_count];
    }

// Selects the kernel instantiation matching the runtime depthwise flag and
// filter size; the invocations below form one if / else-if chain.
#define KPU_CONV2D_IMPL(is_depthwise_val, filter_size_val)                                                                                         \
    if (is_depthwise == is_depthwise_val && filter_size == filter_size_val)                                                                        \
    kernels::k210::kpu_conv2d<is_depthwise_val, filter_size_val>(input_tmp.get(), workspace.get(), conv_output_tmp.get(), options.weights.data(), \
        in_h, in_w, in_ch, out_ch, pad_value, arg_x, shift_x, arg_w, shift_w, arg_add, batchnorm.get(), activation)

    KPU_CONV2D_IMPL(true, 1);
    else KPU_CONV2D_IMPL(true, 3);
    else KPU_CONV2D_IMPL(false, 1);
    else KPU_CONV2D_IMPL(false, 3);
    else return kcr_error; // unsupported layer: report it instead of emitting an all-zero result

#undef KPU_CONV2D_IMPL

    kernels::k210::kpu_pool2d(conv_output_tmp.get(), output_tmp.get(), in_h, in_w, out_ch, (kpu_pool_type_t)options.layer.kernel_pool_type_cfg.data.pool_type);
    kernels::k210::kpu_upload(output_tmp.get(), kpu_out.data(), out_shape);
    if (options.main_mem_output.size)
    {
        auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
        std::copy(output_tmp.get(), output_tmp.get() + out_fmap_size, main_output.data());
    }

    return kcr_done;
#else
    // Stash what kpu_plic_thunk needs to resume the interpreter.
    auto &ctx = interpreter.context();
    ctx.interpreter = &interpreter;
    ctx.step = step;

    if (options.main_mem_output.size)
    {
        auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
        kpu_conv2d_output(options.layer, interpreter.dma_ch(), main_output.data(), kpu_plic_thunk, &ctx);
    }
    else
    {
        kpu_conv2d_normal(options.layer, kpu_plic_thunk, &ctx);
    }

    return kcr_async;
#endif
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,238 @@
|
|||
#include <kernels/neutral/neutral_kernels.h>
|
||||
#include <runtime/kernel_registry.h>
|
||||
#include <targets/neutral/neutral_ops_body.h>
|
||||
|
||||
using namespace nncase;
|
||||
using namespace nncase::runtime;
|
||||
|
||||
// Expands to a switch on runtime::get_bytes(type) that invokes KERNEL with an
// unsigned integer type of matching width (1, 2 or 4 bytes). Any other width
// makes the *enclosing function* return kcr_error, so this macro may only be
// used inside functions returning kernel_call_result.
#define ELEM_SIZE_IMPL(type, KERNEL)  \
    switch (runtime::get_bytes(type)) \
    {                                 \
    case 1: KERNEL(uint8_t); break;   \
    case 2: KERNEL(uint16_t); break;  \
    case 4: KERNEL(uint32_t); break;  \
    default: return kcr_error;        \
    }
|
||||
|
||||
namespace nncase
|
||||
{
|
||||
namespace targets
|
||||
{
|
||||
namespace neutral
|
||||
{
|
||||
/// Element-wise binary op on two float tensors.
///
/// `options.binary_op` selects the functor (add, sub, mul or div) passed to
/// kernels::neutral::binary together with the three shapes and the fused
/// activation from `options`.
///
/// @return kcr_done for the four supported ops, kcr_error otherwise.
kernel_call_result binary(binary_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto lhs = interpreter.memory_at<float>(options.input_a);
    auto rhs = interpreter.memory_at<float>(options.input_b);
    auto result = interpreter.memory_at<float>(options.output);

    // Binds everything except the element functor.
    auto apply = [&](auto op) {
        kernels::neutral::binary(lhs.data(), rhs.data(), result.data(),
            options.in_a_shape, options.in_b_shape, options.out_shape, options.fused_activation, op);
    };

    switch (options.binary_op)
    {
    case binary_add:
        apply([](auto a, auto b) { return a + b; });
        break;
    case binary_sub:
        apply([](auto a, auto b) { return a - b; });
        break;
    case binary_mul:
        apply([](auto a, auto b) { return a * b; });
        break;
    case binary_div:
        apply([](auto a, auto b) { return a / b; });
        break;
    default:
        return kcr_error;
    }

    return kcr_done;
}
|
||||
|
||||
/// Concatenates the ranges listed in `options.inputs` into the output buffer.
///
/// All buffers are handled as raw bytes. The kernel receives a callback that
/// resolves each input memory_range to its byte pointer through the
/// interpreter.
///
/// @return always kcr_done.
kernel_call_result concat(concat_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *dst = interpreter.memory_at<uint8_t>(options.output).data();

    kernels::neutral::concat(options.inputs, dst, options.dims, options.inner_size, options.outer_size,
        [&](const memory_range &range) { return interpreter.memory_at<uint8_t>(range).data(); });
    return kcr_done;
}
|
||||
|
||||
/// Target-neutral float convolution op (with `options.groups`).
///
/// Resolves the input and output ranges to float buffers and forwards them,
/// with the weights, bias and geometry from `options`, to
/// kernels::neutral::conv2d. Synchronous; `step` is unused.
///
/// @return always kcr_done.
kernel_call_result conv2d(conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *src = interpreter.memory_at<float>(options.input).data();
    auto *dst = interpreter.memory_at<float>(options.output).data();

    kernels::neutral::conv2d(src, dst, options.weights.data(), options.bias.data(),
        options.in_shape, options.groups, options.out_channels,
        options.filter_h, options.filter_w,
        options.stride_h, options.stride_w,
        options.dilation_h, options.dilation_w,
        options.padding_h, options.padding_w,
        options.fused_activation);
    return kcr_done;
}
|
||||
|
||||
/// Converts a uint8 buffer to float using `options.quant_param`.
///
/// The element count passed to kernels::neutral::dequantize is the size of the
/// uint8 input span.
///
/// @return always kcr_done.
kernel_call_result dequantize(dequantize_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto quantized = interpreter.memory_at<uint8_t>(options.input);
    auto *real = interpreter.memory_at<float>(options.output).data();

    kernels::neutral::dequantize(quantized.data(), real, quantized.size(), options.quant_param);
    return kcr_done;
}
|
||||
|
||||
/// Float matrix multiplication op.
///
/// Resolves both inputs and the output to float buffers and calls
/// kernels::neutral::matmul with the bias, the a_rows / a_cols / b_cols
/// dimensions and the fused activation from `options`.
///
/// @return always kcr_done.
kernel_call_result matmul(matmul_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *lhs = interpreter.memory_at<float>(options.input_a).data();
    auto *rhs = interpreter.memory_at<float>(options.input_b).data();
    auto *result = interpreter.memory_at<float>(options.output).data();

    kernels::neutral::matmul(lhs, rhs, result, options.bias.data(),
        options.a_rows, options.a_cols, options.b_cols, options.fused_activation);
    return kcr_done;
}
|
||||
|
||||
/// Copies the bytes of `options.input` to `options.output`.
///
/// The buffers are viewed as raw bytes, like the other layout-only ops in this
/// file (concat, pad, transpose), so the copy does not depend on the element
/// type. The previous float view presumably dropped trailing bytes whenever
/// the range size was not a multiple of sizeof(float) (assuming the typed
/// memory_at divides the byte size by the element size; TODO confirm).
/// For float-sized ranges the result is unchanged.
///
/// The output range must be at least as large as the input range.
///
/// @return always kcr_done.
kernel_call_result memory_copy(memory_copy_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto input = interpreter.memory_at<uint8_t>(options.input);
    auto output = interpreter.memory_at<uint8_t>(options.output);

    std::copy(input.begin(), input.end(), output.begin());
    return kcr_done;
}
|
||||
|
||||
/// Pads a tensor according to `options.paddings`, filling with
/// `options.pad_value`.
///
/// The element width comes from `options.input.datatype`; ELEM_SIZE_IMPL picks
/// the unsigned integer type of that width and makes this function return
/// kcr_error for unsupported widths.
///
/// @return kcr_done on success, kcr_error for an unsupported element size.
kernel_call_result pad(pad_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto src = interpreter.memory_at<uint8_t>(options.input);
    auto dst = interpreter.memory_at<uint8_t>(options.output);

#define PAD_KERNEL(T)                                                                                 \
    kernels::neutral::pad(reinterpret_cast<const T *>(src.data()), reinterpret_cast<T *>(dst.data()), \
        options.in_shape, options.paddings, options.pad_value.as<T>())

    ELEM_SIZE_IMPL(options.input.datatype, PAD_KERNEL);

#undef PAD_KERNEL

    return kcr_done;
}
|
||||
|
||||
/// Converts a float buffer to uint8 using `options.quant_param`.
///
/// The element count passed to kernels::neutral::quantize is the size of the
/// float input span.
///
/// @return always kcr_done.
kernel_call_result quantize(quantize_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto real = interpreter.memory_at<float>(options.input);
    auto *quantized = interpreter.memory_at<uint8_t>(options.output).data();

    kernels::neutral::quantize(real.data(), quantized, real.size(), options.quant_param);
    return kcr_done;
}
|
||||
|
||||
/// Reduces a float tensor from `options.in_shape` to `options.out_shape`.
///
/// - reduce_mean: sums with `a + b`, then multiplies every output element by
///   output.size() / input.size() in place.
/// - reduce_min / reduce_max: fold with std::min / std::max.
///
/// @return kcr_done for the three supported ops, kcr_error otherwise.
kernel_call_result reduce(reduce_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto input = interpreter.memory_at<float>(options.input);
    auto output = interpreter.memory_at<float>(options.output);

    // Binds everything except the folding functor.
    auto run_reduce = [&](auto op) {
        kernels::neutral::reduce(input.data(), output.data(), options.init_value, options.in_shape, options.out_shape, op);
    };

    switch (options.reduce_op)
    {
    case reduce_mean:
    {
        run_reduce([](auto a, auto b) { return a + b; });
        // Turn each sum into a mean: scale by outputs-per-input.
        const auto scale = static_cast<float>(output.size()) / input.size();
        kernels::neutral::unary(output.data(), output.data(), output.size(), [scale](auto a) { return a * scale; });
        break;
    }
    case reduce_min:
        run_reduce([](auto a, auto b) { return std::min(a, b); });
        break;
    case reduce_max:
        run_reduce([](auto a, auto b) { return std::max(a, b); });
        break;
    default:
        return kcr_error;
    }

    return kcr_done;
}
|
||||
|
||||
/// Target-neutral windowed 2D reduction op on float data.
///
/// `options.reduce_op` selects the pair of functors handed to
/// kernels::neutral::reduce_window2d:
///  - reduce_mean: accumulate with `a + b`, finish with `v / k`;
///  - reduce_min / reduce_max: accumulate with std::min / std::max and return
///    the accumulated value unchanged.
///
/// @return kcr_done for the three supported ops, kcr_error otherwise.
kernel_call_result reduce_window2d(reduce_window2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto input = interpreter.memory_at<float>(options.input);
    auto output = interpreter.memory_at<float>(options.output);

    // Binds everything except the two functors, which differ per reduce op.
    auto run_window = [&](auto accumulate, auto finish) {
        kernels::neutral::reduce_window2d(input.data(), output.data(), options.init_value, options.in_shape,
            options.filter_h, options.filter_w,
            options.stride_h, options.stride_w,
            options.dilation_h, options.dilation_w,
            options.padding_h, options.padding_w,
            options.fused_activation, accumulate, finish);
    };

    switch (options.reduce_op)
    {
    case reduce_mean:
        run_window([](auto a, auto b) { return a + b; }, [](auto v, auto k) { return v / k; });
        break;
    case reduce_min:
        run_window([](auto a, auto b) { return std::min(a, b); }, [](auto v, auto k) { return v; });
        break;
    case reduce_max:
        run_window([](auto a, auto b) { return std::max(a, b); }, [](auto v, auto k) { return v; });
        break;
    default:
        return kcr_error;
    }

    return kcr_done;
}
|
||||
|
||||
/// Bilinear resize of a float tensor to `options.out_h` x `options.out_w`.
///
/// Forwards the float input/output buffers, the input shape and the
/// align_corners flag to kernels::neutral::resize_bilinear.
///
/// @return always kcr_done.
kernel_call_result resize_bilinear(resize_bilinear_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *src = interpreter.memory_at<float>(options.input).data();
    auto *dst = interpreter.memory_at<float>(options.output).data();

    kernels::neutral::resize_bilinear(src, dst, options.in_shape, options.out_h, options.out_w, options.align_corners);
    return kcr_done;
}
|
||||
|
||||
/// Nearest-neighbour resize to `options.out_h` x `options.out_w`.
///
/// The element width comes from `options.input.datatype`; ELEM_SIZE_IMPL picks
/// the unsigned integer type of that width and makes this function return
/// kcr_error for unsupported widths.
///
/// @return kcr_done on success, kcr_error for an unsupported element size.
kernel_call_result resize_nearest_neighbor(resize_nearest_neighbor_options &options, interpreter_t &interpreter, runtime::interpreter_step_t step)
{
    auto src = interpreter.memory_at<uint8_t>(options.input);
    auto dst = interpreter.memory_at<uint8_t>(options.output);

#define RESIZE_NN_KERNEL(T)                                                                                              \
    kernels::neutral::resize_nearest_neighbor(reinterpret_cast<const T *>(src.data()), reinterpret_cast<T *>(dst.data()), \
        options.in_shape, options.out_h, options.out_w)

    ELEM_SIZE_IMPL(options.input.datatype, RESIZE_NN_KERNEL);

#undef RESIZE_NN_KERNEL

    return kcr_done;
}
|
||||
|
||||
/// Softmax over float data.
///
/// Forwards the float input/output buffers with `options.beta`,
/// `options.outer_size` and `options.inner_size` to kernels::neutral::softmax.
///
/// @return always kcr_done.
kernel_call_result softmax(softmax_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto *src = interpreter.memory_at<float>(options.input).data();
    auto *dst = interpreter.memory_at<float>(options.output).data();

    kernels::neutral::softmax(src, dst, options.beta, options.outer_size, options.inner_size);
    return kcr_done;
}
|
||||
|
||||
/// Permutes the axes of a tensor according to `options.perm`.
///
/// The element width comes from `options.input.datatype`; ELEM_SIZE_IMPL picks
/// the unsigned integer type of that width and makes this function return
/// kcr_error for unsupported widths.
///
/// @return kcr_done on success, kcr_error for an unsupported element size.
kernel_call_result transpose(transpose_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto src = interpreter.memory_at<uint8_t>(options.input);
    auto dst = interpreter.memory_at<uint8_t>(options.output);

#define TRANSPOSE_KERNEL(T)                                                                                 \
    kernels::neutral::transpose(reinterpret_cast<const T *>(src.data()), reinterpret_cast<T *>(dst.data()), \
        options.in_shape, options.perm)

    ELEM_SIZE_IMPL(options.input.datatype, TRANSPOSE_KERNEL);

#undef TRANSPOSE_KERNEL

    return kcr_done;
}
|
||||
|
||||
/// Extracts a strided slice described by `options.begin`, `options.end` and
/// `options.strides`.
///
/// The element width comes from `options.input.datatype`; ELEM_SIZE_IMPL picks
/// the unsigned integer type of that width and makes this function return
/// kcr_error for unsupported widths.
///
/// @return kcr_done on success, kcr_error for an unsupported element size.
kernel_call_result strided_slice(strided_slice_options &options, interpreter_t &interpreter, interpreter_step_t step)
{
    auto src = interpreter.memory_at<uint8_t>(options.input);
    auto dst = interpreter.memory_at<uint8_t>(options.output);

#define STRIDED_SLICE_KERNEL(T)                                                                                 \
    kernels::neutral::strided_slice(reinterpret_cast<const T *>(src.data()), reinterpret_cast<T *>(dst.data()), \
        options.in_shape, options.begin, options.end, options.strides)

    ELEM_SIZE_IMPL(options.input.datatype, STRIDED_SLICE_KERNEL);

#undef STRIDED_SLICE_KERNEL

    return kcr_done;
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2017, Sylvain Corlay and Johan Mabille
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,66 @@
|
|||
# ![xtl](docs/source/xtl.svg)
|
||||
|
||||
[![Travis](https://travis-ci.org/QuantStack/xtl.svg?branch=master)](https://travis-ci.org/QuantStack/xtl)
|
||||
[![Appveyor](https://ci.appveyor.com/api/projects/status/g9bldap2wirlue9w?svg=true)](https://ci.appveyor.com/project/QuantStack/xtl)
|
||||
[![Azure](https://dev.azure.com/johanmabille/johanmabille/_apis/build/status/QuantStack.xtl?branchName=master)](https://dev.azure.com/johanmabille/johanmabille/_build/latest?definitionId=1&branchName=master)
|
||||
[![Documentation Status](http://readthedocs.org/projects/xtl/badge/?version=latest)](https://xtl.readthedocs.io/en/latest/?badge=latest)
|
||||
[![Join the Gitter Chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/QuantStack/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
|
||||
Basic tools (containers, algorithms) used by other QuantStack packages.
|
||||
|
||||
## Installation
|
||||
|
||||
`xtl` is a header-only library. We provide a package for the conda package manager.
|
||||
|
||||
```bash
|
||||
conda install -c conda-forge xtl
|
||||
```
|
||||
|
||||
Or you can directly install it from the sources:
|
||||
|
||||
```bash
|
||||
cmake -DCMAKE_INSTALL_PREFIX=your_install_prefix
|
||||
make install
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
To get started with using `xtl`, check out the full documentation
|
||||
|
||||
http://xtl.readthedocs.io/
|
||||
|
||||
|
||||
## Building the HTML documentation
|
||||
|
||||
xtl's documentation is built with three tools
|
||||
|
||||
- [doxygen](http://www.doxygen.org)
|
||||
- [sphinx](http://www.sphinx-doc.org)
|
||||
- [breathe](https://breathe.readthedocs.io)
|
||||
|
||||
While doxygen must be installed separately, you can install breathe by typing
|
||||
|
||||
```bash
|
||||
pip install breathe
|
||||
```
|
||||
|
||||
Breathe can also be installed with `conda`
|
||||
|
||||
```bash
|
||||
conda install -c conda-forge breathe
|
||||
```
|
||||
|
||||
Finally, build the documentation with
|
||||
|
||||
```bash
|
||||
make html
|
||||
```
|
||||
|
||||
from the `docs` subdirectory.
|
||||
|
||||
## License
|
||||
|
||||
We use a shared copyright model that enables all contributors to maintain the
|
||||
copyright on their contributions.
|
||||
|
||||
This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details.
|
|
@ -0,0 +1,20 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2016, Sylvain Corlay and Johan Mabille *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XTL_XSPAN_HPP
|
||||
#define XTL_XSPAN_HPP
|
||||
|
||||
#include "xspan_impl.hpp"
|
||||
|
||||
namespace xtl
|
||||
{
|
||||
using tcb::span;
|
||||
constexpr std::ptrdiff_t dynamic_extent = tcb::dynamic_extent;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,778 @@
|
|||
// https://github.com/tcbrindle/span/blob/master/include/tcb/span.hpp
|
||||
// TCB SPAN @commit cd0c6d0
|
||||
|
||||
/*
|
||||
This is an implementation of std::span from P0122R7
|
||||
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0122r7.pdf
|
||||
*/
|
||||
|
||||
// Copyright Tristan Brindle 2018.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file ../../LICENSE_1_0.txt or copy at
|
||||
// https://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef TCB_SPAN_HPP_INCLUDED
|
||||
#define TCB_SPAN_HPP_INCLUDED
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <type_traits>
|
||||
|
||||
#ifndef TCB_SPAN_NO_EXCEPTIONS
|
||||
// Attempt to discover whether we're being compiled with exception support
|
||||
#if !(defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND))
|
||||
#define TCB_SPAN_NO_EXCEPTIONS
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef TCB_SPAN_NO_EXCEPTIONS
|
||||
#include <cstdio>
|
||||
#include <stdexcept>
|
||||
#endif
|
||||
|
||||
// Various feature test macros
|
||||
|
||||
#ifndef TCB_SPAN_NAMESPACE_NAME
|
||||
#define TCB_SPAN_NAMESPACE_NAME tcb
|
||||
#endif
|
||||
|
||||
#ifdef TCB_SPAN_STD_COMPLIANT_MODE
|
||||
#define TCB_SPAN_NO_DEPRECATION_WARNINGS
|
||||
#endif
|
||||
|
||||
#ifndef TCB_SPAN_NO_DEPRECATION_WARNINGS
|
||||
#define TCB_SPAN_DEPRECATED_FOR(msg) [[deprecated(msg)]]
|
||||
#else
|
||||
#define TCB_SPAN_DEPRECATED_FOR(msg)
|
||||
#endif
|
||||
|
||||
#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
|
||||
#define TCB_SPAN_HAVE_CPP17
|
||||
#endif
|
||||
|
||||
#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
|
||||
#define TCB_SPAN_HAVE_CPP14
|
||||
#endif
|
||||
|
||||
namespace TCB_SPAN_NAMESPACE_NAME {
|
||||
|
||||
// Establish default contract checking behavior
|
||||
#if !defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION) && \
|
||||
!defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION) && \
|
||||
!defined(TCB_SPAN_NO_CONTRACT_CHECKING)
|
||||
#if defined(NDEBUG) || !defined(TCB_SPAN_HAVE_CPP14)
|
||||
#define TCB_SPAN_NO_CONTRACT_CHECKING
|
||||
#else
|
||||
#define TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(TCB_SPAN_THROW_ON_CONTRACT_VIOLATION)
|
||||
/// Exception thrown when a span contract check fails and
/// TCB_SPAN_THROW_ON_CONTRACT_VIOLATION is defined.
struct contract_violation_error : std::logic_error {
    explicit contract_violation_error(const char* msg)
        : std::logic_error(msg)
    {
    }
};

/// Reports a failed contract check by throwing contract_violation_error
/// carrying `msg`.
inline void contract_violation(const char* msg)
{
    throw contract_violation_error{msg};
}
|
||||
|
||||
#elif defined(TCB_SPAN_TERMINATE_ON_CONTRACT_VIOLATION)
|
||||
/// Reports a failed contract check by calling std::terminate(); the message
/// argument is accepted for signature compatibility but ignored.
[[noreturn]] inline void contract_violation(const char* /*msg*/)
{
    std::terminate();
}
|
||||
#endif
|
||||
|
||||
#if !defined(TCB_SPAN_NO_CONTRACT_CHECKING)
#define TCB_SPAN_STRINGIFY(cond) #cond
// Evaluates `cond`; when it is false, calls contract_violation() with the
// text "Expected <cond>". Both the argument and the whole expansion are
// parenthesised so that conditions containing low-precedence operators
// (?:, =, ,) and uses inside larger expressions parse as intended.
#define TCB_SPAN_EXPECT(cond) \
    ((cond) ? (void) 0 : contract_violation("Expected " TCB_SPAN_STRINGIFY(cond)))
#else
// Contract checking disabled: the check expands to nothing.
#define TCB_SPAN_EXPECT(cond)
#endif
|
||||
|
||||
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_inline_variables)
|
||||
#define TCB_SPAN_INLINE_VAR inline
|
||||
#else
|
||||
#define TCB_SPAN_INLINE_VAR
|
||||
#endif
|
||||
|
||||
#if defined(TCB_SPAN_HAVE_CPP14) || \
|
||||
(defined(__cpp_constexpr) && __cpp_constexpr >= 201304)
|
||||
#define TCB_SPAN_CONSTEXPR14 constexpr
|
||||
#else
|
||||
#define TCB_SPAN_CONSTEXPR14
|
||||
#endif
|
||||
|
||||
#if defined(TCB_SPAN_NO_CONTRACT_CHECKING)
|
||||
#define TCB_SPAN_CONSTEXPR11 constexpr
|
||||
#else
|
||||
#define TCB_SPAN_CONSTEXPR11 TCB_SPAN_CONSTEXPR14
|
||||
#endif
|
||||
|
||||
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_deduction_guides)
|
||||
#define TCB_SPAN_HAVE_DEDUCTION_GUIDES
|
||||
#endif
|
||||
|
||||
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_byte)
|
||||
#define TCB_SPAN_HAVE_STD_BYTE
|
||||
#endif
|
||||
|
||||
#if defined(TCB_SPAN_HAVE_CPP17) || defined(__cpp_lib_array_constexpr)
|
||||
#define TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC
|
||||
#endif
|
||||
|
||||
#if defined(TCB_SPAN_HAVE_CONSTEXPR_STD_ARRAY_ETC)
|
||||
#define TCB_SPAN_ARRAY_CONSTEXPR constexpr
|
||||
#else
|
||||
#define TCB_SPAN_ARRAY_CONSTEXPR
|
||||
#endif
|
||||
|
||||
// Byte type used by this header: std::byte when TCB_SPAN_HAVE_STD_BYTE is
// defined, unsigned char otherwise.
#ifdef TCB_SPAN_HAVE_STD_BYTE
|
||||
using byte = std::byte;
|
||||
#else
|
||||
using byte = unsigned char;
|
||||
#endif
|
||||
|
||||
// Extent value (-1) used as the default Extent of span; the span_storage
// specialization for this value stores the size as a data member.
TCB_SPAN_INLINE_VAR constexpr std::ptrdiff_t dynamic_extent = -1;
|
||||
|
||||
// Forward declaration; Extent defaults to dynamic_extent.
template <typename ElementType, std::ptrdiff_t Extent = dynamic_extent>
|
||||
class span;
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// Storage for a span with a compile-time extent: only the pointer is stored,
/// the size is the template argument `S`.
template <typename E, std::ptrdiff_t S>
struct span_storage {
    E* ptr = nullptr;
    static constexpr std::ptrdiff_t size = S;

    constexpr span_storage() noexcept = default;

    /// The second argument (a size) is ignored because the extent is fixed.
    constexpr span_storage(E* ptr, std::ptrdiff_t /*unused*/) noexcept : ptr(ptr) {}
};
|
||||
|
||||
template <typename E>
|
||||
struct span_storage<E, dynamic_extent> {
|
||||
constexpr span_storage() noexcept = default;
|
||||
|
||||
constexpr span_storage(E* ptr, std::size_t size) noexcept
|
||||
: ptr(ptr), size(size)
|
||||
{}
|
||||
|
||||
E* ptr = nullptr;
|
||||
std::size_t size = 0;
|
||||
};
|
||||
|
||||
// C++17 std::size() and std::data(): taken from the standard library when it
// provides them, reimplemented below otherwise.
#if defined(__cpp_lib_nonmember_container_access) || \
    defined(TCB_SPAN_HAVE_CPP17)
using std::data;
using std::size;
#else
/// Element count of a container exposing size().
template <class Container>
constexpr auto size(const Container& cont) -> decltype(cont.size())
{
    return cont.size();
}

/// Element count of a built-in array.
template <class T, std::size_t N>
constexpr std::size_t size(const T (&)[N]) noexcept
{
    return N;
}

/// Pointer to the storage of a mutable container exposing data().
template <class Container>
constexpr auto data(Container& cont) -> decltype(cont.data())
{
    return cont.data();
}

/// Pointer to the storage of a const container exposing data().
template <class Container>
constexpr auto data(const Container& cont) -> decltype(cont.data())
{
    return cont.data();
}

/// Pointer to the first element of a built-in array.
template <class T, std::size_t N>
constexpr T* data(T (&arr)[N]) noexcept
{
    return arr;
}

/// Pointer to the first element of an initializer_list.
template <class E>
constexpr const E* data(std::initializer_list<E> list) noexcept
{
    return list.begin();
}
#endif // TCB_SPAN_HAVE_CPP17
|
||||
|
||||
// void_t: maps any list of types to void. The standard alias is used when
// available; otherwise a local alias is defined.
#if !defined(TCB_SPAN_HAVE_CPP17) && !defined(__cpp_lib_void_t)
template <typename...>
using void_t = void;
#else
using std::void_t;
#endif
|
||||
|
||||
/// T with any reference removed first and then top-level const/volatile
/// removed.
template <typename T>
using uncvref_t = typename std::remove_cv<
    typename std::remove_reference<T>::type>::type;
|
||||
|
||||
template <typename>
|
||||
struct is_span : std::false_type {};
|
||||
|
||||
template <typename T, std::ptrdiff_t S>
|
||||
struct is_span<span<T, S>> : std::true_type {};
|
||||
|
||||
/// Trait: true only for specialisations of std::array.
template <typename>
struct is_std_array : std::false_type {};

template <typename Elem, std::size_t Count>
struct is_std_array<std::array<Elem, Count>> : std::true_type {};
|
||||
|
||||
template <typename, typename = void>
|
||||
struct has_size_and_data : std::false_type {};
|
||||
|
||||
template <typename T>
|
||||
struct has_size_and_data<T, void_t<decltype(detail::size(std::declval<T>())),
|
||||
decltype(detail::data(std::declval<T>()))>>
|
||||
: std::true_type {};
|
||||
|
||||
template <typename C, typename U = uncvref_t<C>>
|
||||
struct is_container {
|
||||
static constexpr bool value =
|
||||
!is_span<U>::value && !is_std_array<U>::value &&
|
||||
!std::is_array<U>::value && has_size_and_data<C>::value;
|
||||
};
|
||||
|
||||
/// Shorthand for std::remove_pointer<T>::type.
template <typename Ptr>
using remove_pointer_t = typename std::remove_pointer<Ptr>::type;
|
||||
|
||||
template <typename, typename, typename = void>
|
||||
struct is_container_element_type_compatible : std::false_type {};
|
||||
|
||||
template <typename T, typename E>
|
||||
struct is_container_element_type_compatible<
|
||||
T, E, void_t<decltype(detail::data(std::declval<T>()))>>
|
||||
: std::is_convertible<
|
||||
remove_pointer_t<decltype(detail::data(std::declval<T>()))> (*)[],
|
||||
E (*)[]> {};
|
||||
|
||||
/// Trait: true when sizeof(T) is well-formed, i.e. T is a complete type.
///
/// The default second argument must be the type of a sizeof expression
/// (std::size_t) so that the partial specialisation is selected whenever
/// sizeof(T) compiles. It is spelled std::size_t because <cstddef> is only
/// guaranteed to declare the name inside namespace std.
template <typename, typename = std::size_t>
struct is_complete : std::false_type {};

template <typename T>
struct is_complete<T, decltype(sizeof(T))> : std::true_type {};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename ElementType, std::ptrdiff_t Extent>
|
||||
class span {
|
||||
static_assert(Extent == dynamic_extent || Extent >= 0,
|
||||
"A span must have an extent greater than or equal to zero, "
|
||||
"or a dynamic extent");
|
||||
static_assert(std::is_object<ElementType>::value,
|
||||
"A span's ElementType must be an object type (not a "
|
||||
"reference type or void)");
|
||||
static_assert(detail::is_complete<ElementType>::value,
|
||||
"A span's ElementType must be a complete type (not a forward "
|
||||
"declaration)");
|
||||
static_assert(!std::is_abstract<ElementType>::value,
|
||||
"A span's ElementType cannot be an abstract class type");
|
||||
|
||||
using storage_type = detail::span_storage<ElementType, Extent>;
|
||||
|
||||
public:
|
||||
// constants and types
|
||||
using element_type = ElementType;
|
||||
using value_type = typename std::remove_cv<ElementType>::type;
|
||||
using index_type = std::size_t;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using pointer = ElementType*;
|
||||
using reference = ElementType&;
|
||||
using iterator = pointer;
|
||||
using const_iterator = const ElementType*;
|
||||
using reverse_iterator = std::reverse_iterator<iterator>;
|
||||
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
|
||||
|
||||
static constexpr index_type extent = static_cast<index_type>(Extent);
|
||||
|
||||
// [span.cons], span constructors, copy, assignment, and destructor
|
||||
template <std::ptrdiff_t E = Extent,
|
||||
typename std::enable_if<E <= 0, int>::type = 0>
|
||||
constexpr span() noexcept
|
||||
{}
|
||||
|
||||
TCB_SPAN_CONSTEXPR11 span(pointer ptr, index_type count)
|
||||
: storage_(ptr, count)
|
||||
{
|
||||
TCB_SPAN_EXPECT(extent == dynamic_extent || count == extent);
|
||||
}
|
||||
|
||||
TCB_SPAN_CONSTEXPR11 span(pointer first_elem, pointer last_elem)
|
||||
: storage_(first_elem, last_elem - first_elem)
|
||||
{
|
||||
TCB_SPAN_EXPECT(extent == dynamic_extent ||
|
||||
last_elem - first_elem == extent);
|
||||
}
|
||||
|
||||
template <
|
||||
std::size_t N, std::ptrdiff_t E = Extent,
|
||||
typename std::enable_if<
|
||||
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
|
||||
detail::is_container_element_type_compatible<
|
||||
element_type (&)[N], ElementType>::value,
|
||||
int>::type = 0>
|
||||
constexpr span(element_type (&arr)[N]) noexcept : storage_(arr, N)
|
||||
{}
|
||||
|
||||
template <
|
||||
std::size_t N, std::ptrdiff_t E = Extent,
|
||||
typename std::enable_if<
|
||||
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
|
||||
detail::is_container_element_type_compatible<
|
||||
std::array<value_type, N>&, ElementType>::value,
|
||||
int>::type = 0>
|
||||
TCB_SPAN_ARRAY_CONSTEXPR span(std::array<value_type, N>& arr) noexcept
|
||||
: storage_(arr.data(), N)
|
||||
{}
|
||||
|
||||
template <
|
||||
std::size_t N, std::ptrdiff_t E = Extent,
|
||||
typename std::enable_if<
|
||||
(E == dynamic_extent || static_cast<std::ptrdiff_t>(N) == E) &&
|
||||
detail::is_container_element_type_compatible<
|
||||
const std::array<value_type, N>&, ElementType>::value,
|
||||
int>::type = 0>
|
||||
TCB_SPAN_ARRAY_CONSTEXPR span(const std::array<value_type, N>& arr) noexcept
|
||||
: storage_(arr.data(), N)
|
||||
{}
|
||||
|
||||
template <typename Container,
|
||||
typename std::enable_if<
|
||||
detail::is_container<Container>::value &&
|
||||
detail::is_container_element_type_compatible<
|
||||
Container&, ElementType>::value,
|
||||
int>::type = 0>
|
||||
TCB_SPAN_CONSTEXPR11 span(Container& cont)
|
||||
: storage_(detail::data(cont), detail::size(cont))
|
||||
{
|
||||
TCB_SPAN_EXPECT(extent == dynamic_extent ||
|
||||
static_cast<std::ptrdiff_t>(detail::size(cont)) ==
|
||||
extent);
|
||||
}
|
||||
|
||||
template <typename Container,
|
||||
typename std::enable_if<
|
||||
detail::is_container<Container>::value &&
|
||||
detail::is_container_element_type_compatible<
|
||||
const Container&, ElementType>::value,
|
||||
int>::type = 0>
|
||||
TCB_SPAN_CONSTEXPR11 span(const Container& cont)
|
||||
: storage_(detail::data(cont), detail::size(cont))
|
||||
{
|
||||
TCB_SPAN_EXPECT(extent == dynamic_extent ||
|
||||
static_cast<std::ptrdiff_t>(detail::size(cont)) ==
|
||||
extent);
|
||||
}
|
||||
|
||||
constexpr span(const span& other) noexcept = default;
|
||||
|
||||
template <typename OtherElementType, std::ptrdiff_t OtherExtent,
|
||||
typename std::enable_if<
|
||||
(Extent == OtherExtent || Extent == dynamic_extent) &&
|
||||
std::is_convertible<OtherElementType (*)[],
|
||||
ElementType (*)[]>::value,
|
||||
int>::type = 0>
|
||||
constexpr span(const span<OtherElementType, OtherExtent>& other) noexcept
|
||||
: storage_(other.data(), other.size())
|
||||
{}
|
||||
|
||||
~span() noexcept = default;
|
||||
|
||||
span& operator=(const span& other) noexcept = default;
|
||||
|
||||
// [span.sub], span subviews
|
||||
template <std::ptrdiff_t Count>
|
||||
TCB_SPAN_CONSTEXPR11 span<element_type, Count> first() const
|
||||
{
|
||||
TCB_SPAN_EXPECT(Count >= 0 && Count <= size());
|
||||
return {data(), Count};
|
||||
}
|
||||
|
||||
template <std::ptrdiff_t Count>
|
||||
TCB_SPAN_CONSTEXPR11 span<element_type, Count> last() const
|
||||
{
|
||||
TCB_SPAN_EXPECT(Count >= 0 && Count <= size());
|
||||
return {data() + (size() - Count), Count};
|
||||
}
|
||||
|
||||
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent>
|
||||
using subspan_return_t =
|
||||
span<ElementType, Count != dynamic_extent
|
||||
? Count
|
||||
: (Extent != dynamic_extent ? Extent - Offset
|
||||
: dynamic_extent)>;
|
||||
|
||||
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent>
|
||||
TCB_SPAN_CONSTEXPR11 subspan_return_t<Offset, Count> subspan() const
|
||||
{
|
||||
TCB_SPAN_EXPECT((Offset >= 0 && Offset <= size()) &&
|
||||
(Count == dynamic_extent ||
|
||||
(Count >= 0 && Offset + Count <= size())));
|
||||
return {data() + Offset,
|
||||
Count != dynamic_extent
|
||||
? Count
|
||||
: (Extent != dynamic_extent ? Extent - Offset
|
||||
: size() - Offset)};
|
||||
}
|
||||
|
||||
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
|
||||
first(index_type count) const
|
||||
{
|
||||
TCB_SPAN_EXPECT(count >= 0 && count <= size());
|
||||
return {data(), count};
|
||||
}
|
||||
|
||||
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
|
||||
last(index_type count) const
|
||||
{
|
||||
TCB_SPAN_EXPECT(count >= 0 && count <= size());
|
||||
return {data() + (size() - count), count};
|
||||
}
|
||||
|
||||
TCB_SPAN_CONSTEXPR11 span<element_type, dynamic_extent>
|
||||
subspan(index_type offset, index_type count = static_cast<index_type>(dynamic_extent)) const
|
||||
{
|
||||
TCB_SPAN_EXPECT((offset >= 0 && offset <= size()) &&
|
||||
(count == dynamic_extent ||
|
||||
(count >= 0 && offset + count <= size())));
|
||||
return {data() + offset,
|
||||
count == dynamic_extent ? size() - offset : count};
|
||||
}
|
||||
|
||||
// [span.obs], span observers
|
||||
constexpr index_type size() const noexcept { return storage_.size; }
|
||||
|
||||
constexpr index_type size_bytes() const noexcept
|
||||
{
|
||||
return size() * sizeof(element_type);
|
||||
}
|
||||
|
||||
constexpr bool empty() const noexcept { return size() == 0; }
|
||||
|
||||
// [span.elem], span element access
|
||||
TCB_SPAN_CONSTEXPR11 reference operator[](index_type idx) const
|
||||
{
|
||||
TCB_SPAN_EXPECT(idx >= 0 && idx < size());
|
||||
return *(data() + idx);
|
||||
}
|
||||
|
||||
/* Extension: not in P0122 */
|
||||
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
|
||||
TCB_SPAN_CONSTEXPR14 reference at(index_type idx) const
|
||||
{
|
||||
#ifndef TCB_SPAN_NO_EXCEPTIONS
|
||||
if (idx < 0 || idx >= size()) {
|
||||
char msgbuf[64] = {
|
||||
0,
|
||||
};
|
||||
std::snprintf(msgbuf, sizeof(msgbuf),
|
||||
"Index %td is out of range for span of size %td", idx,
|
||||
size());
|
||||
throw std::out_of_range{msgbuf};
|
||||
}
|
||||
#endif // TCB_SPAN_NO_EXCEPTIONS
|
||||
return this->operator[](idx);
|
||||
}
|
||||
|
||||
TCB_SPAN_CONSTEXPR11 reference front() const
|
||||
{
|
||||
TCB_SPAN_EXPECT(!empty());
|
||||
return *data();
|
||||
}
|
||||
|
||||
TCB_SPAN_CONSTEXPR11 reference back() const
|
||||
{
|
||||
TCB_SPAN_EXPECT(!empty());
|
||||
return *(data() + (size() - 1));
|
||||
}
|
||||
|
||||
#endif // TCB_SPAN_STD_COMPLIANT_MODE
|
||||
|
||||
#ifndef TCB_SPAN_NO_FUNCTION_CALL_OPERATOR
|
||||
TCB_SPAN_DEPRECATED_FOR("Use operator[] instead")
|
||||
constexpr reference operator()(index_type idx) const
|
||||
{
|
||||
return this->operator[](idx);
|
||||
}
|
||||
#endif // TCB_SPAN_NO_FUNCTION_CALL_OPERATOR
|
||||
|
||||
constexpr pointer data() const noexcept { return storage_.ptr; }
|
||||
|
||||
// [span.iterators], span iterator support
|
||||
constexpr iterator begin() const noexcept { return data(); }
|
||||
|
||||
constexpr iterator end() const noexcept { return data() + size(); }
|
||||
|
||||
constexpr const_iterator cbegin() const noexcept { return begin(); }
|
||||
|
||||
constexpr const_iterator cend() const noexcept { return end(); }
|
||||
|
||||
TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rbegin() const noexcept
|
||||
{
|
||||
return reverse_iterator(end());
|
||||
}
|
||||
|
||||
TCB_SPAN_ARRAY_CONSTEXPR reverse_iterator rend() const noexcept
|
||||
{
|
||||
return reverse_iterator(begin());
|
||||
}
|
||||
|
||||
TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crbegin() const noexcept
|
||||
{
|
||||
return const_reverse_iterator(cend());
|
||||
}
|
||||
|
||||
TCB_SPAN_ARRAY_CONSTEXPR const_reverse_iterator crend() const noexcept
|
||||
{
|
||||
return const_reverse_iterator(cbegin());
|
||||
}
|
||||
|
||||
private:
|
||||
storage_type storage_{};
|
||||
};
|
||||
|
||||
#ifdef TCB_SPAN_HAVE_DEDUCTION_GUIDES

/* Deduction Guides */

// Built-in arrays and std::array deduce a fixed extent; any other container
// deduces a dynamic-extent span over its value_type. The array length is a
// std::size_t while span's extent parameter is a std::ptrdiff_t, so it is
// converted explicitly (as make_span does).
template <class T, std::size_t N>
span(T (&)[N]) -> span<T, static_cast<std::ptrdiff_t>(N)>;

template <class T, std::size_t N>
span(std::array<T, N>&) -> span<T, static_cast<std::ptrdiff_t>(N)>;

template <class T, std::size_t N>
span(const std::array<T, N>&)
    -> span<const T, static_cast<std::ptrdiff_t>(N)>;

template <class Container>
span(Container&) -> span<typename Container::value_type>;

template <class Container>
span(const Container&) -> span<const typename Container::value_type>;

#endif // TCB_SPAN_HAVE_DEDUCTION_GUIDES
|
||||
|
||||
template <typename ElementType, std::ptrdiff_t Extent>
|
||||
constexpr span<ElementType, Extent>
|
||||
make_span(span<ElementType, Extent> s) noexcept
|
||||
{
|
||||
return s;
|
||||
}
|
||||
|
||||
#define AS_SIGNED(N) static_cast<std::ptrdiff_t>(N)
|
||||
|
||||
template <typename T, std::size_t N>
|
||||
constexpr span<T, AS_SIGNED(N)> make_span(T (&arr)[N]) noexcept
|
||||
{
|
||||
return {arr};
|
||||
}
|
||||
|
||||
template <typename T, std::size_t N>
|
||||
TCB_SPAN_ARRAY_CONSTEXPR span<T, AS_SIGNED(N)> make_span(std::array<T, N>& arr) noexcept
|
||||
{
|
||||
return {arr};
|
||||
}
|
||||
|
||||
template <typename T, std::size_t N>
|
||||
TCB_SPAN_ARRAY_CONSTEXPR span<const T, AS_SIGNED(N)>
|
||||
make_span(const std::array<T, N>& arr) noexcept
|
||||
{
|
||||
return {arr};
|
||||
}
|
||||
|
||||
#undef AS_SIGNED
|
||||
|
||||
template <typename Container>
|
||||
constexpr span<typename Container::value_type> make_span(Container& cont)
|
||||
{
|
||||
return {cont};
|
||||
}
|
||||
|
||||
template <typename Container>
|
||||
constexpr span<const typename Container::value_type>
|
||||
make_span(const Container& cont)
|
||||
{
|
||||
return {cont};
|
||||
}
|
||||
|
||||
/* Comparison operators */
|
||||
// Implementation note: the implementations of == and < are equivalent to
|
||||
// 4-legged std::equal and std::lexicographical_compare respectively
|
||||
|
||||
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
|
||||
TCB_SPAN_CONSTEXPR14 bool operator==(span<T, X> lhs, span<U, Y> rhs)
|
||||
{
|
||||
if (lhs.size() != rhs.size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (std::ptrdiff_t i = 0; i < lhs.size(); i++) {
|
||||
if (lhs[i] != rhs[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
|
||||
TCB_SPAN_CONSTEXPR14 bool operator!=(span<T, X> lhs, span<U, Y> rhs)
|
||||
{
|
||||
return !(lhs == rhs);
|
||||
}
|
||||
|
||||
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
|
||||
TCB_SPAN_CONSTEXPR14 bool operator<(span<T, X> lhs, span<U, Y> rhs)
|
||||
{
|
||||
// No std::min to avoid dragging in <algorithm>
|
||||
const std::ptrdiff_t size =
|
||||
lhs.size() < rhs.size() ? lhs.size() : rhs.size();
|
||||
|
||||
for (std::ptrdiff_t i = 0; i < size; i++) {
|
||||
if (lhs[i] < rhs[i]) {
|
||||
return true;
|
||||
}
|
||||
if (lhs[i] > rhs[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return lhs.size() < rhs.size();
|
||||
}
|
||||
|
||||
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
|
||||
TCB_SPAN_CONSTEXPR14 bool operator<=(span<T, X> lhs, span<U, Y> rhs)
|
||||
{
|
||||
return !(rhs < lhs);
|
||||
}
|
||||
|
||||
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
|
||||
TCB_SPAN_CONSTEXPR14 bool operator>(span<T, X> lhs, span<U, Y> rhs)
|
||||
{
|
||||
return rhs < lhs;
|
||||
}
|
||||
|
||||
template <typename T, std::ptrdiff_t X, typename U, std::ptrdiff_t Y>
|
||||
TCB_SPAN_CONSTEXPR14 bool operator>=(span<T, X> lhs, span<U, Y> rhs)
|
||||
{
|
||||
return !(lhs < rhs);
|
||||
}
|
||||
|
||||
template <typename ElementType, std::ptrdiff_t Extent>
|
||||
span<const byte, ((Extent == dynamic_extent)
|
||||
? dynamic_extent
|
||||
: (static_cast<ptrdiff_t>(sizeof(ElementType)) * Extent))>
|
||||
as_bytes(span<ElementType, Extent> s) noexcept
|
||||
{
|
||||
return {reinterpret_cast<const byte*>(s.data()), s.size_bytes()};
|
||||
}
|
||||
|
||||
template <
|
||||
class ElementType, ptrdiff_t Extent,
|
||||
typename std::enable_if<!std::is_const<ElementType>::value, int>::type = 0>
|
||||
span<byte, ((Extent == dynamic_extent)
|
||||
? dynamic_extent
|
||||
: (static_cast<ptrdiff_t>(sizeof(ElementType)) * Extent))>
|
||||
as_writable_bytes(span<ElementType, Extent> s) noexcept
|
||||
{
|
||||
return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
|
||||
}
|
||||
|
||||
/* Extension: nonmember subview operations */
|
||||
|
||||
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
|
||||
|
||||
template <std::ptrdiff_t Count, typename T>
|
||||
TCB_SPAN_CONSTEXPR11 auto first(T& t)
|
||||
-> decltype(make_span(t).template first<Count>())
|
||||
{
|
||||
return make_span(t).template first<Count>();
|
||||
}
|
||||
|
||||
template <std::ptrdiff_t Count, typename T>
|
||||
TCB_SPAN_CONSTEXPR11 auto last(T& t)
|
||||
-> decltype(make_span(t).template last<Count>())
|
||||
{
|
||||
return make_span(t).template last<Count>();
|
||||
}
|
||||
|
||||
template <std::ptrdiff_t Offset, std::ptrdiff_t Count = dynamic_extent,
|
||||
typename T>
|
||||
TCB_SPAN_CONSTEXPR11 auto subspan(T& t)
|
||||
-> decltype(make_span(t).template subspan<Offset, Count>())
|
||||
{
|
||||
return make_span(t).template subspan<Offset, Count>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TCB_SPAN_CONSTEXPR11 auto first(T& t, std::ptrdiff_t count)
|
||||
-> decltype(make_span(t).first(count))
|
||||
{
|
||||
return make_span(t).first(count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TCB_SPAN_CONSTEXPR11 auto last(T& t, std::ptrdiff_t count)
|
||||
-> decltype(make_span(t).last(count))
|
||||
{
|
||||
return make_span(t).last(count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TCB_SPAN_CONSTEXPR11 auto subspan(T& t, std::ptrdiff_t offset,
|
||||
std::ptrdiff_t count = dynamic_extent)
|
||||
-> decltype(make_span(t).subspan(offset, count))
|
||||
{
|
||||
return make_span(t).subspan(offset, count);
|
||||
}
|
||||
|
||||
#endif // TCB_SPAN_STD_COMPLIANT_MODE
|
||||
|
||||
} // namespace TCB_SPAN_NAMESPACE_NAME
|
||||
|
||||
/* Extension: support for C++17 structured bindings */
|
||||
|
||||
#ifndef TCB_SPAN_STD_COMPLIANT_MODE
|
||||
|
||||
namespace TCB_SPAN_NAMESPACE_NAME {
|
||||
|
||||
template <std::ptrdiff_t N, typename E, std::ptrdiff_t S>
|
||||
constexpr auto get(span<E, S> s) -> decltype(s[N])
|
||||
{
|
||||
return s[N];
|
||||
}
|
||||
|
||||
} // namespace TCB_SPAN_NAMESPACE_NAME
|
||||
|
||||
namespace std {
|
||||
|
||||
template <typename E, ptrdiff_t S>
|
||||
class tuple_size<tcb::span<E, S>> : public integral_constant<size_t, static_cast<size_t>(S)> {};
|
||||
|
||||
template <typename E>
|
||||
class tuple_size<tcb::span<E, tcb::dynamic_extent>>; // not defined
|
||||
|
||||
template <size_t N, typename E, ptrdiff_t S>
|
||||
class tuple_element<N, tcb::span<E, S>> {
|
||||
public:
|
||||
using type = E;
|
||||
};
|
||||
|
||||
} // end namespace std
|
||||
|
||||
#endif // TCB_SPAN_STD_COMPLIANT_MODE
|
||||
|
||||
#endif // TCB_SPAN_HPP_INCLUDED
|
Loading…
Reference in New Issue