Coyote/examples_hw/apps/gbm_dtrees/hdl/pe_datapath.sv

369 lines
17 KiB
Systemverilog

/*
* Copyright 2019 - 2020 Systems Group, ETH Zurich
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
PE Datapath flow:
- Read Tree Node: This stage consumes either the input tree instruction or the next node instruction.
This stage issues a read address to tree memory and delays all the necessary information
for the tree memory delay. By the end of the stage, the tree node basic and extra part are
available in addition to the delayed information.
- Read feature: Use the feature index in the tree node to compute the feature address and read it from
the data memory. Then, we delay the current node instruction and obtained tree node info.
- Read Split Set: In this stage, we compute the address for large bitset word to read from
tree memory, If there is no large split bitset then the tree memory output is ignored.
In addition, the early tree evaluation occurs. In parallel, we use the feature value to select
from the small bitset, we use it also to compare to the split value, and we use the feature value
to check if it is outside the boundaries of the large bitset. In addition, If node_nop flag is zero,
then the node_res_val is not used, hence we store W2 of the tree node in its place to save resources.
In addition, we check if the feature value is NaN or not.
- Evaluate Node: At this stage we have all the necessary information to process a node.
Then based on the operation type the evaluation happens.
Pick A branch: In this stage, based on the node evaluation result then we compute next node
to evaluate, or pick result if no children on the next branch and set NOP if we
are not at the last level. In this branch, also if we at the last level, then we
output the result if the tree is not empty.
*/
import DTPackage::*;
module pe_datapath (
input wire clk,
input wire rst_n,
input wire [MAX_TREE_DEPTH_BITS-1:0] tree_depth,
// tree instruction
input TreeInstruction_t tree_instruction,
input wire tree_instruction_valid,
output wire tree_instruction_ready,
// data memory read port
output wire [TUPLE_OFFSET_BITS:0] data_mem_rd_addr,
output wire data_mem_ren,
input wire [31:0] data_mem_feature,
// tree memory read port A
output wire [TREE_OFFSET_BITS:0] tree_mem_rd_addr_a,
output wire tree_mem_ren_a,
input TreeNode_t tree_node_basic,
// tree memory read port B
output wire [TREE_OFFSET_BITS:0] tree_mem_rd_addr_b,
output wire tree_mem_ren_b,
input wire [31:0] node_large_bitset,
// result output
output wire [31:0] tree_eval_result,
output wire tree_eval_last,
output wire tree_eval_result_valid
);
NodeInstruction_t next_node_instr;
wire next_node_instr_valid;
NodeInstruction_t curr_node_instr_s1;
wire curr_node_instr_valid_s1;
NodeInstruction_t curr_node_instr_s2;
wire curr_node_instr_valid_s2;
NodeInstruction_t curr_node_instr_s3;
NodeInstruction_t curr_node_instr_s3_1;
wire curr_node_instr_valid_s3;
NodeInstruction_t curr_node_instr_s4;
NodeInstruction_t curr_node_instr_s4_1;
wire curr_node_instr_valid_s4;
NodeInstruction_t curr_node_instr_s5;
NodeInstruction_t curr_node_instr_s5_1;
wire curr_node_instr_valid_s5;
wire [31:0] small_bitset;
wire is_Feature_NaN;
wire small_bitset_eval;
wire split_value_eval;
wire feature_data_outside_bitset;
wire is_Feature_NaN_s4;
wire small_bitset_eval_s4;
wire split_value_eval_s4;
wire feature_data_outside_bitset_s4;
wire [4:0] feature_data_s4;
wire is_Feature_NaN_s5;
TreeNode_t tree_node_basic_s3;
TreeNode_t tree_node_basic_s4;
TreeNode_t tree_node_basic_s5;
reg go_right;
reg node_eval;
wire [10:0] left_child_offset_1;
wire [10:0] left_child_offset_2;
wire [10:0] right_child_offset;
wire [10:0] next_node_address;
wire [MAX_TREE_DEPTH_BITS-1:0] next_node_level;
wire next_node_nop;
wire [10:0] next_child_offset;
wire [10:0] result_offset;
wire next_node_leaf;
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////// Read Node Stage /////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
*/
always @(*) begin
if(next_node_instr_valid) begin
curr_node_instr_s1 = next_node_instr;
end
else begin
curr_node_instr_s1 = '{node_address: {tree_instruction.tree_offset, 1'b0},
tuple_offset: tree_instruction.tuple_offset,
node_level : 3'b000,
empty_tree : tree_instruction.empty_tree,
node_nop : 1'b0,
last_tree : tree_instruction.last_tree,
leaf_node : 1'b0,
node_res_val: 32'b0
};
end
end
assign curr_node_instr_valid_s1 = next_node_instr_valid | tree_instruction_valid;
assign tree_instruction_ready = !next_node_instr_valid;
// Send read requests to tree memory
assign tree_mem_rd_addr_a = curr_node_instr_s1.node_address;
assign tree_mem_ren_a = curr_node_instr_valid_s1;
// Pipeline to next stage
delay #(.DATA_WIDTH( $bits(NodeInstruction_t)),
.DELAY_CYCLES(TREE_MEM_READ_LATENCY)
) ReadNodeStageDelay(
.clk ( clk ),
.rst_n ( rst_n ),
.data_in ( curr_node_instr_s1 ), //
.data_in_valid ( curr_node_instr_valid_s1 ),
.data_out ( curr_node_instr_s2 ),
.data_out_valid ( curr_node_instr_valid_s2 )
);
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////// Read Feature Stage /////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
This stage reads the feature data corresponding to current tree node.
*/
assign data_mem_ren = curr_node_instr_valid_s2 && (tree_node_basic.node_type.findex != 8'hFF);
assign data_mem_rd_addr = {curr_node_instr_s2.tuple_offset, 1'b0} + tree_node_basic.node_type.findex; // 10-bit address
// Pipeline to next stage
delay #(.DATA_WIDTH($bits(NodeInstruction_t) + $bits(TreeNode_t)),
.DELAY_CYCLES(DATA_MEM_READ_LATENCY)
) ReadFeatureStageDelay(
.clk (clk),
.rst_n (rst_n),
.data_in ( {curr_node_instr_s2, tree_node_basic} ), //
.data_in_valid ( curr_node_instr_valid_s2 ),
.data_out ( {curr_node_instr_s3, tree_node_basic_s3} ),
.data_out_valid ( curr_node_instr_valid_s3 )
);
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////// Read Split Set/Value Stage /////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
assign tree_mem_rd_addr_b = curr_node_instr_s3.node_address +
((curr_node_instr_s3.leaf_node)? 0 :
(((!tree_node_basic_s3.node_type.left_child)? 1'b1 : 0) +
((!tree_node_basic_s3.node_type.right_child)? 1'b1 : 0) +
data_mem_feature[15:5] + 11'b00000010));
assign tree_mem_ren_b = curr_node_instr_valid_s3;
// Early Node evaluation
assign curr_node_instr_s3_1 = '{ node_address: curr_node_instr_s3.node_address,
tuple_offset: curr_node_instr_s3.tuple_offset,
node_level : curr_node_instr_s3.node_level,
empty_tree : curr_node_instr_s3.empty_tree,
node_nop : ((curr_node_instr_s3.node_nop)? 1'b1 : !curr_node_instr_s3.leaf_node && (tree_node_basic_s3.node_type.findex == 8'hFF) ),
last_tree : curr_node_instr_s3.last_tree,
leaf_node : curr_node_instr_s3.leaf_node,
node_res_val: ( (curr_node_instr_s3.node_nop)? curr_node_instr_s3.node_res_val : {tree_node_basic_s3.word_1_h, tree_node_basic_s3.word_1_l})
};
assign small_bitset = {tree_node_basic_s3.word_1_h, tree_node_basic_s3.word_1_l};
assign is_Feature_NaN = (data_mem_feature[30:23] == 7'b1111111) && (|data_mem_feature[22:0]);
assign small_bitset_eval = (data_mem_feature[31:5] == 0)? small_bitset[ data_mem_feature[4:0] ] : 1'b0;
assign split_value_eval = data_mem_feature < {tree_node_basic_s3.word_1_h, tree_node_basic_s3.word_1_l};
assign feature_data_outside_bitset = (data_mem_feature < tree_node_basic_s3.word_1_h) ||
(data_mem_feature > (tree_node_basic_s3.word_1_h + (tree_node_basic_s3.word_1_l << 5) ) );
// Pipeline to next stage
delay #(.DATA_WIDTH($bits(NodeInstruction_t) + $bits(TreeNode_t) + 5 + 1 + 1 + 1 + 1),
.DELAY_CYCLES(TREE_MEM_READ_LATENCY)
) ReadSplitSetStageDelay(
.clk (clk),
.rst_n (rst_n),
.data_in ( {curr_node_instr_s3_1, tree_node_basic_s3, data_mem_feature[4:0], small_bitset_eval, split_value_eval, feature_data_outside_bitset, is_Feature_NaN} ), //
.data_in_valid ( curr_node_instr_valid_s3 ),
.data_out ( {curr_node_instr_s4, tree_node_basic_s4, feature_data_s4, small_bitset_eval_s4, split_value_eval_s4, feature_data_outside_bitset_s4, is_Feature_NaN_s4} ),
.data_out_valid ( curr_node_instr_valid_s4 )
);
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////// Node Evaluation Stage /////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
*/
always@(posedge clk)begin
if(~rst_n) begin
node_eval <= 1'b0;
end
else begin
case (tree_node_basic_s4.node_type.op_type)
2'b00: node_eval <= split_value_eval_s4;
2'b10: node_eval <= small_bitset_eval_s4;
2'b11: node_eval <= (feature_data_outside_bitset_s4)? 1'b0 : node_large_bitset[ feature_data_s4 ];
default : node_eval <= 1'b0;
endcase
end
end
// Early Node evaluation
assign curr_node_instr_s4_1 = '{ node_address: curr_node_instr_s4.node_address,
tuple_offset: curr_node_instr_s4.tuple_offset,
node_level : curr_node_instr_s4.node_level,
empty_tree : curr_node_instr_s4.empty_tree,
node_nop : curr_node_instr_s4.node_nop,
last_tree : curr_node_instr_s4.last_tree,
leaf_node : curr_node_instr_s4.leaf_node,
node_res_val: ( (curr_node_instr_s4.node_nop || !curr_node_instr_s4.leaf_node)? curr_node_instr_s4.node_res_val : node_large_bitset )
};
// Pipeline to next stage
delay #(.DATA_WIDTH($bits(NodeInstruction_t) + $bits(TreeNode_t) + 1),
.DELAY_CYCLES(1)
) EvalNodeStageDelay(
.clk (clk),
.rst_n (rst_n),
.data_in ( {curr_node_instr_s4_1, tree_node_basic_s4, is_Feature_NaN_s4} ), //
.data_in_valid ( curr_node_instr_valid_s4 ),
.data_out ( {curr_node_instr_s5, tree_node_basic_s5, is_Feature_NaN_s5} ),
.data_out_valid ( curr_node_instr_valid_s5 )
);
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////// Pick Branch Stage /////////////////////////
////////////////////////////////////// //////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
/*
*/
always@(*)begin
if(tree_node_basic_s5.node_type.split_dir == 4'b0001) begin
go_right = is_Feature_NaN_s5;
end
else if(tree_node_basic_s5.node_type.split_dir[0] == 1'b0) begin
if(is_Feature_NaN_s5) begin
go_right = 1'b0;
end
else begin
go_right = node_eval;
end
end
else begin
if(is_Feature_NaN_s5) begin
go_right = 1'b1;
end
else begin
go_right = node_eval;
end
end
end
// compute left child offset,
assign left_child_offset_1 = 11'b00000000010 + ((tree_node_basic_s5.node_type.left_child)? 0 : 1'b1) + ((tree_node_basic_s5.node_type.right_child)? 0 : 1'b1);
assign left_child_offset_2 = left_child_offset_1 + ( (tree_node_basic_s5.node_type.op_type != 3)? 11'd0 : tree_node_basic_s5.word_1_l[10:0]);
// compute right child offset
assign right_child_offset = left_child_offset_2 + tree_node_basic_s5.right_child_offset[10:0];
assign next_child_offset = (go_right && tree_node_basic_s5.node_type.left_child)? right_child_offset : left_child_offset_2;
// compute leaf node result offset
assign result_offset = (go_right)? 11'b00000000010 + ((tree_node_basic_s5.node_type.left_child)? 0 : 1'b1) : 11'b00000000010;
assign next_node_leaf = (go_right)? !tree_node_basic_s5.node_type.right_child : !tree_node_basic_s5.node_type.left_child;
// Next level node address
assign next_node_address = curr_node_instr_s5.node_address + ( (next_node_leaf)? result_offset : next_child_offset);
assign next_node_level = (curr_node_instr_s5.node_level == (tree_depth))? (tree_depth) : curr_node_instr_s5.node_level + 1'b1;
assign next_node_nop = curr_node_instr_s5.node_nop || ((curr_node_instr_s5.node_level < (tree_depth) && curr_node_instr_s5.leaf_node)? 1'b1 : 1'b0);
assign curr_node_instr_s5_1 = '{ node_address: next_node_address,
tuple_offset: curr_node_instr_s5.tuple_offset,
node_level : next_node_level,
empty_tree : curr_node_instr_s5.empty_tree,
node_nop : next_node_nop,
last_tree : curr_node_instr_s5.last_tree,
leaf_node : ( curr_node_instr_s5.leaf_node || next_node_leaf),
node_res_val: curr_node_instr_s5.node_res_val
};
// Pipeline to next stage
delay #(.DATA_WIDTH( $bits(NodeInstruction_t) ),
.DELAY_CYCLES(1)
) PickBranchStageDelay(
.clk (clk),
.rst_n (rst_n),
.data_in ( curr_node_instr_s5_1 ), //
.data_in_valid ( curr_node_instr_valid_s5 && (curr_node_instr_s5.node_level < (tree_depth)) ),
.data_out ( next_node_instr ),
.data_out_valid ( next_node_instr_valid )
);
assign tree_eval_result = (curr_node_instr_s5.empty_tree)? 32'b0 : curr_node_instr_s5.node_res_val;
assign tree_eval_result_valid = curr_node_instr_valid_s5 && (curr_node_instr_s5.node_level == (tree_depth) );
assign tree_eval_last = curr_node_instr_s5.last_tree;
endmodule