From 357b2bc35a03d997c336df7e03809321e52b0a81 Mon Sep 17 00:00:00 2001 From: kodario Date: Thu, 15 Jun 2023 11:44:39 +0200 Subject: [PATCH] completion test. --- driver/fpga_fops.c | 75 ++++++++++--------- hw/hdl/mmu/tlb_fsm_rd.sv | 1 + hw/hdl/mmu/tlb_region_top.sv | 26 +------ hw/hdl/network/rdma/rdma_flow.sv | 65 ++++++++-------- hw/hdl/network/rdma/rdma_req_parser.sv | 25 ------- hw/hdl/network/rdma/roce_stack.sv | 30 +++++++- hw/hdl/network/stack/network_bp_drop.sv | 49 ------------ hw/hdl/slave/cnfg_slave.sv | 8 +- hw/hdl/slave/cnfg_slave_avx.sv | 12 +-- hw/scripts/wr_hdl/template_gen/lynx_pkg.txt | 6 +- .../network/hls/rocev2/rocev2_config.hpp.in | 4 +- sw/examples/perf_rdma/main.cpp | 6 +- sw/include/cDefs.hpp | 5 +- sw/include/cProcess.hpp | 2 + sw/src/cProcess.cpp | 50 ++++++++++++- sw/src/ibvQpConn.cpp | 2 +- 16 files changed, 175 insertions(+), 191 deletions(-) diff --git a/driver/fpga_fops.c b/driver/fpga_fops.c index 4540d71..69702a5 100644 --- a/driver/fpga_fops.c +++ b/driver/fpga_fops.c @@ -49,7 +49,7 @@ int fpga_open(struct inode *inode, struct file *file) struct fpga_dev *d = container_of(inode->i_cdev, struct fpga_dev, cdev); BUG_ON(!d); - dbg_info("fpga device %d acquired, inode %ld\n", minor, inode->i_ino); + dbg_info("fpga device %d acquired, calling pid %d\n", minor, current->pid); // set private data file->private_data = (void *)d; @@ -89,7 +89,7 @@ int fpga_release(struct inode *inode, struct file *file) } } - dbg_info("fpga device %d released, inode %ld\n", minor, inode->i_ino); + dbg_info("fpga device %d released, pid %d\n", minor, current->pid); return 0; } @@ -225,32 +225,34 @@ long fpga_ioctl(struct file *file, unsigned int cmd, unsigned long arg) // register pid case IOCTL_REGISTER_PID: - spin_lock(&pd->stat_lock); + ret_val = copy_from_user(&tmp, (unsigned long *)arg, sizeof(unsigned long)); + if (ret_val != 0) { + pr_info("user data could not be coppied, return %d\n", ret_val); + } else { + spin_lock(&pd->stat_lock); - pid = current->pid; + pid = (pid_t) tmp[0]; - cpid = (uint64_t)register_pid(d, pid); - if (cpid == -1) - { - dbg_info("registration failed pid %d\n", pid); - return -1; + cpid = (int32_t)register_pid(d, pid); + if (cpid == -1) + { + dbg_info("registration failed pid %d\n", pid); + return -1; + } + + tmp_cid = kzalloc(sizeof(struct cid_entry), GFP_KERNEL); + BUG_ON(!tmp_cid); + + tmp_cid->pid = pid; + tmp_cid->cpid = cpid; + + hash_add(pid_cpid_map[d->id], &tmp_cid->entry, pid); + + // return cpid + ret_val = copy_to_user((unsigned long *)arg + 1, &cpid, sizeof(unsigned long)); + + spin_unlock(&pd->stat_lock); } - - dbg_info("registration succeeded pid %d, cpid %lld\n", pid, cpid); - - tmp_cid = kzalloc(sizeof(struct cid_entry), GFP_KERNEL); - BUG_ON(!tmp_cid); - - tmp_cid->pid = pid; - tmp_cid->cpid = cpid; - - hash_add(pid_cpid_map[d->id], &tmp_cid->entry, pid); - - // return cpid - ret_val = copy_to_user((unsigned long *)arg + 1, &cpid, sizeof(unsigned long)); - - spin_unlock(&pd->stat_lock); - break; // unregister pid @@ -259,35 +261,30 @@ long fpga_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ret_val = copy_from_user(&tmp, (unsigned long *)arg, sizeof(unsigned long)); if (ret_val != 0) { pr_info("user data could not be coppied, return %d\n", ret_val); - } - else { + } else { spin_lock(&pd->stat_lock); cpid = tmp[0]; pid = d->pid_array[cpid]; - ret_val = unregister_pid(d, cpid); // tmp[0] - cpid - if (ret_val == -1) { - dbg_info("unregistration failed cpid %lld\n", cpid); - return -1; - } - // map hash_for_each_possible(pid_cpid_map[d->id], tmp_cid, entry, pid) { if(tmp_cid->pid == pid && tmp_cid->cpid == cpid) { // unamp all leftover user pages tlb_put_user_pages_cpid(d, cpid, 1); - // unregister (if registered) - unregister_pid(d, cpid); - // Free from hash hash_del(&tmp_cid->entry); } } + ret_val = unregister_pid(d, cpid); // tmp[0] - cpid + if (ret_val == -1) { + dbg_info("unregistration failed cpid %lld\n", cpid); + return -1; + } + spin_unlock(&pd->stat_lock); - dbg_info("unregistration succeeded cpid %lld\n", tmp[0]); } break; @@ -807,6 +804,8 @@ int32_t register_pid(struct fpga_dev *d, pid_t pid) // unlock spin_unlock(&d->card_pid_lock); + dbg_info("registration succeeded pid %d, cpid %d\n", pid, cpid); + return cpid; } @@ -832,6 +831,8 @@ int unregister_pid(struct fpga_dev *d, int32_t cpid) // release lock spin_unlock(&d->card_pid_lock); + dbg_info("unregistration succeeded cpid %d\n", cpid); + return 0; } diff --git a/hw/hdl/mmu/tlb_fsm_rd.sv b/hw/hdl/mmu/tlb_fsm_rd.sv index 51baf60..23e9bd2 100644 --- a/hw/hdl/mmu/tlb_fsm_rd.sv +++ b/hw/hdl/mmu/tlb_fsm_rd.sv @@ -538,6 +538,7 @@ end ///////////////////////////////////////////////////////////////////////////// // DEBUG ///////////////////////////////////////////////////////////////////////////// +//`define DBG_TLB_FSM_RD `ifdef DBG_TLB_FSM_RD ila_fsm inst_ila_rd ( .clk(aclk), diff --git a/hw/hdl/mmu/tlb_region_top.sv b/hw/hdl/mmu/tlb_region_top.sv index 4923776..5198881 100644 --- a/hw/hdl/mmu/tlb_region_top.sv +++ b/hw/hdl/mmu/tlb_region_top.sv @@ -334,31 +334,7 @@ axis_interconnect_tlb inst_mux_stlb ( .S00_ARB_REQ_SUPPRESS(1'b0), .S01_ARB_REQ_SUPPRESS(1'b0) ); -/* -ila_tlbf_2 inst_ila_tlbf2 ( - .clk(aclk), - .probe0(axis_sTlb_0.tvalid), - .probe1(axis_sTlb_0.tready), - .probe2(axis_sTlb_0.tdata), // 128 - .probe3(axis_sTlb_0.tlast), - - .probe4(axis_sTlb_1.tvalid), - .probe5(axis_sTlb_1.tready), - .probe6(axis_sTlb_1.tdata), // 128 - .probe7(axis_sTlb_1.tlast), - - .probe8(axis_sTlb.tvalid), - .probe9(axis_sTlb.tready), - .probe10(axis_sTlb.tdata), // 128 - .probe11(axis_sTlb.tlast), - - .probe12(s_axis_sTlb.tvalid), - .probe13(s_axis_sTlb.tready), - .probe14(s_axis_sTlb.tdata), // 512 - .probe15(s_axis_sTlb.tlast), - .probe16(s_axis_sTlb.tkeep) // 64 - ); -*/ + axis_interconnect_tlb inst_mux_ltlb ( .ACLK(aclk), .ARESETN(aresetn), diff --git a/hw/hdl/network/rdma/rdma_flow.sv b/hw/hdl/network/rdma/rdma_flow.sv index 2776c06..bd5e8ef 100644 --- a/hw/hdl/network/rdma/rdma_flow.sv +++ b/hw/hdl/network/rdma/rdma_flow.sv @@ -40,29 +40,6 @@ logic [PID_BITS-1:0] req_pid; metaIntf #(.STYPE(rdma_ack_t)) ack_que_in (); -ila_req inst_ila_req ( - .clk(aclk), - .probe0(s_req.valid), - .probe1(s_req.ready), - .probe2(s_req.data), // 512 - .probe3(m_req.valid), - .probe4(m_req.ready), - .probe5(head_C[0][0]), // 4 - .probe6(tail_C[0][0]), // 4 - .probe7(issued_C[0][0]), - .probe8(ssn_rd_C), - .probe9(ack_vfid_C[0]), - .probe10(ack_pid_C[0]), // 6 - .probe11(ack_rd_C), - .probe12(s_ack.valid), - .probe13(s_ack.ready), - .probe14(s_ack.data), // 40 - .probe15(ssn_wr), // 4 - .probe16(ssn_addr), // 12 - .probe17(ssn_in), // 32 - .probe18(ssn_out) // 32 -); - ram_sp_nc #( .ADDR_BITS(1+N_REGIONS_BITS+PID_BITS+RDMA_OST_BITS), .DATA_BITS(32) @@ -111,15 +88,11 @@ always_comb begin ssn_wr = 0; ssn_addr = 0; - ssn_in = {s_req.data.cmplt, s_req.data.last, s_req.data.ssn}; + ssn_in = {6'd0, s_req.data.cmplt, s_req.data.last, s_req.data.ssn}; s_ack.ready = 1'b0; s_req.ready = 1'b0; - m_req.valid = s_req.valid & s_req.ready; - m_req.data = s_req.data; - m_req.data.offs = head_C[req_rd][req_vfid][req_pid]; - if(s_ack.valid) begin // Service ack s_ack.ready = 1'b1; @@ -130,7 +103,7 @@ always_comb begin end ssn_rd_N = 1'b1; - ssn_addr = {ack_rd, ack_vfid, ack_pid, tail_C[ack_rd][ack_vfid][ack_pid]}; + ssn_addr = {ack_rd, ack_vfid[N_REGIONS_BITS-1:0], ack_pid, tail_C[ack_rd][ack_vfid][ack_pid]}; ack_vfid_N = ack_vfid; ack_pid_N = ack_pid; @@ -145,11 +118,18 @@ always_comb begin issued_N[req_rd][req_vfid][req_pid] = 1'b1; ssn_wr = ~0; - ssn_addr = {req_rd, req_vfid, req_pid, head_C[req_rd][req_vfid][req_pid]}; + ssn_addr = {req_rd, req_vfid[N_REGIONS_BITS-1:0], req_pid, head_C[req_rd][req_vfid][req_pid]}; end end end +always_comb begin + m_req.valid = s_req.valid & s_req.ready; + + m_req.data = s_req.data; + m_req.data.offs = head_C[req_rd][req_vfid][req_pid]; +end + // DP assign ack_que_in.valid = ssn_rd_C && ssn_out[RDMA_MSN_BITS]; assign ack_que_in.data.rd = ack_rd_C; @@ -177,4 +157,29 @@ assign req_rd = s_req.data.opcode == RC_RDMA_READ_REQUEST; assign req_pid = s_req.data.qpn[0+:PID_BITS]; assign req_vfid = s_req.data.qpn[PID_BITS+:N_REGIONS_BITS]; +/* +ila_req inst_ila_req ( + .clk(aclk), + .probe0(s_req.valid), + .probe1(s_req.ready), + .probe2(s_req.data), // 512 + .probe3(m_req.valid), + .probe4(m_req.ready), + .probe5(head_C[0][0][0]), // 4 + .probe6(tail_C[0][0][0]), // 4 + .probe7(issued_C[0][0][0]), + .probe8(ssn_rd_C), + .probe9(ack_vfid_C[0]), + .probe10(ack_pid_C[0]), // 6 + .probe11(ack_rd_C), + .probe12(s_ack.valid), + .probe13(s_ack.ready), + .probe14(s_ack.data), // 40 + .probe15(ssn_wr), // 4 + .probe16(ssn_addr), // 12 + .probe17(ssn_in), // 32 + .probe18(ssn_out) // 32 +); +*/ + endmodule \ No newline at end of file diff --git a/hw/hdl/network/rdma/rdma_req_parser.sv b/hw/hdl/network/rdma/rdma_req_parser.sv index edd93ad..7517d72 100644 --- a/hw/hdl/network/rdma/rdma_req_parser.sv +++ b/hw/hdl/network/rdma/rdma_req_parser.sv @@ -82,31 +82,6 @@ logic [RDMA_LEN_BITS-1:0] plen_C, plen_N; metaIntf #(.STYPE(rdma_req_t)) req_pre_parsed (); metaIntf #(.STYPE(rdma_req_t)) req_parsed (); -ila_req_parser inst_ila_parser ( - .clk(aclk), - .probe0(s_req.valid), - .probe1(s_req.ready), - .probe2(m_req.valid), - .probe3(m_req.ready), - .probe4(state_C), // 4 - .probe5(qp_C), // 10 - .probe6(host_C), - .probe7(mode_C), - .probe8(cmplt_C), - .probe9(ssn_C), // 24 - .probe10(params_C), // 256 - .probe11(op_C), // 5 - .probe12(last_C), - .probe13(lvaddr_C[47:0]), // 48 - .probe14(rvaddr_C[47:0]), // 48 - .probe15(len_C), // 28 - .probe16(pop_C), // 5 - .probe17(plast_C), - .probe18(plvaddr_C[47:0]), // 48 - .probe19(prvaddr_C[47:0]), // 48 - .probe20(plen_C) // 32 -); - // Decoupling `META_ASSIGN(s_req, req_pre_parsed) diff --git a/hw/hdl/network/rdma/roce_stack.sv b/hw/hdl/network/rdma/roce_stack.sv index 75fc899..036f060 100644 --- a/hw/hdl/network/rdma/roce_stack.sv +++ b/hw/hdl/network/rdma/roce_stack.sv @@ -144,7 +144,7 @@ assign rdma_ack.data.rd = ack_meta_data[0]; assign rdma_ack.data.cmplt = 1'b0; assign rdma_ack.data.pid = ack_meta_data[1+:PID_BITS]; assign rdma_ack.data.vfid = ack_meta_data[1+PID_BITS+:N_REGIONS_BITS]; -assign rdma_ack.data.ssn = ack_meta_data[1+RDMA_ACK_QPN_BITS+:RDMA_ACK_PSN_BITS]; +assign rdma_ack.data.ssn = ack_meta_data[1+RDMA_ACK_QPN_BITS+:RDMA_ACK_PSN_BITS]; // msn // Flow control rdma_flow inst_rdma_flow ( @@ -318,4 +318,32 @@ rocev2_ip rocev2_inst( `endif ); +/* +ila_ack inst_ila_ack ( + .clk(nclk), + .probe0(rdma_ack.valid), + .probe1(rdma_ack.ready), + .probe2(rdma_ack.data), // 36 + .probe3(rdma_sq.valid), + .probe4(rdma_sq.ready), + .probe5(rdma_sq.data), // 512 + .probe6(s_axis_rx.tvalid), + .probe7(s_axis_rx.tready), + .probe8(s_axis_rx.tlast), + .probe9(m_axis_tx.tvalid), + .probe10(m_axis_tx.tready), + .probe11(m_axis_tx.tlast), + .probe12(m_rdma_wr_req.valid), + .probe13(m_rdma_wr_req.ready), + .probe14(m_rdma_rd_req.valid), + .probe15(m_rdma_rd_req.ready), + .probe16(m_axis_rdma_wr.tvalid), + .probe17(m_axis_rdma_wr.tready), + .probe18(m_axis_rdma_wr.tlast), + .probe19(s_axis_rdma_rd.tvalid), + .probe20(s_axis_rdma_rd.tready), + .probe21(s_axis_rdma_rd.tlast) +); +*/ + endmodule \ No newline at end of file diff --git a/hw/hdl/network/stack/network_bp_drop.sv b/hw/hdl/network/stack/network_bp_drop.sv index ec61c7d..13ef1b7 100644 --- a/hw/hdl/network/stack/network_bp_drop.sv +++ b/hw/hdl/network/stack/network_bp_drop.sv @@ -109,54 +109,5 @@ end // Slices (RX and TX) axis_reg_array #(.N_STAGES(N_STGS)) inst_rx (.aclk(aclk), .aresetn(aresetn), .s_axis(rx_axis), .m_axis(m_rx_axis)); axis_reg_array #(.N_STAGES(N_STGS)) inst_tx (.aclk(aclk), .aresetn(aresetn), .s_axis(s_tx_axis), .m_axis(m_tx_axis)); - -/* -logic [31:0] cnt_data_s; -logic [31:0] cnt_data_s_n4k_rx; -logic [31:0] cnt_data_m; -logic [31:0] cnt_data_m_n4k_rx; - -always_ff @(posedge aclk) begin - if(~aresetn) begin - cnt_data_s_n4k_rx <= 0; - cnt_data_s <= 0; - cnt_data_m_n4k_rx <= 0; - cnt_data_m <= 0; - end - else begin - cnt_data_s <= (s_rx_axis.tvalid & s_rx_axis.tready & s_rx_axis.tlast) ? - 0 : (s_rx_axis.tvalid & s_rx_axis.tready ? cnt_data_s + 1 : cnt_data_s); - cnt_data_s_n4k_rx <= (s_rx_axis.tvalid & s_rx_axis.tready & s_rx_axis.tlast) && (cnt_data_s != 64) ? cnt_data_s_n4k_rx + 1 : cnt_data_s_n4k_rx; - - cnt_data_m <= (rx_axis.tvalid & rx_axis.tready & rx_axis.tlast) ? - 0 : (rx_axis.tvalid & rx_axis.tready ? cnt_data_m + 1 : cnt_data_m); - cnt_data_m_n4k_rx <= (rx_axis.tvalid & rx_axis.tready & rx_axis.tlast) && (cnt_data_m != 64) ? cnt_data_m_n4k_rx + 1 : cnt_data_m_n4k_rx; - end -end - -ila_nstack inst_ila_nstack ( - .clk(aclk), - .probe0(s_rx_axis.tvalid), - .probe1(s_rx_axis.tready), - .probe2(s_rx_axis.tdata), // 512 - .probe3(s_rx_axis.tlast), - .probe4(s_rx_axis.tkeep), // 64 - - .probe5(rx_axis.tvalid), - .probe6(rx_axis.tready), - .probe7(rx_axis.tdata), // 512 - .probe8(rx_axis.tlast), - .probe9(rx_axis.tkeep), // 64 - - .probe10(cnt_data_s), // 32 - .probe11(cnt_data_m), // 32 - .probe12(cnt_data_s_n4k_rx), // 32 - .probe13(cnt_data_m_n4k_rx), // 32 - - .probe14(prog_full), - .probe15(wr_cnt), // 32 - .probe16(state_C) // 2 -); -*/ endmodule \ No newline at end of file diff --git a/hw/hdl/slave/cnfg_slave.sv b/hw/hdl/slave/cnfg_slave.sv index 367ac92..9369ba8 100644 --- a/hw/hdl/slave/cnfg_slave.sv +++ b/hw/hdl/slave/cnfg_slave.sv @@ -649,9 +649,9 @@ always_ff @(posedge aclk) begin RDMA_0_STAT_POSTED_REG: // Posts axi_rdata[31:0] <= slv_reg[RDMA_0_STAT_POSTED_REG][31:0]; RDMA_0_CMPLT_REG: begin - axi_rdata[0] <= cmplt_que_rdma_0_out.data.ssn; + axi_rdata[0] <= cmplt_que_rdma_0_out.data.valid; axi_rdata[RDMA_CMPLT_PID_OFFS+:PID_BITS] <= cmplt_que_rdma_0_out.data.pid; - axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_0_out.valid; + axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_0_out.ssn; end `endif @@ -677,9 +677,9 @@ always_ff @(posedge aclk) begin RDMA_1_STAT_POSTED_REG: // Posts axi_rdata[31:0] <= slv_reg[RDMA_1_STAT_POSTED_REG][31:0]; RDMA_1_CMPLT_REG: begin - axi_rdata[0] <= cmplt_que_rdma_1_out.data.ssn; + axi_rdata[0] <= cmplt_que_rdma_1_out.data.valid; axi_rdata[RDMA_CMPLT_PID_OFFS+:PID_BITS] <= cmplt_que_rdma_1_out.data.pid; - axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_1_out.valid; + axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_1_out.ssn; end `endif diff --git a/hw/hdl/slave/cnfg_slave_avx.sv b/hw/hdl/slave/cnfg_slave_avx.sv index ff57d71..b6243c0 100644 --- a/hw/hdl/slave/cnfg_slave_avx.sv +++ b/hw/hdl/slave/cnfg_slave_avx.sv @@ -494,9 +494,9 @@ always_ff @(posedge aclk) begin [RDMA_0_STAT_REG:RDMA_0_STAT_REG]: axi_rdata[63:0] <= {slv_reg[RDMA_0_STAT_REG][RDMA_POSTED_OFFS+:32], rdma_0_queue_used[31:0]}; [RDMA_0_CMPLT_REG:RDMA_0_CMPLT_REG]: begin - axi_rdata[0] <= cmplt_que_rdma_0_out.data.ssn; + axi_rdata[0] <= cmplt_que_rdma_0_out.data.valid; axi_rdata[RDMA_CMPLT_PID_OFFS+:PID_BITS] <= cmplt_que_rdma_0_out.data.pid; - axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_0_out.valid; + axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_0_out.ssn; end `endif @@ -508,9 +508,9 @@ always_ff @(posedge aclk) begin [RDMA_1_STAT_REG:RDMA_1_STAT_REG]: axi_rdata[63:0] <= {slv_reg[RDMA_1_STAT_REG][RDMA_POSTED_OFFS+:32], rdma_1_queue_used[31:0]}; [RDMA_1_CMPLT_REG:RDMA_1_CMPLT_REG]: begin - axi_rdata[0] <= cmplt_que_rdma_1_out.data.ssn; + axi_rdata[0] <= cmplt_que_rdma_1_out.data.valid; axi_rdata[RDMA_CMPLT_PID_OFFS+:PID_BITS] <= cmplt_que_rdma_1_out.data.pid; - axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_1_out.valid; + axi_rdata[RDMA_CMPLT_SSN_OFFS+:RDMA_MSN_BITS] <= cmplt_que_rdma_1_out.ssn; end `endif @@ -953,7 +953,7 @@ metaIntf #(.STYPE(rdma_req_t)) rdma_0_sq(); assign rdma_0_sq_cnfg.data.opcode = slv_reg[RDMA_0_POST_REG][1+:RDMA_OPCODE_BITS]; // opcode assign rdma_0_sq_cnfg.data.qpn[0+:PID_BITS] = slv_reg[RDMA_0_POST_REG][1+RDMA_OPCODE_BITS+:PID_BITS]; // local cpid assign rdma_0_sq_cnfg.data.qpn[PID_BITS+:DEST_BITS] = ID_REG; // local region -assign rdma_0_sq_cnfg.data.host = slv_reg[RDMA_0_POST_REG][1+RDMA_OPCODE_BITS+PID_BITS+DEST_BITS]; // host +assign rdma_0_sq_cnfg.data.host = 1'b1;//slv_reg[RDMA_0_POST_REG][1+RDMA_OPCODE_BITS+PID_BITS+DEST_BITS]; // host assign rdma_0_sq_cnfg.data.mode = RDMA_MODE_PARSE; // mode assign rdma_0_sq_cnfg.data.last = 1'b1; assign rdma_0_sq_cnfg.data.cmplt = slv_reg[RDMA_0_POST_REG][1+RDMA_OPCODE_BITS+PID_BITS+DEST_BITS+3]; @@ -1097,7 +1097,7 @@ metaIntf #(.STYPE(rdma_req_t)) rdma_1_sq(); assign rdma_1_sq_cnfg.data.opcode = slv_reg[RDMA_1_POST_REG][1+:RDMA_OPCODE_BITS]; // opcode assign rdma_1_sq_cnfg.data.qpn[0+:PID_BITS] = slv_reg[RDMA_1_POST_REG][1+RDMA_OPCODE_BITS+:PID_BITS]; // local cpid assign rdma_1_sq_cnfg.data.qpn[PID_BITS+:DEST_BITS] = ID_REG; // local region -assign rdma_1_sq_cnfg.data.host = slv_reg[RDMA_1_POST_REG][1+RDMA_OPCODE_BITS+PID_BITS+DEST_BITS]; // host +assign rdma_1_sq_cnfg.data.host = 1'b1; //slv_reg[RDMA_1_POST_REG][1+RDMA_OPCODE_BITS+PID_BITS+DEST_BITS]; // host assign rdma_1_sq_cnfg.data.mode = RDMA_MODE_PARSE; // mode assign rdma_1_sq_cnfg.data.last = 1'b1; assign rdma_1_sq_cnfg.data.cmplt = slv_reg[RDMA_1_POST_REG][1+RDMA_OPCODE_BITS+PID_BITS+DEST_BITS+3]; diff --git a/hw/scripts/wr_hdl/template_gen/lynx_pkg.txt b/hw/scripts/wr_hdl/template_gen/lynx_pkg.txt index 5c3517b..cffb75b 100644 --- a/hw/scripts/wr_hdl/template_gen/lynx_pkg.txt +++ b/hw/scripts/wr_hdl/template_gen/lynx_pkg.txt @@ -221,11 +221,11 @@ package lynxTypes; parameter integer RDMA_BASE_REQ_BITS = 96; parameter integer RDMA_VADDR_BITS = 64; parameter integer RDMA_LEN_BITS = 32; - parameter integer RDMA_REQ_BITS = 544; + parameter integer RDMA_REQ_BITS = 512; parameter integer RDMA_OPCODE_BITS = 5; parameter integer RDMA_QPN_BITS = 10; - parameter integer RDMA_PARAMS_BITS = 352; - parameter integer RDMA_MSG_BITS = 512; + parameter integer RDMA_PARAMS_BITS = 288; + parameter integer RDMA_MSG_BITS = 448; parameter integer RDMA_QP_INTF_BITS = 168; parameter integer RDMA_QP_CONN_BITS = 184; parameter integer RDMA_LVADDR_OFFS = 0; diff --git a/hw/services/network/hls/rocev2/rocev2_config.hpp.in b/hw/services/network/hls/rocev2/rocev2_config.hpp.in index 4553dc1..c85131e 100644 --- a/hw/services/network/hls/rocev2/rocev2_config.hpp.in +++ b/hw/services/network/hls/rocev2/rocev2_config.hpp.in @@ -1,9 +1,7 @@ #pragma once #include -//#if ${RETRANS_EN} - #define RETRANS_EN -//#endif +#define RETRANS_EN const unsigned DATA_WIDTH = ${DATA_WIDTH} * 8; diff --git a/sw/examples/perf_rdma/main.cpp b/sw/examples/perf_rdma/main.cpp index c5a7e57..972c66d 100644 --- a/sw/examples/perf_rdma/main.cpp +++ b/sw/examples/perf_rdma/main.cpp @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) ("tcpaddr,t", boost::program_options::value(), "TCP conn IP") ("benchruns,b", boost::program_options::value(), "Number of bench runs") ("repst,r", boost::program_options::value(), "Number of throughput repetitions within a run") - ("repsl,r", boost::program_options::value(), "Number of latency repetitions within a run") + ("repsl,l", boost::program_options::value(), "Number of latency repetitions within a run") ("mins,n", boost::program_options::value(), "Minimum transfer size") ("maxs,x", boost::program_options::value(), "Maximum transfer size") ("oper,w", boost::program_options::value(), "Read or Write"); @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) std::cout << "Max size: " << max_size << std::endl; std::cout << "Number of throughput reps: " << n_reps_thr << std::endl; std::cout << "Number of latency reps: " << n_reps_lat << std::endl; - + // Create queue pairs ibvQpMap ictx; ictx.addQpair(qpId, targetRegion, ibv_ip, n_pages); @@ -180,6 +180,8 @@ int main(int argc, char *argv[]) std::cout << std::fixed << std::setprecision(2); std::cout << std::setw(8) << sg.type.rdma.len << " [bytes], thoughput: " << std::setw(8) << ((1 + oper) * ((1000 * sg.type.rdma.len))) / ((bench.getAvg()) / n_reps_thr) << " [MB/s], latency: "; + + std::cout << std::endl << std::endl << "ACKs: " << cproc->ibvCheckAcks() << std::endl; #endif // Reset diff --git a/sw/include/cDefs.hpp b/sw/include/cDefs.hpp index 4ad6a3e..58a6c02 100644 --- a/sw/include/cDefs.hpp +++ b/sw/include/cDefs.hpp @@ -159,6 +159,7 @@ enum class CnfgAvxRegs : uint32_t { RDMA_POST_REG_0 = 17, RDMA_POST_REG_1 = 18, RDMA_STAT_REG = 19, + RDMA_CMPLT_REG = 20, STAT_DMA_REG = 64 }; @@ -197,7 +198,9 @@ enum class CnfgLegRegs : uint32_t { RDMA_POST_REG_7 = 40, RDMA_STAT_CMD_USED_REG = 41, RDMA_STAT_POSTED_REG = 42, - STAT_DMA_REG = 64 + RDMA_CMPLT_REG = 43, + STAT_DMA_REG = 64, + STAT_RDMA_REG = 128, }; /** diff --git a/sw/include/cProcess.hpp b/sw/include/cProcess.hpp index baa8f1a..c475abf 100644 --- a/sw/include/cProcess.hpp +++ b/sw/include/cProcess.hpp @@ -198,6 +198,8 @@ public: * @brief Return the number of completed RDMA acks * */ + uint32_t ibvCheckAcks(); + int32_t ibvGetCompleted(int32_t &cpid); uint32_t checkIbvAcks(); void clearIbvAcks(); diff --git a/sw/src/cProcess.cpp b/sw/src/cProcess.cpp index 91e3727..dfacd0c 100644 --- a/sw/src/cProcess.cpp +++ b/sw/src/cProcess.cpp @@ -555,6 +555,50 @@ void cProcess::clearCompleted() { // Network // ======------------------------------------------------------------------------------- +/** + * @brief Check number of completed RDMA operations + * + * @param cpid - Coyote operation struct + * @return uint32_t - number of completed operations + */ +uint32_t cProcess::ibvCheckAcks() { + if(fcnfg.en_wb) { + return wback[cpid + ((fcnfg.qsfp ? 3 : 2) * nCpidMax)]; + } else { +#ifdef EN_AVX + if(fcnfg.en_avx) + return fcnfg.qsfp ? _mm256_extract_epi32(cnfg_reg_avx[static_cast(CnfgAvxRegs::STAT_DMA_REG) + cpid], 3) : + _mm256_extract_epi32(cnfg_reg_avx[static_cast(CnfgAvxRegs::STAT_DMA_REG) + cpid], 2); + else +#endif + return (fcnfg.qsfp ? (HIGH_32(cnfg_reg[static_cast(CnfgLegRegs::STAT_RDMA_REG) + cpid])) : + ( LOW_32(cnfg_reg[static_cast(CnfgLegRegs::STAT_RDMA_REG) + cpid]))); + } +} + +/** + * @brief Check completion queue + * + * @param cmplt_cpid - Coyote pid + * @return int32_t - ssn + */ +int32_t cProcess::ibvGetCompleted(int32_t &cpid) { + uint64_t cmplt_meta; +#ifdef EN_AVX + if(fcnfg.en_avx) + cmplt_meta = _mm256_extract_epi64(cnfg_reg_avx[static_cast(CnfgAvxRegs::RDMA_CMPLT_REG)], 0); + else +#endif + cmplt_meta = cnfg_reg[static_cast(CnfgLegRegs::RDMA_CMPLT_REG)]; + + if(cmplt_meta & 0x1) { + cpid = (cmplt_meta >> 16) & 0x3f; + return HIGH_32(cmplt_meta); + } else { + return -1; + } +} + /** * @brief Post an IB operation * @@ -697,7 +741,7 @@ void cProcess::clearIbvAcks() { */ void cProcess::postCmd(uint64_t offs_3, uint64_t offs_2, uint64_t offs_1, uint64_t offs_0) { // Lock - //dlock.lock(); + dlock.lock(); // Check outstanding while (rdma_cmd_cnt > (cmd_fifo_depth - cmd_fifo_thr)) { @@ -714,10 +758,8 @@ void cProcess::postCmd(uint64_t offs_3, uint64_t offs_2, uint64_t offs_1, uint64 // Send #ifdef EN_AVX if(fcnfg.en_avx) { - std::cout << "HERE FIRING" << std::endl; cnfg_reg_avx[static_cast(CnfgAvxRegs::RDMA_POST_REG) + fcnfg.qsfp_offs] = _mm256_set_epi64x(offs_3, offs_2, offs_1, offs_0); - std::cout << "HERE FIRING 2" << std::endl; // Inc rdma_cmd_cnt++; } else { @@ -736,7 +778,7 @@ void cProcess::postCmd(uint64_t offs_3, uint64_t offs_2, uint64_t offs_1, uint64 #endif // Unlock - //dlock.unlock(); + dlock.unlock(); } // ======------------------------------------------------------------------------------- diff --git a/sw/src/ibvQpConn.cpp b/sw/src/ibvQpConn.cpp index acacf4a..a2e83a1 100644 --- a/sw/src/ibvQpConn.cpp +++ b/sw/src/ibvQpConn.cpp @@ -101,7 +101,7 @@ void ibvQpConn::initLocalQueue(string ip_addr) { qpair->local.rkey = 0; // Allocate buffer - void *vaddr = fdev->getMem({CoyoteAlloc::HOST_2M, n_pages}); + void *vaddr = fdev->getMem({CoyoteAlloc::HUGE_2M, n_pages}); qpair->local.vaddr = (uint64_t) vaddr; qpair->local.size = n_pages * hugePageSize; }