Revert "Change the architecture of the code and add setup.py"

zjp-shadow 2022-07-27 12:09:18 +08:00 committed by GitHub
parent 0ab603e6a3
commit 7c56d95e6d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
31 changed files with 0 additions and 2132 deletions

.gitignore vendored
@@ -1,2 +0,0 @@
*.ipynb
__pycache__

@@ -1,11 +1,3 @@
# JSparse
JSparse is a high-performance auto-differentiation library for sparse tensors.
## How to Install
```
cd python
python setup.py install
```
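For reference, a minimal end-to-end sketch assembled from the example script shipped in this commit (the `SparseTensor(coords, feats, 1, 1)` call and the `spnn.Conv3d` signature are taken from that script; treat it as illustrative rather than a stable API):
```
import numpy as np
import jittor as jt
from JSparse import SparseTensor
from JSparse import nn as spnn

# Random point cloud; by convention column 0 of the indices is the batch index.
coords = np.random.uniform(0, 100, size=(1000, 4))
coords[:, 0] = 0                       # single sample
feats = np.random.randn(1000, 4)

x = SparseTensor(jt.Var(coords), jt.Var(feats), 1, 1)   # stride=1, size=1, as in the example
conv = spnn.Conv3d(4, 32, kernel_size=3)
y = conv(x)                            # y.values has 32 channels per active site
```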

@@ -1,73 +0,0 @@
import jittor
from jittor import nn
import JSparse
from JSparse import SparseTensor
from JSparse import nn as spnn
from JSparse.utils.quantize import sparse_quantize
import numpy as np
class RandomDataset(jittor.dataset.Dataset):
def __init__(self, input_size: int, voxel_size: float) -> None:
super().__init__()
self.set_attrs(total_len = input_size)
self.voxel_size = voxel_size
def __getitem__(self, _: int):
inputs = np.random.uniform(-100, 100, size=(self.total_len, 4))
labels = np.random.choice(10, size=self.total_len)
coords, feats = inputs[:, :], inputs
coords -= np.min(coords, axis=0, keepdims=True)
# coords, indices = sparse_quantize(coords, self.voxel_size, return_index=True)
coords = jittor.Var(coords)
feats = jittor.Var(feats)
labels = jittor.Var(labels)
# coords = jittor.Var(coords, dtype=jittor.int64)
# feats = jittor.Var(feats[indices], dtype=jittor.float64)
# labels = jittor.Var(labels[indices], dtype=jittor.int64)
print(type(coords))
inputs = SparseTensor(coords, feats, 1, 1)
labels = SparseTensor(coords, labels, 1, 1)
return inputs, labels
if __name__ == '__main__':
np.random.seed(0)
dataset = RandomDataset(input_size=10000, voxel_size=0.2)
model = nn.Sequential(
spnn.Conv3d(4, 32, 3),
spnn.BatchNorm(32),
spnn.ReLU(True),
spnn.Conv3d(32, 64, 2, stride=2),
spnn.BatchNorm(64),
spnn.ReLU(True),
spnn.Conv3d(64, 64, 2, stride=2, transposed=True),
spnn.BatchNorm(64),
spnn.ReLU(True),
spnn.Conv3d(64, 32, 3),
spnn.BatchNorm(32),
spnn.ReLU(True),
spnn.Conv3d(32, 10, 1),
)
criterion = nn.CrossEntropyLoss()
optimizer = jittor.optim.Adam(model.parameters(), lr=1e-3)
model.train()
lens = len(dataset)
for batch_idx, (inputs, labels) in enumerate(dataset):
outputs = model(inputs)
loss = criterion(outputs, labels)
        optimizer.step(loss)
if batch_idx % 10 == 0:
print('Training: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
batch_idx, lens , 100. * batch_idx / lens, loss.numpy()[0]))

@@ -1 +0,0 @@
from .sparse import *

@@ -1,34 +0,0 @@
import os
import numpy as np
from typing import Union, List, Tuple
import jittor as jt
from jittor import Function
class IndiceManager:
def __init__(
self,
ndim,
# indice_map_type,
# sparse_alorithm, # set m_hashtable_occupancy for concurrent_unordered_map
):
# if indice_map_type == 'GPU':
# assert(jt.has_cuda)
self.ndim = ndim
# self.indice_map_type = indice_map_type
# self.sparse_algorithm = sparse_alorithm
self.stride_key_manager = {}
self.indice_map_manager = {}
self.kernel_map_manager = {}
def insert(self, stride, indice_key, indice_hash):
self.stride_key_manager[stride] = indice_key
self.indice_map_manager[indice_key] = indice_hash
# class IndiceMapManager

@@ -1 +0,0 @@
from .modules import *

@@ -1,10 +0,0 @@
from .activation import *
from .conv import *
from .count import *
# from .crop import *
from .devoxelize import *
from .downsample import *
from .hash import *
from .pooling import *
from .query import *
from .voxelize import *

@@ -1,24 +0,0 @@
import jittor as jt
import jittor.nn as nn
from JSparse import SparseTensor
from JSparse.nn.utils import fapply
__all__ = ['relu', 'leaky_relu']
# __all__ = ['relu', 'leaky_relu', 'ReLU', 'LeakyReLU']
def relu(input: SparseTensor) -> SparseTensor:
return fapply(input, nn.relu)
def leaky_relu(input: SparseTensor,
scale: float = 0.01) -> SparseTensor:
return fapply(input,
nn.leaky_relu,
scale=scale)
# Relu = jt.make_module(relu)
# ReLU = Relu
# Leaky_relu = jt.make_module(leaky_relu, 2)
# LeakyReLU = Leaky_relu

@@ -1,351 +0,0 @@
from typing import List, Optional, Tuple, Union
import jittor as jt
from jittor import Function
from jittor.misc import _pair, _triple
from JSparse import SparseTensor
from JSparse.nn import functional as F
from JSparse.nn.utils import get_kernel_offsets
from JSparse import make_ntuple
__all__ = ['conv3d', 'Convolution']
class Convolution(Function):
def execute(
self,
input: jt.Var,
weight: jt.Var,
nbmaps: jt.Var,
nbsizes: jt.Var,
sizes: Tuple[int, int],
transposed: bool = False,
) -> jt.Var:
if not transposed:
output = jt.zeros((sizes[1], weight.size(-1)))
else:
output = jt.zeros((sizes[0], weight.size(-1)))
assert input.size(1) == weight.size(1)
in_size = input.size(0)
in_channels = input.size(1)
out_size = output.size(0)
out_channels = output.size(1)
kernel_volume = weight.size(0)
flag = False
mid = kernel_volume // 2
in_buffer_size = jt.Var(1)
if kernel_volume % 2 and out_size == in_size:
flag = True
in_buffer_size = max(in_buffer_size, jt.max(nbsizes[:mid]))
in_buffer_size = max(in_buffer_size, jt.max(nbsizes[mid + 1:]))
output = jt.matmul(input, weight[mid, :, :])
else:
in_buffer_size = jt.max(nbsizes)
# in_buffer_activated : in_buffer_size * in_channels
# weight : in_channels * out_channels
# out_buffer_activated: in_buffer_size * out_channels
# out_buffer_activated = in_buffer_activated * weight
in_buffer_activated = jt.zeros(in_buffer_size.tolist() + [in_channels])
cur_offset = jt.Var(0)
for i in range(kernel_volume):
n_active_feats = nbsizes[i]
if (flag and (i == mid)):
cur_offset += n_active_feats
continue
if n_active_feats == 0:
continue
t = 1 if transposed else 0
################
## gather
################
# print(n_active_feats, in_channels, cur_offset, t)
gather(n_active_feats, in_channels, cur_offset, t, input, in_buffer_activated, nbmaps)
################
## matmul
################
out_buffer_activated = jt.matmul(in_buffer_activated, weight[i, :, :])
################
## scatter
################
scatter(n_active_feats, out_channels, cur_offset, t, out_buffer_activated, output, nbmaps)
cur_offset += n_active_feats
self.save_vars = input, weight, nbmaps, nbsizes, transposed
return output
def grad(
self,
grad_output: jt.Var
) -> Tuple[Optional[jt.Var], ...]:
input, weight, nbmaps, nbsizes, transposed = self.save_vars
grad_input = jt.zeros_like(input)
grad_weight = jt.zeros_like(weight)
# n_in_feats = input.size(0)
# n_out_feats = grad_output.size(0)
n_in_channels = input.size(1)
n_out_channels = weight.size(-1)
kernel_volume = weight.size(0)
flag = False
in_buffer_size = jt.max(nbsizes)
# out_grad_buffer_activated n_active_feats x n_out_channels
# in_grad_buffer_activated n_active_feats x n_in_channels
# in_buffer_activated n_active_feats x n_in_channels
out_grad_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_out_channels])
in_grad_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_in_channels])
in_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_in_channels])
cur_offset = jt.Var(0)
mid = kernel_volume // 2
for i in range(kernel_volume):
# kernel_grad_buffer = grad_weight[i, :, :]
n_active_feats = nbsizes[i]
# if flag and (i == mid):
# cur_offset += n_active_feats
# continue
if n_active_feats == 0:
continue
t = 1 if transposed else 0
################
## gather
################
gather(n_active_feats, n_out_channels, cur_offset, 1 - t, grad_output, out_grad_buffer_activated, nbmaps)
gather(n_active_feats, n_in_channels , cur_offset, t , input , in_buffer_activated , nbmaps)
################
## matmul
################
# grad for input
# in_grad_buffer_activated = out_grad_buffer_activated @ weight^T
# n_active_feats x n_in_channels n_active_feats x n_out_channels n_out_channels x n_in_channels
in_grad_buffer_activated = jt.nn.matmul_transpose(out_grad_buffer_activated, weight[i, :, :])
# grad for weight
# kernel_grad_buffer = in_buffer_activated^T @ out_grad_buffer_activated
# n_in_channels x n_out_channels n_in_channels x n_active_feats n_active_feats x n_out_channels
grad_weight[i, :, :] = jt.nn.matmul(in_buffer_activated.t(), out_grad_buffer_activated)
################
## scatter
################
scatter(n_active_feats, n_in_channels, cur_offset, 1 - t, in_grad_buffer_activated, grad_input, nbmaps)
cur_offset += n_active_feats
return grad_input, grad_weight, None, None, None, None
def gather(
n_active_feats,
channels,
cur_offset,
transpose,
in_feat,
out_feat,
kmap,
):
shape = n_active_feats.tolist() + cur_offset.tolist() + [channels, transpose, 0]
gather_args = jt.zeros(shape, dtype='int32')
return jt.code((0, ), out_feat.dtype, [in_feat, out_feat, kmap, gather_args],
cuda_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cuda_src="""
__global__ void gather_kernel(@ARGS_DEF) {
@PRECALC
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int i = idx / c;
int j = idx % c;
if (i >= n_k) return;
int in_pos = @kmap(st + i, transpose);
// if (in_pos < 0) return;
@out_feat(i, j) = @in_feat(in_pos, j);
}
gather_kernel<<< (out_feat_shape0 * out_feat_shape1 + 255) / 256, 256 >>>(@ARGS);
""",
cpu_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cpu_src="""
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
for (int i = 0; i < n_k; ++ i ) {
int in_pos = @kmap(st + i, transpose);
// if (in_pos < 0) {
// continue;
// }
#pragma omp parallel for
for (int j = 0; j < c; ++ j ) {
@out_feat(i, j) = @in_feat(in_pos, j);
// @out(i, j) = @in_feat(in_pos, j);
}
}
"""
).sync()
def scatter(
n_active_feats,
channels,
cur_offset,
transpose,
in_feat,
out_feat,
kmap,
):
shape = n_active_feats.tolist() + cur_offset.tolist() + [channels, transpose, 0]
scatter_args = jt.zeros(shape, dtype='int32')
return jt.code((0, ), out_feat.dtype, [in_feat, out_feat, kmap, scatter_args],
cuda_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cuda_src="""
__global__ void scatter_kernel(@ARGS_DEF) {
@PRECALC
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int i = idx / c;
int j = idx % c;
if (i >= n_k) return;
int out_pos = @kmap(st + i, 1 - transpose);
// if (out_pos < 0) return;
@out_feat(out_pos, j) += @in_feat(i, j);
}
scatter_kernel<<< (out_feat_shape0 * out_feat_shape1 + 255) / 256, 256 >>>(@ARGS);
""",
cpu_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cpu_src="""
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
for (int i = 0; i < n_k; ++ i ) {
int out_pos = @kmap(st + i, 1 - transpose);
// if (out_pos < 0) {
// continue;
// }
#pragma omp parallel for
for (int j = 0; j < c; ++ j ) {
@out_feat(out_pos, j) += @in_feat(i, j);
// @out(out_pos, j) += @in_feat(i, j);
}
}
"""
).sync()
def conv3d(
input: SparseTensor,
weight: jt.Var,
kernel_size: Union[int, Tuple[int, ...]],
bias: Optional[jt.Var] = None,
stride: Union[int, Tuple[int, ...]] = 1,
dilation: Union[int, Tuple[int, ...]] = 1,
group: int = 1,
transposed: bool = False,
) -> SparseTensor:
# kernel_size = make_ntuple(kernel_size, ndim=3)
# stride = make_ntuple(stride, ndim=3)
# dilation = make_ntuple(dilation, ndim=3)
kernel_size = _triple(kernel_size)
stride = _triple(stride)
dilation = _triple(dilation)
if (kernel_size == _triple(1) and stride == _triple(1) and dilation == _triple(1)):
output_stride = input.stride
output_indices = input.indices
output_values = input.values.matmul(weight)
elif not transposed:
output_stride = tuple(input.stride[k] * stride[k] for k in range(3))
if output_stride in input.cmaps:
output_indices = input.cmaps[output_stride]
elif all(stride[k] == 1 for k in range(3)):
output_indices = input.indices
else:
output_indices = F.spdownsample(
input.indices,
stride,
kernel_size,
input.stride,
)
if (input.stride, kernel_size, stride, dilation) not in input.kmaps:
offsets = get_kernel_offsets(
kernel_size,
stride=input.stride,
dilation=dilation,
)
references = F.sphash(input.indices) # (N,)
queries = F.sphash(output_indices, offsets) # (|K|, N)
results = F.spquery(queries, references) # (|K|, N)
nbsizes = jt.sum(results != -1, dim=1)
nbmaps = jt.nonzero(results != -1)
indices = nbmaps[:, 0] * results.size(1) + nbmaps[:, 1]
nbmaps[:, 0] = results.view(-1)[indices]
input.kmaps[(input.stride, kernel_size, stride, dilation)] = [
nbmaps, nbsizes, (input.indices.shape[0], output_indices.shape[0])
]
output_values = Convolution.apply(
input.values,
weight,
*input.kmaps[(input.stride, kernel_size, stride, dilation)],
transposed,
)
else:
output_stride = tuple(input.stride[k] // stride[k] for k in range(3))
output_indices = input.cmaps[output_stride]
output_values = Convolution.apply(
input.values,
weight,
*input.kmaps[(output_stride, kernel_size, stride, dilation)],
transposed,
)
if bias is not None:
output_values += bias
    # size has to be set
output = SparseTensor(
indices=output_indices,
values=output_values,
stride=output_stride,
size=input.size
)
output.cmaps = input.cmaps
output.cmaps.setdefault(output_stride, output_indices)
output.kmaps = input.kmaps
return output
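The `execute` path above is the usual gather-GEMM-scatter scheme for sparse convolution. A toy NumPy sketch of a single kernel-offset step, assuming (as in the non-transposed case above) that column 0 of `nbmaps` holds input row indices and column 1 the corresponding output rows:
```
import numpy as np

# One kernel offset: gather the active input rows, multiply by that
# offset's weight slice, scatter-add into the output rows.
in_feats  = np.random.randn(6, 4)              # N_in  x C_in
weight_k  = np.random.randn(4, 8)              # C_in  x C_out (weight[i, :, :])
out_feats = np.zeros((5, 8))                   # N_out x C_out
nbmap_k   = np.array([[0, 2], [3, 0], [5, 4]]) # (input_idx, output_idx) pairs

buf_in  = in_feats[nbmap_k[:, 0]]              # gather
buf_out = buf_in @ weight_k                    # matmul
np.add.at(out_feats, nbmap_k[:, 1], buf_out)   # scatter-add
```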

@@ -1,30 +0,0 @@
import jittor as jt
def spcount(idx_query: jt.Var, num: int) -> jt.Var:
return jt.code((num,), jt.int32, [idx_query],
cuda_src="""
__global__ void count_kernel(@ARGS_DEF) {
@PRECALC
@alias(idx_query, in0)
int i = blockDim.x * blockIdx.x + threadIdx.x;
int cur_idx = @idx_query(i);
if (i < idx_query_shape0 && cur_idx >= 0) {
atomicAdd(out_p + cur_idx, 1);
}
}
@alias(idx_query, in0)
count_kernel<<<(idx_query_shape0 + 511) / 512, 512>>>(@ARGS);
""",
cpu_src="""
@alias(idx_query, in0)
#pragma omp parallel for
for (int i = 0; i < idx_query_shape0; ++ i ) {
int cur_idx = @idx_query(i);
if (cur_idx < 0) {
continue;
}
#pragma omp atomic
@out(cur_idx) ++;
}
"""
)

@@ -1,166 +0,0 @@
import jittor as jt
from jittor import Function
from JSparse import SparseTensor
__all__ = ['calc_ti_weights', 'spdevoxelize']
def calc_ti_weights(
indices: jt.Var,
idx_query: jt.Var,
scale: float = 1
) -> jt.Var:
with jt.no_grad():
p = indices
if scale != 1:
pf = jt.floor(indices / scale) * scale
else:
pf = jt.floor(indices)
pc = pf + scale
x = p[:, 1].view(-1, 1)
y = p[:, 2].view(-1, 1)
z = p[:, 3].view(-1, 1)
xf = pf[:, 1].view(-1, 1).float()
yf = pf[:, 2].view(-1, 1).float()
zf = pf[:, 3].view(-1, 1).float()
xc = pc[:, 1].view(-1, 1).float()
yc = pc[:, 2].view(-1, 1).float()
zc = pc[:, 3].view(-1, 1).float()
w0 = (xc - x) * (yc - y) * (zc - z)
w1 = (xc - x) * (yc - y) * (z - zf)
w2 = (xc - x) * (y - yf) * (zc - z)
w3 = (xc - x) * (y - yf) * (z - zf)
w4 = (x - xf) * (yc - y) * (zc - z)
w5 = (x - xf) * (yc - y) * (z - zf)
w6 = (x - xf) * (y - yf) * (zc - z)
w7 = (x - xf) * (y - yf) * (z - zf)
w = jt.concat([w0, w1, w2, w3, w4, w5, w6, w7], dim=1).t()
if scale != 1:
w /= scale ** 3
w[idx_query == -1] = 0
w /= jt.sum(w, dim=0) + 1e-8
return w
class Devoxelize(Function):
def execute(
self,
values: jt.Var,
idx_query: jt.Var,
weights: jt.Var
) -> jt.Var:
# c = values_shape1
# N = idx_query_shape0
output = jt.code((idx_query.shape[0], values.shape[1]), jt.float32, [values, idx_query, weights],
cuda_src="""
__global__ void devoxelize_forward_kernel(@ARGS_DEF) {
@PRECALC
@alias(values, in0)
@alias(idx_query, in1)
@alias(weights, in2)
int index = blockIdx.x * blockDim.x + threadIdx.x;
int i = index / values_shape1;
int j = index % values_shape1;
if (i < idx_query_shape0) {
float cur_values = 0;
for (int k = 0; k < 8; ++ k ) {
int idx = @idx_query(i, k);
cur_values = (idx >= 0) ? @values(idx, j) : 0;
@out(i, j) += @weights(i, k) * cur_values;
}
}
}
devoxelize_forward_kernel<<<out_shape0, out_shape1>>>(@ARGS);
""",
cpu_src="""
@alias(values, in0)
@alias(idx_query, in1)
@alias(weights, in2)
#pragma omp parallel for
for (int i = 0; i < idx_query_shape0; ++ i ) {
for (int j = 0; j < values_shape1; ++ j ) {
float cur_values = 0;
for (int k = 0; k < 8; ++ k ) {
int idx = @idx_query(i, k);
cur_values = (idx >= 0) ? @values(idx, j) : 0;
#pragma omp atomic
@out(i ,j) += @weights(i, k) * cur_values;
}
}
}
"""
)
self.save_vars = (idx_query, weights, values.shape[0])
return output
def grad(self, grad_output: jt.Var):
idx_query, weights, input_size = self.save_vars
grad_values = jt.code((input_size, grad_output.shape[0]), jt.float, [idx_query, weights, grad_output],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
""",
cuda_src="""
__global__ void devoxelize_backward_kernel(@ARGS_DEF) {
@PRECALC
@alias(idx_query, in0)
@alias(weights, in1)
@alias(grad_output, in2)
int index = blockIdx.x * blockDim.x + threadIdx.x;
int c = grad_output_shape1;
int i = index / c;
int j = index % c;
if (i < grad_output_shape0) {
float cur_grad_output = @grad_output(i, j);
#pragma unroll
for (int k = 0; k < 8; ++ k ) {
int idx = @idx_query(i, k);
if (idx >= 0) {
atomicAdd(&@out(idx, j), @weights(i, k) * cur_grad_output);
}
}
}
}
@alias(grad_output, in2)
devoxelize_backward_kernel<<<grad_output_shape0, grad_output_shape1>>>(@ARGS);
""",
cpu_src="""
@alias(idx_query, in0)
@alias(weights, in1)
@alias(grad_output, in2)
for (int i = 0; i < grad_output_shape0; ++ i ) {
#pragma omp parallel for
for (int j = 0; j < grad_output_shape1; ++ j ) {
                float cur_grad_output = @grad_output(i, j);
                for (int k = 0; k < 8; ++ k ) {
                    int idx = @idx_query(i, k);
                    if (idx < 0) continue;  // skip missing neighbours (idx == -1) instead of writing out of bounds
                    #pragma omp atomic
                    @out(idx, j) += @weights(i, k) * cur_grad_output;
                }
}
}
}
"""
)
return grad_values, None, None
def spdevoxelize(
values: jt.Var,
idx_query: jt.Var,
weights: jt.Var
) -> jt.Var:
return Devoxelize.apply(values, idx_query, weights)
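`calc_ti_weights` produces the eight trilinear interpolation coefficients of each point with respect to its surrounding voxel corners, normalised to sum to (approximately) one. A small sanity check, a sketch using the import path the test script in this commit relies on:
```
import jittor as jt
from JSparse.nn import functional as F

# One point at (0.25, 0.5, 0.75) inside the unit voxel, with all eight
# neighbours marked as present (idx_query >= 0).
pts = jt.Var([[0.0, 0.25, 0.5, 0.75]])      # (batch, x, y, z)
idx_query = jt.zeros((8, 1), dtype='int32')
w = F.calc_ti_weights(pts, idx_query, scale=1)
print(w)                    # eight non-negative weights
print(jt.sum(w, dim=0))     # ~1
```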

@@ -1,51 +0,0 @@
from typing import Tuple, Union
import jittor as jt
from jittor.misc import _pair, _triple
from JSparse.nn.utils import get_kernel_offsets
from JSparse.utils import make_ntuple, trunc
__all__ = ['spdownsample']
def spdownsample(
indices: jt.Var,
stride: Union[int, Tuple[int, ...]] = 2,
kernel_size: Union[int, Tuple[int, ...]] = 2,
tensor_stride: Union[int, Tuple[int, ...]] = 1) -> jt.Var:
# stride = make_ntuple(stride, ndim=3)
# kernel_size = make_ntuple(kernel_size, ndim=3)
# tensor_stride = make_ntuple(tensor_stride, ndim=3)
kernel_size = _triple(kernel_size)
stride = _triple(stride)
tensor_stride = _triple(tensor_stride)
sample_stride = [stride[k] * tensor_stride[k] for k in range(3)]
sample_stride = jt.Var(sample_stride,
dtype='int32').unsqueeze(dim=0)
if all(stride[k] in [1, kernel_size[k]] for k in range(3)):
indices = indices.clone()
indices[:, 1:] = trunc(jt.divide(indices[:, 1:], sample_stride)) * sample_stride
else:
offsets = get_kernel_offsets(kernel_size,
tensor_stride)
kernel_volume = offsets.size(0)
indices_min = indices[:, :3].min(dim=0, keepdims=True)
b = indices[:, :1].repeat(1, kernel_volume)
x = indices[:, 1:].unsqueeze(dim=1).repeat(1, kernel_volume, 1) + offsets
indices = jt.cat([b.view(-1, 1), x.view(-1, 3)], dim=1)
# TODO: We need to also filter `indices` based on `indices_max`.
mask = (indices[:, 1:] % sample_stride == 0)
mask &= (indices[:, 1:] >= indices_min)
mask = jt.all(mask, dim=1)
indices = indices[mask]
    # we may have to unique the indices when we define the SparseTensor
# indices = indices[:, [3, 0, 1, 2]]
# indices = jt.unique(indices, dim=0)
# indices = indices[:, [1, 2, 3, 0]]
return indices
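When every `stride[k]` is either 1 or `kernel_size[k]`, downsampling reduces to snapping each coordinate down to a multiple of `stride * tensor_stride`. A plain NumPy illustration of that branch:
```
import numpy as np

sample_stride = np.array([2, 2, 2])              # stride * tensor_stride
coords = np.array([[0, 5, 3, 7]], dtype=float)   # (batch, x, y, z)
coords[:, 1:] = np.trunc(coords[:, 1:] / sample_stride) * sample_stride
print(coords)                                    # [[0. 4. 2. 6.]]
```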

@@ -1,204 +0,0 @@
from typing import Optional
import jittor as jt
__all__ = ['sphash']
def sphash(indices: jt.Var,
offsets: Optional[jt.Var] = None) -> jt.Var:
assert indices.dtype == jt.int, indices.dtype
assert indices.ndim == 2 and indices.shape[1] == 4, indices.shape
if offsets is None:
return jt.code((indices.shape[0],), jt.int64, [indices],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <vector>
@alias(indices, in0)
""",
cuda_src="""
__global__ static void hash_kernel(@ARGS_DEF) {
@PRECALC
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < indices_shape0) {
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)@indices(i, j);
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(i) = hash;
}
}
hash_kernel<<<ceil((double)indices_shape0 / 512), 512>>>(@ARGS);
""",
cpu_header="""
#include <vector>
@alias(indices, in0)
""",
cpu_src="""
#pragma omp parallel for
for (int i = 0; i < indices_shape0; ++ i ) {
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)@indices(i, j);
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(i) = hash;
}
""")
else:
assert offsets.dtype == jt.int32, offsets.dtype
assert offsets.ndim == 2 and offsets.shape[1] == 3, offsets.shape
return jt.code((offsets.shape[0], indices.shape[0]), jt.int64, [indices, offsets],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cmath>
#include <vector>
@alias(indices, in0)
@alias(offsets, in1)
""",
# cuda_src="""
# __global__ void kernel_hash_kernel(@ARGS_DEF){
# @PRECALC
# extern __shared__ int offsets_shared[];
# int ix = blockDim.x * blockIdx.x + threadIdx.x;
# int iy = blockIdx.y;
# // if (!threadIdx.x) {
# // for (int j = 0; j < 3; ++ j ) {
# // offsets_shared[iy * 3 + j] = @offsets(iy, j);
# // }
# // }
# // __syncthreads();
# if (!threadIdx.x) {
# for (int j = 0; j < 3; ++ j ) {
# offsets_shared[iy * 3 + j] = @offsets(iy, j);
# }
# }
# __syncthreads();
# if (ix < indices_shape0 && iy < offsets_shape0) {
# int cur_indices[4];
# for (int j = 1; j <= 3; ++ j ) {
# // cur_indices[j] = @indices(ix, j) + @offsets(iy, j - 1);
# cur_indices[j] = @indices(ix, j) + offsets_shared[iy * 3 + j - 1];
# }
# cur_indices[0] = @indices(ix, 0);
# uint64_t hash = 14695981039346656037UL;
# for (int j = 0; j < 4; ++ j ) {
# hash ^= (unsigned int)cur_indices[j];
# hash *= 1099511628211UL;
# }
# hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
# @out0(iy, ix) = hash;
# }
# }
# dim3 block(512);
# dim3 grid((indices_shape0 + block.x - 1) / block.x, offsets_shape0);
# kernel_hash_kernel<<< grid, block, offsets_shape0 * 3 * sizeof(offsets_type) >>>(@ARGS);
# """,
# cuda_src="""
# __global__ void kernel_hash_kernel(@ARGS_DEF){
# @PRECALC
# extern __shared__ int offsets_shared[];
# int ix = blockDim.x * blockIdx.x + threadIdx.x;
# int iy = blockDim.y * blockIdx.y + threadIdx.y;
# // if (!threadIdx.x) {
# // for (int j = 0; j < 3; ++ j ) {
# // offsets_shared[iy * 3 + j] = @offsets(iy, j);
# // }
# // }
# // __syncthreads();
# if (iy < indices_shape0 && ix < offsets_shape0) {
# int cur_indices[4];
# // for (int j = 1; j <= 3; ++ j ) {
# // cur_indices[j] = @indices(iy, j) + @offsets(ix, j - 1);
# // cur_indices[j] = @indices(iy, j) + offsets_shared[ix * 3 + j - 1];
# // }
# cur_indices[0] = @indices(iy, 0);
# cur_indices[1] = @indices(iy, 1) + @offsets(ix, 0);
# cur_indices[2] = @indices(iy, 2) + @offsets(ix, 1);
# cur_indices[3] = @indices(iy, 3) + @offsets(ix, 2);
# uint64_t hash = 14695981039346656037UL;
# for (int j = 0; j < 4; ++ j ) {
# hash ^= (unsigned int)cur_indices[j];
# hash *= 1099511628211UL;
# }
# hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
# @out0(ix, iy) = hash;
# }
# }
# dim3 block(16, 64);
# dim3 grid((offsets_shape0 + block.x - 1) / block.x), (indices_shape0 + block.y - 1) / block.y);
# kernel_hash_kernel<<< grid, block, offsets_shape0 * 3 * sizeof(offsets_type) >>>(@ARGS);
# """,
cuda_src="""
__global__ void kernel_hash_kernel(@ARGS_DEF){
@PRECALC
extern __shared__ int offsets_shared[];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int k = idx % offsets_shape0;
int i = idx / offsets_shape0;
if (i < indices_shape0) {
int cur_indices[4];
for (int j = 1; j <= 3; ++ j ) {
cur_indices[j] = @indices(i, j) + @offsets(k, j - 1);
}
cur_indices[0] = @indices(i, 0);
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)cur_indices[j];
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(k, i) = hash;
}
}
int thread_nums = 512;
kernel_hash_kernel<<<ceil((double)(indices_shape0 * offsets_shape0) / thread_nums), thread_nums, offsets_shape0 * 3 * sizeof(int)>>>(@ARGS);
""",
cpu_header="""
#include <vector>
@alias(indices, in0)
@alias(offsets, in1)
""",
cpu_src="""
auto K = offsets_shape0;
auto N = indices_shape0;
for (int k = 0; k < offsets_shape0; ++ k ) {
#pragma omp parallel for
for (int i = 0; i < indices_shape0; ++ i ) {
int cur_indices[4];
for (int j = 1; j <= 3; ++ j ) {
cur_indices[j] = @indices(i, j) + @offsets(k, j - 1);
}
cur_indices[0] = @indices(i, 0);
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)cur_indices[j];
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(k, i) = hash;
}
}
"""
)
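Both the CUDA and CPU kernels above apply the same FNV-1a-style hash to the four coordinate components. A pure-Python reference, intended only as a sketch for sanity-checking a few values against the kernel output:
```
def point_hash(coord4):
    # Mirrors the kernel loop: FNV-1a over the 4 int32 components,
    # followed by the final 60-bit fold.
    h = 14695981039346656037
    for c in coord4:
        h ^= c & 0xFFFFFFFF                        # (unsigned int) cast
        h = (h * 1099511628211) & 0xFFFFFFFFFFFFFFFF
    return (h >> 60) ^ (h & 0x0FFFFFFFFFFFFFFF)

print(point_hash([0, 1, 2, 3]))
```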

@@ -1,26 +0,0 @@
import jittor as jt
from JSparse import SparseTensor
__all__ = ['global_avg_pool', 'global_max_pool']
def global_avg_pool(inputs: SparseTensor) -> jt.Var:
batch_size = jt.max(inputs.indices[:, 0]).item() + 1
outputs = []
for k in range(batch_size):
input = inputs.values[inputs.indices[:, 0] == k]
output = jt.mean(input, dim=0)
outputs.append(output)
outputs = jt.stack(outputs, dim=0)
return outputs
def global_max_pool(inputs: SparseTensor) -> jt.Var:
batch_size = jt.max(inputs.indices[:, 0]).item() + 1
outputs = []
for k in range(batch_size):
input = inputs.values[inputs.indices[:, 0] == k]
output = jt.max(input, dim=0)[0]
outputs.append(output)
outputs = jt.stack(outputs, dim=0)
return outputs

@@ -1,454 +0,0 @@
import jittor as jt
import math
__all__ = ['spquery']
def spquery(queries: jt.Var,
references: jt.Var) -> jt.Var:
q_size = queries.size()
queries = queries.view(-1)
indices = jt.arange(len(references), dtype=jt.int64)
if jt.flags.use_cuda > 0:
n = references.shape[0]
nextPow2 = pow(2, math.ceil(math.log2(n)))
table_size = (4 * nextPow2) if (n < 2048) else (2 * nextPow2)
if table_size < 512:
table_size = 512
num_funcs = 3
key_buf = jt.zeros((table_size,), dtype='int64')
val_buf = jt.zeros((table_size,), dtype='int64')
key = jt.zeros((num_funcs * table_size,), dtype='int64')
val = jt.zeros((num_funcs * table_size,), dtype='int64')
output = jt.code(queries.shape, jt.int64, [queries, references, indices, key_buf, val_buf, key, val],
cuda_header="""
#include <cmath>
#include <iostream>
#include <vector>
#include <cstdint>
#include <chrono>
#include <cstdio>
#include <cstdlib>
/** Reserved value for indicating "empty". */
#define EMPTY_CELL (0)
/** Max rehashing depth, and error depth. */
#define MAX_DEPTH (100)
#define ERR_DEPTH (-1)
/** CUDA naive thread block size. */
#define BLOCK_SIZE (256)
/** CUDA multi-level thread block size = bucket size. */
#define BUCKET_SIZE (512)
__device__ static uint64_t atomicExch(uint64_t *addr, uint64_t val) {
return (uint64_t)atomicExch((unsigned long long int *)addr,
(unsigned long long int)val);
}
/** Struct of a hash function config. */
typedef struct {
int rv; // Randomized XOR value.
int ss; // Randomized shift filter start position.
} FuncConfig;
/** Hard code hash functions and all inline helper functions for CUDA kernels'
* use. */
inline __device__ int do_1st_hash(const uint64_t val, const int num_buckets) {
return val % num_buckets;
}
inline __device__ int do_2nd_hash(const uint64_t val,
const FuncConfig *const hash_func_configs,
const int func_idx, const int size) {
FuncConfig fc = hash_func_configs[func_idx];
return ((val ^ fc.rv) >> fc.ss) % size; // XOR function as 2nd-level hashing.
}
// trying to ignore EMPTY_CELL by adding 1 at make_data.
inline __device__ uint64_t fetch_val(const uint64_t data, const int pos_width) {
return data >> pos_width;
}
inline __device__ int fetch_func(const uint64_t data, const int pos_width) {
return data & ((0x1 << pos_width) - 1);
}
inline __device__ uint64_t make_data(const uint64_t val, const int func,
const int pos_width) {
return (val << pos_width) ^ func;
}
class CuckooHashTableCuda_Multi {
private:
const int _size;
const int _evict_bound;
const int _num_funcs;
const int _pos_width;
const int _num_buckets;
FuncConfig *_d_hash_func_configs;
/** Cuckoo hash function set. */
FuncConfig *_hash_func_configs;
/** Private operations. */
void gen_hash_funcs() {
// Calculate bit width of value range and table size.
int val_width = 8 * sizeof(uint64_t) - ceil(log2((double)_num_funcs));
int bucket_width = ceil(log2((double)_num_buckets));
int size_width = ceil(log2((double)BUCKET_SIZE));
// Generate randomized configurations.
for (int i = 0; i < _num_funcs; ++i) { // At index 0 is a dummy function.
if (val_width - bucket_width <= size_width)
_hash_func_configs[i] = {rand(), 0};
else {
_hash_func_configs[i] = {
rand(), rand() % (val_width - bucket_width - size_width + 1) +
bucket_width};
}
}
};
inline uint64_t fetch_val(const uint64_t data) { return data >> _pos_width; }
inline int fetch_func(const uint64_t data) {
return data & ((0x1 << _pos_width) - 1);
}
public:
CuckooHashTableCuda_Multi(const int size, const int evict_bound,
const int num_funcs)
: _size(size),
_evict_bound(evict_bound),
_num_funcs(num_funcs),
_pos_width(ceil(log2((double)_num_funcs))),
_num_buckets(ceil((double)_size / BUCKET_SIZE)) {
srand(time(NULL));
_d_hash_func_configs = NULL;
_hash_func_configs = NULL;
_hash_func_configs = new FuncConfig[num_funcs];
gen_hash_funcs();
cudaMalloc((void **)&_d_hash_func_configs, _num_funcs * sizeof(FuncConfig));
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
};
~CuckooHashTableCuda_Multi() {
if (_hash_func_configs != NULL) delete[] _hash_func_configs;
if (_d_hash_func_configs != NULL) cudaFree(_d_hash_func_configs);
};
int insert_vals(const uint64_t *const keys, const uint64_t *const vals,
uint64_t *d_key_buf, uint64_t *d_val_buf, uint64_t *d_key,
uint64_t *d_val, const int n);
void lookup_vals(const uint64_t *const keys, uint64_t *const results,
uint64_t *d_key, uint64_t *d_val, const int n);
};
__global__ void cuckooBucketKernel_Multi(
uint64_t *const key_buf, uint64_t *const val_buf, const int size,
const uint64_t *const keys, const uint64_t *const vals, const int n,
int *const counters, const int num_buckets);
__global__ void cuckooInsertKernel_Multi(
uint64_t *const key, uint64_t *const val, const uint64_t *const key_buf,
const uint64_t *const val_buf, const int size,
const FuncConfig *const hash_func_configs, const int num_funcs,
const int *const counters, const int num_buckets, const int evict_bound,
const int pos_width, int *const rehash_requests);
__global__ void cuckooLookupKernel_Multi(
const uint64_t *const keys, uint64_t *const results, const int n,
const uint64_t *const all_keys, const uint64_t *const all_vals,
const int size, const FuncConfig *const hash_func_configs,
const int num_funcs, const int num_buckets, const int pos_width);
__global__ void cuckooBucketKernel_Multi(
uint64_t *const key_buf, uint64_t *const val_buf, const int size,
const uint64_t *const keys, const uint64_t *const vals, const int n,
int *const counters, const int num_buckets) {
// Get thread index.
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// Only threads within range are active.
if (idx < n) {
// Do 1st-level hashing to get bucket id, then do atomic add to get index
// inside the bucket.
uint64_t key = keys[idx];
uint64_t val = vals[idx];
int bucket_num = do_1st_hash(key, num_buckets);
int bucket_ofs = atomicAdd(&counters[bucket_num], 1);
// Directly write the key into the table buffer.
if (bucket_ofs >= BUCKET_SIZE) {
            printf("%d/%d ERROR: bucket overflow! n=%d, bucket_num=%d/%d, key=%llu", bucket_ofs, BUCKET_SIZE, n, bucket_num, num_buckets, (unsigned long long)key);
} else {
key_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = key;
val_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = val;
}
}
}
__global__ void cuckooInsertKernel_Multi(
uint64_t *const key, uint64_t *const val, const uint64_t *const key_buf,
const uint64_t *const val_buf, const int size,
const FuncConfig *const hash_func_configs, const int num_funcs,
const int *const counters, const int num_buckets, const int evict_bound,
const int pos_width, int *const rehash_requests) {
// Create local cuckoo table in shared memory. Size passed in as the third
// kernel parameter.
extern __shared__ uint64_t local_key[];
for (int i = 0; i < num_funcs; ++i) {
local_key[i * BUCKET_SIZE + threadIdx.x] = EMPTY_CELL;
}
// might be useful
__syncthreads();
// Get thread index.
int idx = threadIdx.x + blockIdx.x * blockDim.x;
uint64_t cur_idx = idx;
// Only threads within local bucket range are active.
if (threadIdx.x < counters[blockIdx.x]) {
// Set initial conditions.
uint64_t cur_key = key_buf[cur_idx];
int cur_func = 0;
int evict_count = 0;
// Start the test-kick-and-reinsert loops.
do {
int pos = do_2nd_hash(cur_key, hash_func_configs, cur_func, BUCKET_SIZE);
uint64_t new_data = make_data(cur_idx + 1, cur_func, pos_width);
uint64_t old_idx =
atomicExch(&local_key[cur_func * BUCKET_SIZE + pos], new_data);
if (old_idx != EMPTY_CELL) {
cur_idx = fetch_val(old_idx, pos_width) - 1;
// potential overflow here. It seems that cur_idx < 0 is possible!
cur_key = key_buf[cur_idx];
cur_func = (fetch_func(old_idx, pos_width) + 1) % num_funcs;
evict_count++;
} else {
break;
}
} while (evict_count < num_funcs * evict_bound);
// If exceeds eviction bound, then needs rehashing.
if (evict_count >= num_funcs * evict_bound) {
atomicAdd(rehash_requests, 1);
}
}
// Every thread write its responsible local slot into the global data table.
__syncthreads();
for (int i = 0; i < num_funcs; ++i) {
uint64_t cur_idx = local_key[i * BUCKET_SIZE + threadIdx.x];
if (cur_idx == EMPTY_CELL) {
continue;
}
int cur_func = fetch_func(cur_idx, pos_width);
cur_idx = fetch_val(cur_idx, pos_width) - 1;
key[i * size + idx] = key_buf[cur_idx];
val[i * size + idx] = val_buf[cur_idx];
}
}
__global__ void cuckooLookupKernel_Multi(
const uint64_t *const keys, uint64_t *const results, const int n,
const uint64_t *const all_keys, const uint64_t *const all_vals,
const int size, const FuncConfig *const hash_func_configs,
const int num_funcs, const int num_buckets, const int pos_width) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// Only threads within range are active.
if (idx < n) {
uint64_t key = keys[idx];
int bucket_num = do_1st_hash(key, num_buckets);
for (int i = 0; i < num_funcs; ++i) {
int pos = bucket_num * BUCKET_SIZE +
do_2nd_hash(key, hash_func_configs, i, BUCKET_SIZE);
if (all_keys[i * size + pos] == key) {
results[idx] = all_vals[i * size + pos] + 1;
return;
}
}
// TODO(Haotian): should be a value that will not be encountered.
results[idx] = EMPTY_CELL;
}
}
void CuckooHashTableCuda_Multi::lookup_vals(const uint64_t *const keys,
uint64_t *d_key, uint64_t *d_val,
uint64_t *const results,
const int n) {
// Launch the lookup kernel.
cuckooLookupKernel_Multi<<<ceil((double)n / BUCKET_SIZE), BUCKET_SIZE>>>(
keys, results, n, d_key, d_val, _size, _d_hash_func_configs, _num_funcs,
_num_buckets, _pos_width);
}
int CuckooHashTableCuda_Multi::insert_vals(const uint64_t *const keys,
const uint64_t *const vals,
uint64_t *d_key_buf,
uint64_t *d_val_buf, uint64_t *d_key,
uint64_t *d_val, const int n) {
//
// Phase 1: Distribute keys into buckets.
//
// Allocate GPU memory.
int *d_counters = NULL;
cudaMalloc((void **)&d_counters, _num_buckets * sizeof(int));
cudaMemset(d_counters, 0, _num_buckets * sizeof(int));
// Invoke bucket kernel.
cuckooBucketKernel_Multi<<<ceil((double)n / BUCKET_SIZE), BUCKET_SIZE>>>(
d_key_buf, d_val_buf, _size, keys, vals, n, d_counters, _num_buckets);
//
// Phase 2: Local cuckoo hashing.
//
// Allocate GPU memory.
cudaDeviceSynchronize();
int *d_rehash_requests = NULL;
cudaMalloc((void **)&d_rehash_requests, sizeof(int));
// Copy values onto GPU memory.
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
// Invoke insert kernel. Passes shared memory table size by the third
// argument. Loops until no rehashing needed.
int rehash_count = 0;
do {
int rehash_requests = 0;
cudaMemset(d_rehash_requests, 0, sizeof(int));
cuckooInsertKernel_Multi<<<ceil((double)_size / BUCKET_SIZE), BUCKET_SIZE,
_num_funcs * BUCKET_SIZE * sizeof(uint64_t)>>>(
d_key, d_val, d_key_buf, d_val_buf, _size, _d_hash_func_configs,
_num_funcs, d_counters, _num_buckets, _evict_bound, _pos_width,
d_rehash_requests);
cudaMemcpy(&rehash_requests, d_rehash_requests, sizeof(int),
cudaMemcpyDeviceToHost);
if (rehash_requests == 0) {
break;
} else {
rehash_count++;
gen_hash_funcs();
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
}
} while (rehash_count < MAX_DEPTH);
cudaDeviceSynchronize();
// Free GPU resources.
if (d_counters != NULL) {
cudaFree(d_counters);
}
if (d_rehash_requests != NULL) {
cudaFree(d_rehash_requests);
}
return (rehash_count < MAX_DEPTH) ? rehash_count : ERR_DEPTH;
}
@alias(hash_query, in0)
@alias(hash_target, in1)
@alias(idx_target, in2)
@alias(key_buf, in3)
@alias(val_buf, in4)
@alias(key, in5)
@alias(val, in6)
""",
cuda_src="""
int n = hash_target_shape0;
int n1 = hash_query_shape0;
const int nextPow2 = pow(2, ceil(log2((double)n)));
            // When n is large, the hash values tend to be more evenly distributed and
// choosing table_size to be 2 * nextPow2 typically suffices. For smaller n,
// the effect of uneven distribution of hash values is more pronounced and
// hence we choose table_size to be 4 * nextPow2 to reduce the chance of
// bucket overflow.
int table_size = (n < 2048) ? 4 * nextPow2 : 2 * nextPow2;
if (table_size < 512) {
table_size = 512;
}
int num_funcs = 3;
CuckooHashTableCuda_Multi in_hash_table(table_size, 8 * ceil(log2((double)n)),
num_funcs);
in_hash_table.insert_vals((uint64_t *)(hash_target_p),
(uint64_t *)(idx_target_p),
(uint64_t *)(key_buf_p),
(uint64_t *)(val_buf_p),
(uint64_t *)(key_p),
(uint64_t *)(val_p), n);
in_hash_table.lookup_vals((uint64_t *)(hash_query_p),
(uint64_t *)(key_p),
(uint64_t *)(val_p),
(uint64_t *)(out_p), n1);
"""
)
else:
output = jt.code(queries.shape, jt.int64, [queries, references, indices],
cpu_header="""
#include <cmath>
#include <google/dense_hash_map>
#include <iostream>
#include <vector>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
@alias(hash_query, in0)
@alias(hash_target, in1)
@alias(idx_target, in2)
""",
cpu_src="""
int n = hash_target_shape0;
int n1 = hash_query_shape0;
google::dense_hash_map<int64_t, int64_t> hashmap; // Google Sparse Hash library
hashmap.set_empty_key(0);
for (int idx = 0; idx < n; ++ idx ) {
int64_t key = @hash_target(idx);
int64_t val = @idx_target(idx) + 1;
hashmap.insert(std::make_pair(key, val));
}
#pragma omp parallel for
for (int idx = 0; idx < n1; ++ idx ) {
int64_t key = @hash_query(idx);
auto iter = hashmap.find(key);
if (iter != hashmap.end()) {
@out(idx) = iter->second;
} else @out(idx) = 0;
}
"""
)
output = (output - 1).view(*q_size)
return output
# return indices
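`spquery` looks every query hash up among the reference hashes and returns the matching reference index, or -1 on a miss (a cuckoo hash table on GPU, `google::dense_hash_map` on CPU). A small usage sketch combining it with `sphash`; note the CPU path needs the dense_hash_map header referenced above:
```
import jittor as jt
from JSparse.nn import functional as F

# Two reference coordinates and two queries; the second query has no match.
ref = jt.int32([[0, 0, 0, 0], [0, 1, 1, 1]])
qry = jt.int32([[0, 1, 1, 1], [0, 2, 2, 2]])
idx = F.spquery(F.sphash(qry), F.sphash(ref))
print(idx)   # expected: [1, -1]
```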

@@ -1,114 +0,0 @@
import jittor as jt
from jittor import Function
from JSparse import SparseTensor
__all__ = ['spvoxelize']
class Voxelize(Function):
def execute(
self,
values: jt.Var,
idx_query: jt.Var,
counts: jt.Var
) -> jt.Var:
# N = values_shape0
# c = values_shape1
# N1 = counts_shape0
# out: N1 x c
output = jt.code((counts.shape[0], values.shape[1]), "float32", [values, idx_query, counts],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
""",
cuda_src="""
__global__ void voxelize_forward_kernel(@ARGS_DEF) {
@PRECALC
@alias(values, in0)
@alias(idx_query, in1)
@alias(counts, in2)
int index = blockDim.x * blockIdx.x + threadIdx.x;
int c = values_shape1;
int i = index / c;
int j = index % c;
if (i < values_shape0) {
int pos = @idx_query(i);
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) return;
atomicAdd(&@out(pos, j), @values(i, j) / (float)(@counts(pos)));
}
}
@alias(values, in0)
voxelize_forward_kernel<<< values_shape0, values_shape1 >>>(@ARGS);
""",
cpu_src="""
@alias(values, in0)
@alias(idx_query, in1)
@alias(counts, in2)
for (int i = 0; i < values_shape0; ++ i ) {
int pos = @idx_query(i);
                if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0)
                    continue;
#pragma omp parallel for
for (int j = 0; j < values_shape1; ++ j ) {
#pragma omp atomic
@out(pos, j) += @values(i, j) / (float)@counts(pos);
}
}
"""
)
self.save_vars = idx_query, counts, values.shape[0]
return output
def grad(self, grad_output: jt.Var):
idx_query, counts, input_size = self.save_vars
grad_values = jt.code((input_size, grad_output.shape[1]), jt.float32, [idx_query, counts, grad_output],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
""",
cuda_src="""
__global__ void voxelize_backward_kernel(@ARGS_DEF) {
@PRECALC
@alias(idx_query, in0)
@alias(counts, in1)
@alias(grad_output, in2)
int index = blockDim.x * blockIdx.x + threadIdx.x;
int i = index / grad_output_shape1;
int j = index % grad_output_shape1;
if (i < out_shape0) {
int pos = @idx_query(i);
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) return;
                    @out(i, j) = @grad_output(pos, j) / (float)@counts(pos);
}
}
voxelize_backward_kernel<<<out_shape0, out_shape1>>>(@ARGS);
""",
cpu_src="""
@alias(idx_query, in0)
@alias(counts, in1)
@alias(grad_output, in2)
for (int i = 0; i < out_shape0; ++ i ) {
int pos = @idx_query(i);
                if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) continue;
#pragma omp parallel for
for (int j = 0; j < grad_output_shape1; ++ j ) {
@out(i, j) = @grad_output(pos, j) / (float)@counts(pos);
}
}
"""
)
return grad_values, None, None
def spvoxelize(
values: jt.Var,
idx_query: jt.Var,
counts: jt.Var
) -> jt.Var:
return Voxelize.apply(values, idx_query, counts)
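`spvoxelize` scatter-averages point features into voxels: output row `p` is the mean of all values whose `idx_query` entry equals `p`. A tiny sketch:
```
import jittor as jt
from JSparse.nn import functional as F

values = jt.Var([[1.0], [3.0], [5.0]])   # three points, one channel
idx_query = jt.int32([0, 0, 1])          # points 0 and 1 share voxel 0
counts = jt.int32([2, 1])                # points per voxel
print(F.spvoxelize(values, idx_query, counts))   # [[2.], [5.]]
```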

@@ -1,6 +0,0 @@
from .activation import *
# from .bev import *
from .conv import *
# from .crop import *
from .norm import *
from .pooling import *

@@ -1,21 +0,0 @@
import jittor as jt
from jittor import nn
from JSparse import SparseTensor
from JSparse.nn.functional import relu, leaky_relu
# from nn.utils import fapply
__all__ = ['ReLU', 'LeakyReLU']
# class ReLU(nn.ReLU):
# def execute(self, input: SparseTensor) -> SparseTensor:
# return fapply(input, super().execute)
# class LeakyReLU(nn.LeakyReLU):
# def execute(self, input: SparseTensor) -> SparseTensor:
# return fapply(input, super().execute)
Relu = jt.make_module(relu)
ReLU = Relu
Leaky_relu = jt.make_module(leaky_relu, 2)
LeakyReLU = Leaky_relu

@@ -1,85 +0,0 @@
import math
from typing import List, Tuple, Union
import numpy as np
import jittor as jt
from jittor import nn
from jittor import init
from jittor.misc import _pair, _triple
from JSparse import SparseTensor
from JSparse.nn import functional as F
# from utils import make_ntuple
__all__ = ['Conv3d']
class Conv3d(nn.Module):
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, ...]] = 3,
stride: Union[int, Tuple[int, ...]] = 1,
dilation: int = 1,
groups: int = 1,
bias: bool = False,
transposed: bool = False) -> None:
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
# self.kernel_size = make_ntuple(kernel_size, ndim=3)
# self.stride = make_ntuple(stride, ndim=3)
# self.dilation = dilation
self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size, kernel_size)
self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation, dilation)
self.groups = groups
assert in_channels % groups == 0, 'in_channels must be divisible by groups'
assert out_channels % groups == 0, 'out_channels must be divisible by groups'
self.transposed = transposed
self.kernel_volume = int(np.prod(self.kernel_size))
# if self.kernel_volume > 1:
# self.kernel = nn.Parameter(
# jt.zeros(self.kernel_volume, in_channels, out_channels))
# else:
# self.kernel = nn.Parameter(jt.zeros(in_channels, out_channels))
# if bias:
# self.bias = nn.Parameter(jt.zeros(out_channels))
# else:
# self.register_parameter('bias', None)
# self.reset_parameters()
fan = (self.out_channels if self.transposed else self.in_channels) * self.kernel_volume
std = 1 / math.sqrt(fan)
if self.kernel_volume > 1:
self.weight = init.uniform([self.kernel_volume, in_channels, out_channels], 'float32', -std, std)
else:
self.weight = init.uniform([in_channels, out_channels], 'float32')
if bias:
self.bias = init.uniform([out_channels], "float32", -std, std)
else:
self.bias = None
# self.reset_parameters()
def execute(self, input: SparseTensor) -> SparseTensor:
return F.conv3d(input,
weight=self.weight,
kernel_size=self.kernel_size,
bias=self.bias,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
transposed=self.transposed)
# def set_parameters(self) -> None:
# std = 1 / math.sqrt(
# (self.out_channels if self.transposed else self.in_channels)
# * self.kernel_volume)
# self.weight *= std
# if self.bias is not None:
# self.bias *= std

@@ -1,34 +0,0 @@
import jittor as jt
from jittor import nn
from JSparse import SparseTensor
from JSparse.nn.utils import fapply
__all__ = ['BatchNorm', 'GroupNorm']
class BatchNorm(nn.BatchNorm):
def execute(self, input: SparseTensor) -> SparseTensor:
return fapply(input, super().execute)
class GroupNorm(nn.GroupNorm):
def execute(self, input: SparseTensor) -> SparseTensor:
indices, values, stride, size = input.indices, input.values, input.stride, input.size
batch_size = jt.max(indices[:, 0]).item() + 1
num_channels = values.shape[1]
n_values = jt.zeros_like(values)
for k in range(batch_size):
idx = indices[:, 0] == k
b_values = values[idx]
b_values = b_values.t().reshape(1, num_channels, -1)
b_values = super().execute(b_values)
b_values = b_values.reshape(num_channels, -1).t()
n_values[idx] = b_values
output = SparseTensor(indices=indices, values=n_values, stride=stride, size=size)
output.cmaps = input.cmaps
output.kmaps = input.kmaps
return output

@@ -1,11 +0,0 @@
import jittor as jt
from jittor import nn
from JSparse import SparseTensor
from JSparse.nn.functional import global_avg_pool, global_max_pool
__all__ = ['GlobalAvgPool', 'GlobalMaxPool']
GlobalAvgPool = jt.make_module(global_avg_pool)
GlobalMaxPool = jt.make_module(global_max_pool)

@@ -1,2 +0,0 @@
from .apply import *
from .kernel import *

@@ -1,15 +0,0 @@
from typing import Callable
import jittor as jt
from JSparse import SparseTensor
__all__ = ['fapply']
def fapply(input: SparseTensor, fn: Callable[..., jt.Var], *args,
**kwargs) -> SparseTensor:
values = fn(input.values, *args, **kwargs)
output = SparseTensor(indices=input.indices, values=values, stride=input.stride, size=input.size)
output.cmaps = input.cmaps
output.kmaps = input.kmaps
return output
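`fapply` lifts a per-value Jittor op to a `SparseTensor`, reusing the indices and the cached coordinate/kernel maps. A hypothetical helper built on it (not part of the library):
```
from JSparse import SparseTensor
from JSparse.nn.utils import fapply

def scale_values(x: SparseTensor, factor: float = 2.0) -> SparseTensor:
    # Scales the stored values; indices, cmaps and kmaps are reused as-is.
    return fapply(x, lambda v: v * factor)
```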

@@ -1,28 +0,0 @@
from typing import Tuple, Union
import numpy as np
import jittor as jt
from JSparse.utils import make_ntuple, trunc
__all__ = ['get_kernel_offsets']
def get_kernel_offsets(kernel_size: Union[int, Tuple[int, ...]],
stride: Union[int, Tuple[int, ...]] = 1,
dilation: Union[int, Tuple[int, ...]] = 1) -> jt.Var:
kernel_size = make_ntuple(kernel_size, ndim=3)
stride = make_ntuple(stride, ndim=3)
dilation = make_ntuple(dilation, ndim=3)
offsets = [(np.arange(-kernel_size[k] // 2 + 1, kernel_size[k] // 2 + 1) * stride[k]
* dilation[k]) for k in range(3)]
if np.prod(kernel_size) % 2 == 1:
offsets = [[x, y, z] for z in offsets[2] for y in offsets[1]
for x in offsets[0]]
else:
offsets = [[x, y, z] for x in offsets[0] for y in offsets[1]
for z in offsets[2]]
offsets = jt.int32(offsets)
return offsets
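For an odd kernel the offsets are centred around zero, e.g. `kernel_size=3` gives the 27 offsets of `{-1, 0, 1}^3` scaled by `stride * dilation`. A quick check:
```
from JSparse.nn.utils import get_kernel_offsets

offsets = get_kernel_offsets(kernel_size=3, stride=1, dilation=1)
print(offsets.shape)   # [27, 3]
print(offsets[:3])     # [[-1,-1,-1], [0,-1,-1], [1,-1,-1]]; x varies fastest
```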

@@ -1,125 +0,0 @@
from itertools import count
import numpy as np
import jittor as jt
from jittor.misc import _pair, _triple
from typing import Any, Dict, Tuple, Union
from type_check import type_check
from JSparse.utils import make_ntuple, sparse_quantize, set_hash
# from .utils.quantize import sparse_quantize
# from indice_manager import IndiceManager
class SparseTensor:
@type_check
def __init__(
self,
indices: jt.Var,
values: jt.Var,
stride: Union[int, Tuple[int, ...]],
size=None,
quantize=True,
voxel_size=1,
coalesce_mode:str='sum',
indice_manager=None,
device=None,
):
assert (values.ndim == 2)
# self.indices = indices
# self.values = values
self.size = size
self.ndim = indices.shape[1] - 1
        self.stride = _triple(stride)
self.voxel_size = voxel_size
self.coalesce_mode = coalesce_mode
self.cmaps = {}
self.kmaps = {}
##########################
# Setup CoordsManager
##########################
# if indice_manager is None:
# self.indice_manager = IndiceManager(
# ndim=self.ndim,
# )
##########################
# Initialize coords
##########################
if quantize:
self.seed = 1
for i in range(len(self.stride)):
self.seed += i
self.seed *= self.stride[i]
self.hash_multiplier = set_hash(self.ndim, self.seed)
self.hash_num, self.indices, mapping, inverse_mapping, count = sparse_quantize(indices, self.hash_multiplier, self.voxel_size, return_index=True, return_inverse=True, return_count=True)
self.inverse_mapping = inverse_mapping
if len(values.shape) == 1:
out_size = (self.indices.shape[0], )
elif len(values.shape) == 2:
out_size = (self.indices.shape[0], values.shape[-1])
if self.coalesce_mode == 'sum':
out_size = (self.indices.shape[0], values.shape[-1])
self.values = jt.zeros(out_size, dtype=values.dtype).scatter_(0, inverse_mapping, values, reduce='add')
elif self.coalesce_mode == 'average':
out_size = (self.indices.shape[0], values.shape[-1])
self.values = jt.zeros(out_size, dtype=values.dtype).scatter_(0, inverse_mapping, values, reduce='add')
self.values /= count
elif self.coalesce_mode == 'sample':
self.values = values[self.indices]
else:
self.indices = indices
self.values = values
# if indice_manager is None:
# # TODO If set to share the indices man, use the global indices man
# # init the indices
# indice_manager = Indice
def _indices(self):
return self.indices
def _values(self):
return self.values
def _binary_operation(self, other, _binary_op):
assert isinstance(other, self.__class__)
return
# TODO set up the indices dict
        # so that we do not need to merge the indice group
# which has already been merged
# if the indices of self and other should be merged
class PointTensor:
def __init__(self, values, indices, idx_query=None, weights=None):
self.values = values
self.indices = indices
self.idx_query = idx_query if idx_query is not None else {}
self.weights = weights if weights is not None else {}
self.additional_values = {}
self.additional_values['idx_query'] = {}
self.additional_values['counts'] = {}
def detach(self):
self.values = self.values.detach()
self.indices = self.indices.detach()
return self
def __add__(self, other):
pt = PointTensor(self.values + other.values, self.indices, self.idx_query,
self.weights)
pt.additional_values = self.additional_values
return pt

@@ -1,42 +0,0 @@
import jittor as jt
from jittor import Function
def spmm(
rows: jt.Var,
cols: jt.Var,
vals: jt.Var,
size: jt.NanoVector,
mat: jt.Var,
spmm_mode='scatter',
is_sorted: bool = False,
cuda_spmm_alg: int = 1,
) -> jt.Var:
assert len(rows) == len(cols), "Invalid length"
assert len(rows) == len(vals), "Invalid length"
assert vals.dtype == mat.dtype, "dtype mismatch"
if jt.flags.use_cuda > 1:
assert jt.has_cuda == 1, "No GPUs available"
rows = rows.int()
cols = cols.int()
'''
TODO: Using the coo_spmm of cuSPARSE on GPU
result = coo_spmm_int32(
rows, cols, vals, size[0], size[1], mat, cuda_spmm_alg, is_sorted
)
'''
else:
if (spmm_mode == 'scatter'):
class SPMM(Function):

@@ -1,2 +0,0 @@
from .quantize import *
from .utils import *

@@ -1,58 +0,0 @@
from itertools import repeat
from typing import List, Tuple, Union
import jittor as jt
import numpy as np
from .utils import unique1d
__all__ = ['sparse_quantize', 'set_hash']
def set_hash(ndim, seed, low=100, high=1000):
jt.set_seed(seed)
return jt.randint(low, high, shape=(ndim + 1,), dtype='uint64')
def hash(x: np.ndarray, multiplier: np.ndarray) -> jt.Var:
assert x.ndim == 2, x.shape
x = x - x.min(dim=0)
x = x.uint64()
h = jt.zeros(x.shape[0], dtype='uint64')
for k in range(x.shape[1] - 1):
h += x[:, k]
h *= multiplier[k]
h += x[:, -1]
return h
def sparse_quantize(indices,
hash_multiplier,
voxel_size: Union[float, Tuple[float, ...]] = 1,
*,
return_index: bool = False,
return_inverse: bool = False,
return_count: bool = False) -> List[np.ndarray]:
if indices.dtype.is_int() and voxel_size == 1:
pass
else:
if isinstance(voxel_size, (float, int)):
voxel_size = tuple(repeat(voxel_size, 3))
assert isinstance(voxel_size, tuple) and len(voxel_size) == 3
voxel_size = jt.Var(voxel_size)
indices[:, 1:] /= voxel_size
indices = jt.floor(indices).astype(jt.int32)
hash_num, mapping, inverse_mapping, count = unique1d(hash(indices, hash_multiplier))
indices = indices[mapping]
outputs = [hash_num, indices]
if return_index:
outputs += [mapping]
if return_inverse:
outputs += [inverse_mapping]
if return_count:
outputs += [count]
return outputs[0] if len(outputs) == 1 else outputs

@@ -1,43 +0,0 @@
from itertools import repeat
from typing import List, Tuple, Union
import jittor as jt
__all__ = ['make_ntuple', 'trunc', 'unique1d']
def make_ntuple(x: Union[int, List[int], Tuple[int, ...], jt.Var],
ndim: int) -> Tuple[int, ...]:
if isinstance(x, int):
x = tuple(repeat(x, ndim))
elif isinstance(x, list):
x = tuple(x)
elif isinstance(x, jt.Var):
x = tuple(x.view(-1).numpy().tolist())
assert isinstance(x, tuple) and len(x) == ndim, x
return x
def trunc(x: jt.Var):
if x >= 0:
return jt.floor(x)
else:
return jt.ceil(x)
def unique1d(var):
assert len(var.shape) == 1
perm, aux = jt.argsort(var)
mask = jt.empty(aux.shape, dtype='bool')
mask[:1] = True
mask[1:] = aux[1:] != aux[:-1]
ret = (aux[mask],)
ret += (perm[mask],)
imask = jt.cumsum(mask.astype(perm.dtype)) - 1
inv_idx = jt.empty(mask.shape, dtype=perm.dtype)
inv_idx[perm] = imask
ret += (inv_idx,)
idx = jt.concat([jt.nonzero(mask).view(-1), jt.Var(mask.shape[0])])
ret += (idx[1:] - idx[:-1],)
return ret
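`unique1d` mirrors `np.unique(..., return_index=True, return_inverse=True, return_counts=True)` for 1-D Vars. A short sketch of the four return values:
```
import jittor as jt
from JSparse.utils import unique1d

vals, index, inverse, counts = unique1d(jt.Var([3, 1, 3, 2, 1]))
# vals    -> [1, 2, 3]   unique values in sorted order
# index   -> position of one occurrence of each unique value in the input
# inverse -> for every input element, its slot in vals
# counts  -> [2, 1, 2]
```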

@@ -1,15 +0,0 @@
from setuptools import find_packages, setup
with open("../README.md", "r") as file:
description = file.read()
print(find_packages())
setup(
name='JSparse',
version='0.1',
description=description,
packages=find_packages(),
install_requires=["jittor", "type_check"]
)

@@ -1,96 +0,0 @@
import time
import math
import numpy as np
import torch
import jittor as jt
import jittor.nn as nn
from jittor import init
from jittor.misc import _pair, _triple
from itertools import repeat
from typing import List, Tuple, Union
from JSparse import SparseTensor
from JSparse import PointTensor
from JSparse.utils import make_ntuple
from JSparse.nn import functional as F
from JSparse.nn.utils import get_kernel_offsets
from JSparse.nn.functional import Convolution
import torchsparse
from torchsparse import nn as spnn
jt.flags.use_cuda = 1
in_channels = 3
out_channels = 64
kernel_size = 3
stride = 1
dilation = 1
groups = 1
bias = False
transposed = False
kernel_size = _triple(kernel_size)
stride = _triple(stride)
dilation = _triple(dilation)
kernel_volume = int(np.prod(kernel_size))
N = 10
coords = np.random.uniform(0, 10, size=(N, 4))
feats = np.random.randn(N, 3)
labels = np.random.choice(5, N)
print(coords.shape)
print(feats.shape)
coo = jt.Var(coords)
val = jt.Var(feats)
size = (10, 10, 10)
fan = (out_channels if transposed else in_channels) * kernel_volume
std = 1 / math.sqrt(fan)
if kernel_volume > 1:
weight = init.uniform([kernel_volume, in_channels, out_channels], 'float32', -std, std)
else:
weight = init.uniform([in_channels, out_channels], 'float32')
if bias:
bias = init.uniform([out_channels], "float32", -std, std)
else:
bias = None
x = SparseTensor(coo, val, 1, size)
z = PointTensor(x.values, x.indices.float())
pc_hash = F.sphash(
jt.concat([
z.indices[:, 0].int().view(-1, 1),
jt.floor(z.indices[:, 1:] / x.stride[0]).int() * x.stride[0]
], 1))
sparse_hash = F.sphash(x.indices)
idx_query = F.spquery(pc_hash, sparse_hash).int()
counts = F.spcount(idx_query, x.indices.shape[0])
z.additional_values['idx_query'][x.stride] = idx_query
z.additional_values['counts'][x.stride] = counts
inserted_values = F.spvoxelize(z.values, idx_query, counts)
new_tensor = SparseTensor(inserted_values, x.indices, x.stride, x.size, False)
new_tensor.cmaps = x.cmaps
new_tensor.kmaps = x.kmaps
print(inserted_values)
offsets = get_kernel_offsets(kernel_size=2, stride=x.stride, dilation=1)
old_hash = F.sphash(
jt.concat([
z.indices[:, 0].int().view(-1, 1),
jt.floor(z.indices[:, 1:] / x.stride[0]).int() * x.stride[0]
], 1), offsets)
pc_hash = F.sphash(x.indices)
idx_query = F.spquery(old_hash, pc_hash).int()
weights = F.calc_ti_weights(z.indices, idx_query,
scale=x.stride[0]).t()
idx_query = idx_query.t()
new_values = F.spdevoxelize(x.values, idx_query, weights)
print(jt.grad(new_values, x.values))