Revert "Change the architecture of the code and add setup.py"

zjp-shadow 2022-07-27 12:09:18 +08:00 committed by GitHub
parent 0ab603e6a3
commit 7c56d95e6d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
31 changed files with 0 additions and 2132 deletions

.gitignore vendored
@@ -1,2 +0,0 @@
*.ipynb
__pycache__

@@ -1,11 +1,3 @@
# JSparse
JSparse is a high-performance auto-differentiation library for sparse tensors.
## How to Install
```
cd python
python setup.py install
```
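For reference, a minimal end-to-end sketch assembled from the example script shipped in this commit (the `SparseTensor(coords, feats, 1, 1)` call and the `spnn.Conv3d` signature are taken from that script; treat it as illustrative rather than a stable API):
```
import numpy as np
import jittor as jt
from JSparse import SparseTensor
from JSparse import nn as spnn

# Random point cloud; by convention column 0 of the indices is the batch index.
coords = np.random.uniform(0, 100, size=(1000, 4))
coords[:, 0] = 0                       # single sample
feats = np.random.randn(1000, 4)

x = SparseTensor(jt.Var(coords), jt.Var(feats), 1, 1)   # stride=1, size=1, as in the example
conv = spnn.Conv3d(4, 32, kernel_size=3)
y = conv(x)                            # y.values has 32 channels per active site
```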

@@ -1,73 +0,0 @@
import jittor
from jittor import nn
import JSparse
from JSparse import SparseTensor
from JSparse import nn as spnn
from JSparse.utils.quantize import sparse_quantize
import numpy as np
class RandomDataset(jittor.dataset.Dataset):
def __init__(self, input_size: int, voxel_size: float) -> None:
super().__init__()
self.set_attrs(total_len = input_size)
self.voxel_size = voxel_size
def __getitem__(self, _: int):
inputs = np.random.uniform(-100, 100, size=(self.total_len, 4))
labels = np.random.choice(10, size=self.total_len)
coords, feats = inputs[:, :], inputs
coords -= np.min(coords, axis=0, keepdims=True)
# coords, indices = sparse_quantize(coords, self.voxel_size, return_index=True)
coords = jittor.Var(coords)
feats = jittor.Var(feats)
labels = jittor.Var(labels)
# coords = jittor.Var(coords, dtype=jittor.int64)
# feats = jittor.Var(feats[indices], dtype=jittor.float64)
# labels = jittor.Var(labels[indices], dtype=jittor.int64)
print(type(coords))
inputs = SparseTensor(coords, feats, 1, 1)
labels = SparseTensor(coords, labels, 1, 1)
return inputs, labels
if __name__ == '__main__':
np.random.seed(0)
dataset = RandomDataset(input_size=10000, voxel_size=0.2)
model = nn.Sequential(
spnn.Conv3d(4, 32, 3),
spnn.BatchNorm(32),
spnn.ReLU(True),
spnn.Conv3d(32, 64, 2, stride=2),
spnn.BatchNorm(64),
spnn.ReLU(True),
spnn.Conv3d(64, 64, 2, stride=2, transposed=True),
spnn.BatchNorm(64),
spnn.ReLU(True),
spnn.Conv3d(64, 32, 3),
spnn.BatchNorm(32),
spnn.ReLU(True),
spnn.Conv3d(32, 10, 1),
)
criterion = nn.CrossEntropyLoss()
optimizer = jittor.optim.Adam(model.parameters(), lr=1e-3)
model.train()
lens = len(dataset)
for batch_idx, (inputs, labels) in enumerate(dataset):
outputs = model(inputs)
loss = criterion(outputs, labels)
        optimizer.step(loss)
if batch_idx % 10 == 0:
print('Training: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
batch_idx, lens , 100. * batch_idx / lens, loss.numpy()[0]))

@@ -1 +0,0 @@
from .sparse import *

@@ -1,34 +0,0 @@
import os
import numpy as np
from typing import Union, List, Tuple
import jittor as jt
from jittor import Function
class IndiceManager:
def __init__(
self,
ndim,
# indice_map_type,
# sparse_alorithm, # set m_hashtable_occupancy for concurrent_unordered_map
):
# if indice_map_type == 'GPU':
# assert(jt.has_cuda)
self.ndim = ndim
# self.indice_map_type = indice_map_type
# self.sparse_algorithm = sparse_alorithm
self.stride_key_manager = {}
self.indice_map_manager = {}
self.kernel_map_manager = {}
def insert(self, stride, indice_key, indice_hash):
self.stride_key_manager[stride] = indice_key
self.indice_map_manager[indice_key] = indice_hash
# class IndiceMapManager

@@ -1 +0,0 @@
from .modules import *

@@ -1,10 +0,0 @@
from .activation import *
from .conv import *
from .count import *
# from .crop import *
from .devoxelize import *
from .downsample import *
from .hash import *
from .pooling import *
from .query import *
from .voxelize import *

@@ -1,24 +0,0 @@
import jittor as jt
import jittor.nn as nn
from JSparse import SparseTensor
from JSparse.nn.utils import fapply
__all__ = ['relu', 'leaky_relu']
# __all__ = ['relu', 'leaky_relu', 'ReLU', 'LeakyReLU']
def relu(input: SparseTensor) -> SparseTensor:
return fapply(input, nn.relu)
def leaky_relu(input: SparseTensor,
scale: float = 0.01) -> SparseTensor:
return fapply(input,
nn.leaky_relu,
scale=scale)
# Relu = jt.make_module(relu)
# ReLU = Relu
# Leaky_relu = jt.make_module(leaky_relu, 2)
# LeakyReLU = Leaky_relu

@@ -1,351 +0,0 @@
from typing import List, Optional, Tuple, Union
import jittor as jt
from jittor import Function
from jittor.misc import _pair, _triple
from JSparse import SparseTensor
from JSparse.nn import functional as F
from JSparse.nn.utils import get_kernel_offsets
from JSparse import make_ntuple
__all__ = ['conv3d', 'Convolution']
class Convolution(Function):
def execute(
self,
input: jt.Var,
weight: jt.Var,
nbmaps: jt.Var,
nbsizes: jt.Var,
sizes: Tuple[int, int],
transposed: bool = False,
) -> jt.Var:
if not transposed:
output = jt.zeros((sizes[1], weight.size(-1)))
else:
output = jt.zeros((sizes[0], weight.size(-1)))
assert input.size(1) == weight.size(1)
in_size = input.size(0)
in_channels = input.size(1)
out_size = output.size(0)
out_channels = output.size(1)
kernel_volume = weight.size(0)
flag = False
mid = kernel_volume // 2
in_buffer_size = jt.Var(1)
if kernel_volume % 2 and out_size == in_size:
flag = True
in_buffer_size = max(in_buffer_size, jt.max(nbsizes[:mid]))
in_buffer_size = max(in_buffer_size, jt.max(nbsizes[mid + 1:]))
output = jt.matmul(input, weight[mid, :, :])
else:
in_buffer_size = jt.max(nbsizes)
# in_buffer_activated : in_buffer_size * in_channels
# weight : in_channels * out_channels
# out_buffer_activated: in_buffer_size * out_channels
# out_buffer_activated = in_buffer_activated * weight
in_buffer_activated = jt.zeros(in_buffer_size.tolist() + [in_channels])
cur_offset = jt.Var(0)
for i in range(kernel_volume):
n_active_feats = nbsizes[i]
if (flag and (i == mid)):
cur_offset += n_active_feats
continue
if n_active_feats == 0:
continue
t = 1 if transposed else 0
################
## gather
################
# print(n_active_feats, in_channels, cur_offset, t)
gather(n_active_feats, in_channels, cur_offset, t, input, in_buffer_activated, nbmaps)
################
## matmul
################
out_buffer_activated = jt.matmul(in_buffer_activated, weight[i, :, :])
################
## scatter
################
scatter(n_active_feats, out_channels, cur_offset, t, out_buffer_activated, output, nbmaps)
cur_offset += n_active_feats
self.save_vars = input, weight, nbmaps, nbsizes, transposed
return output
def grad(
self,
grad_output: jt.Var
) -> Tuple[Optional[jt.Var], ...]:
input, weight, nbmaps, nbsizes, transposed = self.save_vars
grad_input = jt.zeros_like(input)
grad_weight = jt.zeros_like(weight)
# n_in_feats = input.size(0)
# n_out_feats = grad_output.size(0)
n_in_channels = input.size(1)
n_out_channels = weight.size(-1)
kernel_volume = weight.size(0)
flag = False
in_buffer_size = jt.max(nbsizes)
# out_grad_buffer_activated n_active_feats x n_out_channels
# in_grad_buffer_activated n_active_feats x n_in_channels
# in_buffer_activated n_active_feats x n_in_channels
out_grad_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_out_channels])
in_grad_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_in_channels])
in_buffer_activated = jt.zeros(in_buffer_size.tolist() + [n_in_channels])
cur_offset = jt.Var(0)
mid = kernel_volume // 2
for i in range(kernel_volume):
# kernel_grad_buffer = grad_weight[i, :, :]
n_active_feats = nbsizes[i]
# if flag and (i == mid):
# cur_offset += n_active_feats
# continue
if n_active_feats == 0:
continue
t = 1 if transposed else 0
################
## gather
################
gather(n_active_feats, n_out_channels, cur_offset, 1 - t, grad_output, out_grad_buffer_activated, nbmaps)
gather(n_active_feats, n_in_channels , cur_offset, t , input , in_buffer_activated , nbmaps)
################
## matmul
################
# grad for input
# in_grad_buffer_activated = out_grad_buffer_activated @ weight^T
# n_active_feats x n_in_channels n_active_feats x n_out_channels n_out_channels x n_in_channels
in_grad_buffer_activated = jt.nn.matmul_transpose(out_grad_buffer_activated, weight[i, :, :])
# grad for weight
# kernel_grad_buffer = in_buffer_activated^T @ out_grad_buffer_activated
# n_in_channels x n_out_channels n_in_channels x n_active_feats n_active_feats x n_out_channels
grad_weight[i, :, :] = jt.nn.matmul(in_buffer_activated.t(), out_grad_buffer_activated)
################
## scatter
################
scatter(n_active_feats, n_in_channels, cur_offset, 1 - t, in_grad_buffer_activated, grad_input, nbmaps)
cur_offset += n_active_feats
return grad_input, grad_weight, None, None, None, None
def gather(
n_active_feats,
channels,
cur_offset,
transpose,
in_feat,
out_feat,
kmap,
):
shape = n_active_feats.tolist() + cur_offset.tolist() + [channels, transpose, 0]
gather_args = jt.zeros(shape, dtype='int32')
return jt.code((0, ), out_feat.dtype, [in_feat, out_feat, kmap, gather_args],
cuda_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cuda_src="""
__global__ void gather_kernel(@ARGS_DEF) {
@PRECALC
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int i = idx / c;
int j = idx % c;
if (i >= n_k) return;
int in_pos = @kmap(st + i, transpose);
// if (in_pos < 0) return;
@out_feat(i, j) = @in_feat(in_pos, j);
}
gather_kernel<<< (out_feat_shape0 * out_feat_shape1 + 255) / 256, 256 >>>(@ARGS);
""",
cpu_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cpu_src="""
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
for (int i = 0; i < n_k; ++ i ) {
int in_pos = @kmap(st + i, transpose);
// if (in_pos < 0) {
// continue;
// }
#pragma omp parallel for
for (int j = 0; j < c; ++ j ) {
@out_feat(i, j) = @in_feat(in_pos, j);
// @out(i, j) = @in_feat(in_pos, j);
}
}
"""
).sync()
def scatter(
n_active_feats,
channels,
cur_offset,
transpose,
in_feat,
out_feat,
kmap,
):
shape = n_active_feats.tolist() + cur_offset.tolist() + [channels, transpose, 0]
scatter_args = jt.zeros(shape, dtype='int32')
return jt.code((0, ), out_feat.dtype, [in_feat, out_feat, kmap, scatter_args],
cuda_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cuda_src="""
__global__ void scatter_kernel(@ARGS_DEF) {
@PRECALC
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int i = idx / c;
int j = idx % c;
if (i >= n_k) return;
int out_pos = @kmap(st + i, 1 - transpose);
// if (out_pos < 0) return;
@out_feat(out_pos, j) += @in_feat(i, j);
}
scatter_kernel<<< (out_feat_shape0 * out_feat_shape1 + 255) / 256, 256 >>>(@ARGS);
""",
cpu_header="""
@alias(in_feat, in0)
@alias(out_feat, in1)
@alias(kmap, in2)
@alias(args, in3)
""",
cpu_src="""
const int n_k = args_shape0;
const int st = args_shape1;
const int c = args_shape2;
const int transpose = args_shape3;
for (int i = 0; i < n_k; ++ i ) {
int out_pos = @kmap(st + i, 1 - transpose);
// if (out_pos < 0) {
// continue;
// }
#pragma omp parallel for
for (int j = 0; j < c; ++ j ) {
@out_feat(out_pos, j) += @in_feat(i, j);
// @out(out_pos, j) += @in_feat(i, j);
}
}
"""
).sync()
def conv3d(
input: SparseTensor,
weight: jt.Var,
kernel_size: Union[int, Tuple[int, ...]],
bias: Optional[jt.Var] = None,
stride: Union[int, Tuple[int, ...]] = 1,
dilation: Union[int, Tuple[int, ...]] = 1,
group: int = 1,
transposed: bool = False,
) -> SparseTensor:
# kernel_size = make_ntuple(kernel_size, ndim=3)
# stride = make_ntuple(stride, ndim=3)
# dilation = make_ntuple(dilation, ndim=3)
kernel_size = _triple(kernel_size)
stride = _triple(stride)
dilation = _triple(dilation)
if (kernel_size == _triple(1) and stride == _triple(1) and dilation == _triple(1)):
output_stride = input.stride
output_indices = input.indices
output_values = input.values.matmul(weight)
elif not transposed:
output_stride = tuple(input.stride[k] * stride[k] for k in range(3))
if output_stride in input.cmaps:
output_indices = input.cmaps[output_stride]
elif all(stride[k] == 1 for k in range(3)):
output_indices = input.indices
else:
output_indices = F.spdownsample(
input.indices,
stride,
kernel_size,
input.stride,
)
if (input.stride, kernel_size, stride, dilation) not in input.kmaps:
offsets = get_kernel_offsets(
kernel_size,
stride=input.stride,
dilation=dilation,
)
references = F.sphash(input.indices) # (N,)
queries = F.sphash(output_indices, offsets) # (|K|, N)
results = F.spquery(queries, references) # (|K|, N)
nbsizes = jt.sum(results != -1, dim=1)
nbmaps = jt.nonzero(results != -1)
indices = nbmaps[:, 0] * results.size(1) + nbmaps[:, 1]
nbmaps[:, 0] = results.view(-1)[indices]
input.kmaps[(input.stride, kernel_size, stride, dilation)] = [
nbmaps, nbsizes, (input.indices.shape[0], output_indices.shape[0])
]
output_values = Convolution.apply(
input.values,
weight,
*input.kmaps[(input.stride, kernel_size, stride, dilation)],
transposed,
)
else:
output_stride = tuple(input.stride[k] // stride[k] for k in range(3))
output_indices = input.cmaps[output_stride]
output_values = Convolution.apply(
input.values,
weight,
*input.kmaps[(output_stride, kernel_size, stride, dilation)],
transposed,
)
if bias is not None:
output_values += bias
    # size has to be set
output = SparseTensor(
indices=output_indices,
values=output_values,
stride=output_stride,
size=input.size
)
output.cmaps = input.cmaps
output.cmaps.setdefault(output_stride, output_indices)
output.kmaps = input.kmaps
return output
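The `execute` path above is the usual gather-GEMM-scatter scheme for sparse convolution. A toy NumPy sketch of a single kernel-offset step, assuming (as in the non-transposed case above) that column 0 of `nbmaps` holds input row indices and column 1 the corresponding output rows:
```
import numpy as np

# One kernel offset: gather the active input rows, multiply by that
# offset's weight slice, scatter-add into the output rows.
in_feats  = np.random.randn(6, 4)              # N_in  x C_in
weight_k  = np.random.randn(4, 8)              # C_in  x C_out (weight[i, :, :])
out_feats = np.zeros((5, 8))                   # N_out x C_out
nbmap_k   = np.array([[0, 2], [3, 0], [5, 4]]) # (input_idx, output_idx) pairs

buf_in  = in_feats[nbmap_k[:, 0]]              # gather
buf_out = buf_in @ weight_k                    # matmul
np.add.at(out_feats, nbmap_k[:, 1], buf_out)   # scatter-add
```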

@@ -1,30 +0,0 @@
import jittor as jt
def spcount(idx_query: jt.Var, num: int) -> jt.Var:
return jt.code((num,), jt.int32, [idx_query],
cuda_src="""
__global__ void count_kernel(@ARGS_DEF) {
@PRECALC
@alias(idx_query, in0)
int i = blockDim.x * blockIdx.x + threadIdx.x;
int cur_idx = @idx_query(i);
if (i < idx_query_shape0 && cur_idx >= 0) {
atomicAdd(out_p + cur_idx, 1);
}
}
@alias(idx_query, in0)
count_kernel<<<(idx_query_shape0 + 511) / 512, 512>>>(@ARGS);
""",
cpu_src="""
@alias(idx_query, in0)
#pragma omp parallel for
for (int i = 0; i < idx_query_shape0; ++ i ) {
int cur_idx = @idx_query(i);
if (cur_idx < 0) {
continue;
}
#pragma omp atomic
@out(cur_idx) ++;
}
"""
)

@@ -1,166 +0,0 @@
import jittor as jt
from jittor import Function
from JSparse import SparseTensor
__all__ = ['calc_ti_weights', 'spdevoxelize']
def calc_ti_weights(
indices: jt.Var,
idx_query: jt.Var,
scale: float = 1
) -> jt.Var:
with jt.no_grad():
p = indices
if scale != 1:
pf = jt.floor(indices / scale) * scale
else:
pf = jt.floor(indices)
pc = pf + scale
x = p[:, 1].view(-1, 1)
y = p[:, 2].view(-1, 1)
z = p[:, 3].view(-1, 1)
xf = pf[:, 1].view(-1, 1).float()
yf = pf[:, 2].view(-1, 1).float()
zf = pf[:, 3].view(-1, 1).float()
xc = pc[:, 1].view(-1, 1).float()
yc = pc[:, 2].view(-1, 1).float()
zc = pc[:, 3].view(-1, 1).float()
w0 = (xc - x) * (yc - y) * (zc - z)
w1 = (xc - x) * (yc - y) * (z - zf)
w2 = (xc - x) * (y - yf) * (zc - z)
w3 = (xc - x) * (y - yf) * (z - zf)
w4 = (x - xf) * (yc - y) * (zc - z)
w5 = (x - xf) * (yc - y) * (z - zf)
w6 = (x - xf) * (y - yf) * (zc - z)
w7 = (x - xf) * (y - yf) * (z - zf)
w = jt.concat([w0, w1, w2, w3, w4, w5, w6, w7], dim=1).t()
if scale != 1:
w /= scale ** 3
w[idx_query == -1] = 0
w /= jt.sum(w, dim=0) + 1e-8
return w
class Devoxelize(Function):
def execute(
self,
values: jt.Var,
idx_query: jt.Var,
weights: jt.Var
) -> jt.Var:
# c = values_shape1
# N = idx_query_shape0
output = jt.code((idx_query.shape[0], values.shape[1]), jt.float32, [values, idx_query, weights],
cuda_src="""
__global__ void devoxelize_forward_kernel(@ARGS_DEF) {
@PRECALC
@alias(values, in0)
@alias(idx_query, in1)
@alias(weights, in2)
int index = blockIdx.x * blockDim.x + threadIdx.x;
int i = index / values_shape1;
int j = index % values_shape1;
if (i < idx_query_shape0) {
float cur_values = 0;
for (int k = 0; k < 8; ++ k ) {
int idx = @idx_query(i, k);
cur_values = (idx >= 0) ? @values(idx, j) : 0;
@out(i, j) += @weights(i, k) * cur_values;
}
}
}
devoxelize_forward_kernel<<<out_shape0, out_shape1>>>(@ARGS);
""",
cpu_src="""
@alias(values, in0)
@alias(idx_query, in1)
@alias(weights, in2)
#pragma omp parallel for
for (int i = 0; i < idx_query_shape0; ++ i ) {
for (int j = 0; j < values_shape1; ++ j ) {
float cur_values = 0;
for (int k = 0; k < 8; ++ k ) {
int idx = @idx_query(i, k);
cur_values = (idx >= 0) ? @values(idx, j) : 0;
#pragma omp atomic
@out(i ,j) += @weights(i, k) * cur_values;
}
}
}
"""
)
self.save_vars = (idx_query, weights, values.shape[0])
return output
def grad(self, grad_output: jt.Var):
idx_query, weights, input_size = self.save_vars
grad_values = jt.code((input_size, grad_output.shape[0]), jt.float, [idx_query, weights, grad_output],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
""",
cuda_src="""
__global__ void devoxelize_backward_kernel(@ARGS_DEF) {
@PRECALC
@alias(idx_query, in0)
@alias(weights, in1)
@alias(grad_output, in2)
int index = blockIdx.x * blockDim.x + threadIdx.x;
int c = grad_output_shape1;
int i = index / c;
int j = index % c;
if (i < grad_output_shape0) {
float cur_grad_output = @grad_output(i, j);
#pragma unroll
for (int k = 0; k < 8; ++ k ) {
int idx = @idx_query(i, k);
if (idx >= 0) {
atomicAdd(&@out(idx, j), @weights(i, k) * cur_grad_output);
}
}
}
}
@alias(grad_output, in2)
devoxelize_backward_kernel<<<grad_output_shape0, grad_output_shape1>>>(@ARGS);
""",
cpu_src="""
@alias(idx_query, in0)
@alias(weights, in1)
@alias(grad_output, in2)
for (int i = 0; i < grad_output_shape0; ++ i ) {
#pragma omp parallel for
for (int j = 0; j < grad_output_shape1; ++ j ) {
                float cur_grad_output = @grad_output(i, j);
                for (int k = 0; k < 8; ++ k ) {
                    int idx = @idx_query(i, k);
                    if (idx < 0) continue;  // skip missing neighbours (idx == -1) instead of writing out of bounds
                    #pragma omp atomic
                    @out(idx, j) += @weights(i, k) * cur_grad_output;
                }
}
}
}
"""
)
return grad_values, None, None
def spdevoxelize(
values: jt.Var,
idx_query: jt.Var,
weights: jt.Var
) -> jt.Var:
return Devoxelize.apply(values, idx_query, weights)
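`calc_ti_weights` produces the eight trilinear interpolation coefficients of each point with respect to its surrounding voxel corners, normalised to sum to (approximately) one. A small sanity check, a sketch using the import path the test script in this commit relies on:
```
import jittor as jt
from JSparse.nn import functional as F

# One point at (0.25, 0.5, 0.75) inside the unit voxel, with all eight
# neighbours marked as present (idx_query >= 0).
pts = jt.Var([[0.0, 0.25, 0.5, 0.75]])      # (batch, x, y, z)
idx_query = jt.zeros((8, 1), dtype='int32')
w = F.calc_ti_weights(pts, idx_query, scale=1)
print(w)                    # eight non-negative weights
print(jt.sum(w, dim=0))     # ~1
```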

@@ -1,51 +0,0 @@
from typing import Tuple, Union
import jittor as jt
from jittor.misc import _pair, _triple
from JSparse.nn.utils import get_kernel_offsets
from JSparse.utils import make_ntuple, trunc
__all__ = ['spdownsample']
def spdownsample(
indices: jt.Var,
stride: Union[int, Tuple[int, ...]] = 2,
kernel_size: Union[int, Tuple[int, ...]] = 2,
tensor_stride: Union[int, Tuple[int, ...]] = 1) -> jt.Var:
# stride = make_ntuple(stride, ndim=3)
# kernel_size = make_ntuple(kernel_size, ndim=3)
# tensor_stride = make_ntuple(tensor_stride, ndim=3)
kernel_size = _triple(kernel_size)
stride = _triple(stride)
tensor_stride = _triple(tensor_stride)
sample_stride = [stride[k] * tensor_stride[k] for k in range(3)]
sample_stride = jt.Var(sample_stride,
dtype='int32').unsqueeze(dim=0)
if all(stride[k] in [1, kernel_size[k]] for k in range(3)):
indices = indices.clone()
indices[:, 1:] = trunc(jt.divide(indices[:, 1:], sample_stride)) * sample_stride
else:
offsets = get_kernel_offsets(kernel_size,
tensor_stride)
kernel_volume = offsets.size(0)
indices_min = indices[:, :3].min(dim=0, keepdims=True)
b = indices[:, :1].repeat(1, kernel_volume)
x = indices[:, 1:].unsqueeze(dim=1).repeat(1, kernel_volume, 1) + offsets
indices = jt.cat([b.view(-1, 1), x.view(-1, 3)], dim=1)
# TODO: We need to also filter `indices` based on `indices_max`.
mask = (indices[:, 1:] % sample_stride == 0)
mask &= (indices[:, 1:] >= indices_min)
mask = jt.all(mask, dim=1)
indices = indices[mask]
    # we may have to unique the indices when we define the SparseTensor
# indices = indices[:, [3, 0, 1, 2]]
# indices = jt.unique(indices, dim=0)
# indices = indices[:, [1, 2, 3, 0]]
return indices
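When every `stride[k]` is either 1 or `kernel_size[k]`, downsampling reduces to snapping each coordinate down to a multiple of `stride * tensor_stride`. A plain NumPy illustration of that branch:
```
import numpy as np

sample_stride = np.array([2, 2, 2])              # stride * tensor_stride
coords = np.array([[0, 5, 3, 7]], dtype=float)   # (batch, x, y, z)
coords[:, 1:] = np.trunc(coords[:, 1:] / sample_stride) * sample_stride
print(coords)                                    # [[0. 4. 2. 6.]]
```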

@@ -1,204 +0,0 @@
from typing import Optional
import jittor as jt
__all__ = ['sphash']
def sphash(indices: jt.Var,
offsets: Optional[jt.Var] = None) -> jt.Var:
assert indices.dtype == jt.int, indices.dtype
assert indices.ndim == 2 and indices.shape[1] == 4, indices.shape
if offsets is None:
return jt.code((indices.shape[0],), jt.int64, [indices],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <vector>
@alias(indices, in0)
""",
cuda_src="""
__global__ static void hash_kernel(@ARGS_DEF) {
@PRECALC
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < indices_shape0) {
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)@indices(i, j);
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(i) = hash;
}
}
hash_kernel<<<ceil((double)indices_shape0 / 512), 512>>>(@ARGS);
""",
cpu_header="""
#include <vector>
@alias(indices, in0)
""",
cpu_src="""
#pragma omp parallel for
for (int i = 0; i < indices_shape0; ++ i ) {
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)@indices(i, j);
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(i) = hash;
}
""")
else:
assert offsets.dtype == jt.int32, offsets.dtype
assert offsets.ndim == 2 and offsets.shape[1] == 3, offsets.shape
return jt.code((offsets.shape[0], indices.shape[0]), jt.int64, [indices, offsets],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cmath>
#include <vector>
@alias(indices, in0)
@alias(offsets, in1)
""",
# cuda_src="""
# __global__ void kernel_hash_kernel(@ARGS_DEF){
# @PRECALC
# extern __shared__ int offsets_shared[];
# int ix = blockDim.x * blockIdx.x + threadIdx.x;
# int iy = blockIdx.y;
# // if (!threadIdx.x) {
# // for (int j = 0; j < 3; ++ j ) {
# // offsets_shared[iy * 3 + j] = @offsets(iy, j);
# // }
# // }
# // __syncthreads();
# if (!threadIdx.x) {
# for (int j = 0; j < 3; ++ j ) {
# offsets_shared[iy * 3 + j] = @offsets(iy, j);
# }
# }
# __syncthreads();
# if (ix < indices_shape0 && iy < offsets_shape0) {
# int cur_indices[4];
# for (int j = 1; j <= 3; ++ j ) {
# // cur_indices[j] = @indices(ix, j) + @offsets(iy, j - 1);
# cur_indices[j] = @indices(ix, j) + offsets_shared[iy * 3 + j - 1];
# }
# cur_indices[0] = @indices(ix, 0);
# uint64_t hash = 14695981039346656037UL;
# for (int j = 0; j < 4; ++ j ) {
# hash ^= (unsigned int)cur_indices[j];
# hash *= 1099511628211UL;
# }
# hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
# @out0(iy, ix) = hash;
# }
# }
# dim3 block(512);
# dim3 grid((indices_shape0 + block.x - 1) / block.x, offsets_shape0);
# kernel_hash_kernel<<< grid, block, offsets_shape0 * 3 * sizeof(offsets_type) >>>(@ARGS);
# """,
# cuda_src="""
# __global__ void kernel_hash_kernel(@ARGS_DEF){
# @PRECALC
# extern __shared__ int offsets_shared[];
# int ix = blockDim.x * blockIdx.x + threadIdx.x;
# int iy = blockDim.y * blockIdx.y + threadIdx.y;
# // if (!threadIdx.x) {
# // for (int j = 0; j < 3; ++ j ) {
# // offsets_shared[iy * 3 + j] = @offsets(iy, j);
# // }
# // }
# // __syncthreads();
# if (iy < indices_shape0 && ix < offsets_shape0) {
# int cur_indices[4];
# // for (int j = 1; j <= 3; ++ j ) {
# // cur_indices[j] = @indices(iy, j) + @offsets(ix, j - 1);
# // cur_indices[j] = @indices(iy, j) + offsets_shared[ix * 3 + j - 1];
# // }
# cur_indices[0] = @indices(iy, 0);
# cur_indices[1] = @indices(iy, 1) + @offsets(ix, 0);
# cur_indices[2] = @indices(iy, 2) + @offsets(ix, 1);
# cur_indices[3] = @indices(iy, 3) + @offsets(ix, 2);
# uint64_t hash = 14695981039346656037UL;
# for (int j = 0; j < 4; ++ j ) {
# hash ^= (unsigned int)cur_indices[j];
# hash *= 1099511628211UL;
# }
# hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
# @out0(ix, iy) = hash;
# }
# }
# dim3 block(16, 64);
# dim3 grid((offsets_shape0 + block.x - 1) / block.x), (indices_shape0 + block.y - 1) / block.y);
# kernel_hash_kernel<<< grid, block, offsets_shape0 * 3 * sizeof(offsets_type) >>>(@ARGS);
# """,
cuda_src="""
__global__ void kernel_hash_kernel(@ARGS_DEF){
@PRECALC
extern __shared__ int offsets_shared[];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int k = idx % offsets_shape0;
int i = idx / offsets_shape0;
if (i < indices_shape0) {
int cur_indices[4];
for (int j = 1; j <= 3; ++ j ) {
cur_indices[j] = @indices(i, j) + @offsets(k, j - 1);
}
cur_indices[0] = @indices(i, 0);
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)cur_indices[j];
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(k, i) = hash;
}
}
int thread_nums = 512;
kernel_hash_kernel<<<ceil((double)(indices_shape0 * offsets_shape0) / thread_nums), thread_nums, offsets_shape0 * 3 * sizeof(int)>>>(@ARGS);
""",
cpu_header="""
#include <vector>
@alias(indices, in0)
@alias(offsets, in1)
""",
cpu_src="""
auto K = offsets_shape0;
auto N = indices_shape0;
for (int k = 0; k < offsets_shape0; ++ k ) {
#pragma omp parallel for
for (int i = 0; i < indices_shape0; ++ i ) {
int cur_indices[4];
for (int j = 1; j <= 3; ++ j ) {
cur_indices[j] = @indices(i, j) + @offsets(k, j - 1);
}
cur_indices[0] = @indices(i, 0);
uint64_t hash = 14695981039346656037UL;
for (int j = 0; j < 4; ++ j ) {
hash ^= (unsigned int)cur_indices[j];
hash *= 1099511628211UL;
}
hash = (hash >> 60) ^ (hash & 0xFFFFFFFFFFFFFFF);
@out(k, i) = hash;
}
}
"""
)
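Both the CUDA and CPU kernels above apply the same FNV-1a-style hash to the four coordinate components. A pure-Python reference, intended only as a sketch for sanity-checking a few values against the kernel output:
```
def point_hash(coord4):
    # Mirrors the kernel loop: FNV-1a over the 4 int32 components,
    # followed by the final 60-bit fold.
    h = 14695981039346656037
    for c in coord4:
        h ^= c & 0xFFFFFFFF                        # (unsigned int) cast
        h = (h * 1099511628211) & 0xFFFFFFFFFFFFFFFF
    return (h >> 60) ^ (h & 0x0FFFFFFFFFFFFFFF)

print(point_hash([0, 1, 2, 3]))
```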

@@ -1,26 +0,0 @@
import jittor as jt
from JSparse import SparseTensor
__all__ = ['global_avg_pool', 'global_max_pool']
def global_avg_pool(inputs: SparseTensor) -> jt.Var:
batch_size = jt.max(inputs.indices[:, 0]).item() + 1
outputs = []
for k in range(batch_size):
input = inputs.values[inputs.indices[:, 0] == k]
output = jt.mean(input, dim=0)
outputs.append(output)
outputs = jt.stack(outputs, dim=0)
return outputs
def global_max_pool(inputs: SparseTensor) -> jt.Var:
batch_size = jt.max(inputs.indices[:, 0]).item() + 1
outputs = []
for k in range(batch_size):
input = inputs.values[inputs.indices[:, 0] == k]
output = jt.max(input, dim=0)[0]
outputs.append(output)
outputs = jt.stack(outputs, dim=0)
return outputs

@@ -1,454 +0,0 @@
import jittor as jt
import math
__all__ = ['spquery']
def spquery(queries: jt.Var,
references: jt.Var) -> jt.Var:
q_size = queries.size()
queries = queries.view(-1)
indices = jt.arange(len(references), dtype=jt.int64)
if jt.flags.use_cuda > 0:
n = references.shape[0]
nextPow2 = pow(2, math.ceil(math.log2(n)))
table_size = (4 * nextPow2) if (n < 2048) else (2 * nextPow2)
if table_size < 512:
table_size = 512
num_funcs = 3
key_buf = jt.zeros((table_size,), dtype='int64')
val_buf = jt.zeros((table_size,), dtype='int64')
key = jt.zeros((num_funcs * table_size,), dtype='int64')
val = jt.zeros((num_funcs * table_size,), dtype='int64')
output = jt.code(queries.shape, jt.int64, [queries, references, indices, key_buf, val_buf, key, val],
cuda_header="""
#include <cmath>
#include <iostream>
#include <vector>
#include <cstdint>
#include <chrono>
#include <cstdio>
#include <cstdlib>
/** Reserved value for indicating "empty". */
#define EMPTY_CELL (0)
/** Max rehashing depth, and error depth. */
#define MAX_DEPTH (100)
#define ERR_DEPTH (-1)
/** CUDA naive thread block size. */
#define BLOCK_SIZE (256)
/** CUDA multi-level thread block size = bucket size. */
#define BUCKET_SIZE (512)
__device__ static uint64_t atomicExch(uint64_t *addr, uint64_t val) {
return (uint64_t)atomicExch((unsigned long long int *)addr,
(unsigned long long int)val);
}
/** Struct of a hash function config. */
typedef struct {
int rv; // Randomized XOR value.
int ss; // Randomized shift filter start position.
} FuncConfig;
/** Hard code hash functions and all inline helper functions for CUDA kernels'
* use. */
inline __device__ int do_1st_hash(const uint64_t val, const int num_buckets) {
return val % num_buckets;
}
inline __device__ int do_2nd_hash(const uint64_t val,
const FuncConfig *const hash_func_configs,
const int func_idx, const int size) {
FuncConfig fc = hash_func_configs[func_idx];
return ((val ^ fc.rv) >> fc.ss) % size; // XOR function as 2nd-level hashing.
}
// trying to ignore EMPTY_CELL by adding 1 at make_data.
inline __device__ uint64_t fetch_val(const uint64_t data, const int pos_width) {
return data >> pos_width;
}
inline __device__ int fetch_func(const uint64_t data, const int pos_width) {
return data & ((0x1 << pos_width) - 1);
}
inline __device__ uint64_t make_data(const uint64_t val, const int func,
const int pos_width) {
return (val << pos_width) ^ func;
}
class CuckooHashTableCuda_Multi {
private:
const int _size;
const int _evict_bound;
const int _num_funcs;
const int _pos_width;
const int _num_buckets;
FuncConfig *_d_hash_func_configs;
/** Cuckoo hash function set. */
FuncConfig *_hash_func_configs;
/** Private operations. */
void gen_hash_funcs() {
// Calculate bit width of value range and table size.
int val_width = 8 * sizeof(uint64_t) - ceil(log2((double)_num_funcs));
int bucket_width = ceil(log2((double)_num_buckets));
int size_width = ceil(log2((double)BUCKET_SIZE));
// Generate randomized configurations.
for (int i = 0; i < _num_funcs; ++i) { // At index 0 is a dummy function.
if (val_width - bucket_width <= size_width)
_hash_func_configs[i] = {rand(), 0};
else {
_hash_func_configs[i] = {
rand(), rand() % (val_width - bucket_width - size_width + 1) +
bucket_width};
}
}
};
inline uint64_t fetch_val(const uint64_t data) { return data >> _pos_width; }
inline int fetch_func(const uint64_t data) {
return data & ((0x1 << _pos_width) - 1);
}
public:
CuckooHashTableCuda_Multi(const int size, const int evict_bound,
const int num_funcs)
: _size(size),
_evict_bound(evict_bound),
_num_funcs(num_funcs),
_pos_width(ceil(log2((double)_num_funcs))),
_num_buckets(ceil((double)_size / BUCKET_SIZE)) {
srand(time(NULL));
_d_hash_func_configs = NULL;
_hash_func_configs = NULL;
_hash_func_configs = new FuncConfig[num_funcs];
gen_hash_funcs();
cudaMalloc((void **)&_d_hash_func_configs, _num_funcs * sizeof(FuncConfig));
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
};
~CuckooHashTableCuda_Multi() {
if (_hash_func_configs != NULL) delete[] _hash_func_configs;
if (_d_hash_func_configs != NULL) cudaFree(_d_hash_func_configs);
};
int insert_vals(const uint64_t *const keys, const uint64_t *const vals,
uint64_t *d_key_buf, uint64_t *d_val_buf, uint64_t *d_key,
uint64_t *d_val, const int n);
void lookup_vals(const uint64_t *const keys, uint64_t *const results,
uint64_t *d_key, uint64_t *d_val, const int n);
};
__global__ void cuckooBucketKernel_Multi(
uint64_t *const key_buf, uint64_t *const val_buf, const int size,
const uint64_t *const keys, const uint64_t *const vals, const int n,
int *const counters, const int num_buckets);
__global__ void cuckooInsertKernel_Multi(
uint64_t *const key, uint64_t *const val, const uint64_t *const key_buf,
const uint64_t *const val_buf, const int size,
const FuncConfig *const hash_func_configs, const int num_funcs,
const int *const counters, const int num_buckets, const int evict_bound,
const int pos_width, int *const rehash_requests);
__global__ void cuckooLookupKernel_Multi(
const uint64_t *const keys, uint64_t *const results, const int n,
const uint64_t *const all_keys, const uint64_t *const all_vals,
const int size, const FuncConfig *const hash_func_configs,
const int num_funcs, const int num_buckets, const int pos_width);
__global__ void cuckooBucketKernel_Multi(
uint64_t *const key_buf, uint64_t *const val_buf, const int size,
const uint64_t *const keys, const uint64_t *const vals, const int n,
int *const counters, const int num_buckets) {
// Get thread index.
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// Only threads within range are active.
if (idx < n) {
// Do 1st-level hashing to get bucket id, then do atomic add to get index
// inside the bucket.
uint64_t key = keys[idx];
uint64_t val = vals[idx];
int bucket_num = do_1st_hash(key, num_buckets);
int bucket_ofs = atomicAdd(&counters[bucket_num], 1);
// Directly write the key into the table buffer.
if (bucket_ofs >= BUCKET_SIZE) {
            printf("%d/%d ERROR: bucket overflow! n=%d, bucket_num=%d/%d, key=%llu", bucket_ofs, BUCKET_SIZE, n, bucket_num, num_buckets, (unsigned long long)key);
} else {
key_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = key;
val_buf[bucket_num * BUCKET_SIZE + bucket_ofs] = val;
}
}
}
__global__ void cuckooInsertKernel_Multi(
uint64_t *const key, uint64_t *const val, const uint64_t *const key_buf,
const uint64_t *const val_buf, const int size,
const FuncConfig *const hash_func_configs, const int num_funcs,
const int *const counters, const int num_buckets, const int evict_bound,
const int pos_width, int *const rehash_requests) {
// Create local cuckoo table in shared memory. Size passed in as the third
// kernel parameter.
extern __shared__ uint64_t local_key[];
for (int i = 0; i < num_funcs; ++i) {
local_key[i * BUCKET_SIZE + threadIdx.x] = EMPTY_CELL;
}
// might be useful
__syncthreads();
// Get thread index.
int idx = threadIdx.x + blockIdx.x * blockDim.x;
uint64_t cur_idx = idx;
// Only threads within local bucket range are active.
if (threadIdx.x < counters[blockIdx.x]) {
// Set initial conditions.
uint64_t cur_key = key_buf[cur_idx];
int cur_func = 0;
int evict_count = 0;
// Start the test-kick-and-reinsert loops.
do {
int pos = do_2nd_hash(cur_key, hash_func_configs, cur_func, BUCKET_SIZE);
uint64_t new_data = make_data(cur_idx + 1, cur_func, pos_width);
uint64_t old_idx =
atomicExch(&local_key[cur_func * BUCKET_SIZE + pos], new_data);
if (old_idx != EMPTY_CELL) {
cur_idx = fetch_val(old_idx, pos_width) - 1;
// potential overflow here. It seems that cur_idx < 0 is possible!
cur_key = key_buf[cur_idx];
cur_func = (fetch_func(old_idx, pos_width) + 1) % num_funcs;
evict_count++;
} else {
break;
}
} while (evict_count < num_funcs * evict_bound);
// If exceeds eviction bound, then needs rehashing.
if (evict_count >= num_funcs * evict_bound) {
atomicAdd(rehash_requests, 1);
}
}
// Every thread write its responsible local slot into the global data table.
__syncthreads();
for (int i = 0; i < num_funcs; ++i) {
uint64_t cur_idx = local_key[i * BUCKET_SIZE + threadIdx.x];
if (cur_idx == EMPTY_CELL) {
continue;
}
int cur_func = fetch_func(cur_idx, pos_width);
cur_idx = fetch_val(cur_idx, pos_width) - 1;
key[i * size + idx] = key_buf[cur_idx];
val[i * size + idx] = val_buf[cur_idx];
}
}
__global__ void cuckooLookupKernel_Multi(
const uint64_t *const keys, uint64_t *const results, const int n,
const uint64_t *const all_keys, const uint64_t *const all_vals,
const int size, const FuncConfig *const hash_func_configs,
const int num_funcs, const int num_buckets, const int pos_width) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// Only threads within range are active.
if (idx < n) {
uint64_t key = keys[idx];
int bucket_num = do_1st_hash(key, num_buckets);
for (int i = 0; i < num_funcs; ++i) {
int pos = bucket_num * BUCKET_SIZE +
do_2nd_hash(key, hash_func_configs, i, BUCKET_SIZE);
if (all_keys[i * size + pos] == key) {
results[idx] = all_vals[i * size + pos] + 1;
return;
}
}
// TODO(Haotian): should be a value that will not be encountered.
results[idx] = EMPTY_CELL;
}
}
void CuckooHashTableCuda_Multi::lookup_vals(const uint64_t *const keys,
uint64_t *d_key, uint64_t *d_val,
uint64_t *const results,
const int n) {
// Launch the lookup kernel.
cuckooLookupKernel_Multi<<<ceil((double)n / BUCKET_SIZE), BUCKET_SIZE>>>(
keys, results, n, d_key, d_val, _size, _d_hash_func_configs, _num_funcs,
_num_buckets, _pos_width);
}
int CuckooHashTableCuda_Multi::insert_vals(const uint64_t *const keys,
const uint64_t *const vals,
uint64_t *d_key_buf,
uint64_t *d_val_buf, uint64_t *d_key,
uint64_t *d_val, const int n) {
//
// Phase 1: Distribute keys into buckets.
//
// Allocate GPU memory.
int *d_counters = NULL;
cudaMalloc((void **)&d_counters, _num_buckets * sizeof(int));
cudaMemset(d_counters, 0, _num_buckets * sizeof(int));
// Invoke bucket kernel.
cuckooBucketKernel_Multi<<<ceil((double)n / BUCKET_SIZE), BUCKET_SIZE>>>(
d_key_buf, d_val_buf, _size, keys, vals, n, d_counters, _num_buckets);
//
// Phase 2: Local cuckoo hashing.
//
// Allocate GPU memory.
cudaDeviceSynchronize();
int *d_rehash_requests = NULL;
cudaMalloc((void **)&d_rehash_requests, sizeof(int));
// Copy values onto GPU memory.
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
// Invoke insert kernel. Passes shared memory table size by the third
// argument. Loops until no rehashing needed.
int rehash_count = 0;
do {
int rehash_requests = 0;
cudaMemset(d_rehash_requests, 0, sizeof(int));
cuckooInsertKernel_Multi<<<ceil((double)_size / BUCKET_SIZE), BUCKET_SIZE,
_num_funcs * BUCKET_SIZE * sizeof(uint64_t)>>>(
d_key, d_val, d_key_buf, d_val_buf, _size, _d_hash_func_configs,
_num_funcs, d_counters, _num_buckets, _evict_bound, _pos_width,
d_rehash_requests);
cudaMemcpy(&rehash_requests, d_rehash_requests, sizeof(int),
cudaMemcpyDeviceToHost);
if (rehash_requests == 0) {
break;
} else {
rehash_count++;
gen_hash_funcs();
cudaMemcpy(_d_hash_func_configs, _hash_func_configs,
_num_funcs * sizeof(FuncConfig), cudaMemcpyHostToDevice);
}
} while (rehash_count < MAX_DEPTH);
cudaDeviceSynchronize();
// Free GPU resources.
if (d_counters != NULL) {
cudaFree(d_counters);
}
if (d_rehash_requests != NULL) {
cudaFree(d_rehash_requests);
}
return (rehash_count < MAX_DEPTH) ? rehash_count : ERR_DEPTH;
}
@alias(hash_query, in0)
@alias(hash_target, in1)
@alias(idx_target, in2)
@alias(key_buf, in3)
@alias(val_buf, in4)
@alias(key, in5)
@alias(val, in6)
""",
cuda_src="""
int n = hash_target_shape0;
int n1 = hash_query_shape0;
const int nextPow2 = pow(2, ceil(log2((double)n)));
            // When n is large, the hash values tend to be more evenly distributed and
// choosing table_size to be 2 * nextPow2 typically suffices. For smaller n,
// the effect of uneven distribution of hash values is more pronounced and
// hence we choose table_size to be 4 * nextPow2 to reduce the chance of
// bucket overflow.
int table_size = (n < 2048) ? 4 * nextPow2 : 2 * nextPow2;
if (table_size < 512) {
table_size = 512;
}
int num_funcs = 3;
CuckooHashTableCuda_Multi in_hash_table(table_size, 8 * ceil(log2((double)n)),
num_funcs);
in_hash_table.insert_vals((uint64_t *)(hash_target_p),
(uint64_t *)(idx_target_p),
(uint64_t *)(key_buf_p),
(uint64_t *)(val_buf_p),
(uint64_t *)(key_p),
(uint64_t *)(val_p), n);
in_hash_table.lookup_vals((uint64_t *)(hash_query_p),
(uint64_t *)(key_p),
(uint64_t *)(val_p),
(uint64_t *)(out_p), n1);
"""
)
else:
output = jt.code(queries.shape, jt.int64, [queries, references, indices],
cpu_header="""
#include <cmath>
#include <google/dense_hash_map>
#include <iostream>
#include <vector>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
@alias(hash_query, in0)
@alias(hash_target, in1)
@alias(idx_target, in2)
""",
cpu_src="""
int n = hash_target_shape0;
int n1 = hash_query_shape0;
google::dense_hash_map<int64_t, int64_t> hashmap; // Google Sparse Hash library
hashmap.set_empty_key(0);
for (int idx = 0; idx < n; ++ idx ) {
int64_t key = @hash_target(idx);
int64_t val = @idx_target(idx) + 1;
hashmap.insert(std::make_pair(key, val));
}
#pragma omp parallel for
for (int idx = 0; idx < n1; ++ idx ) {
int64_t key = @hash_query(idx);
auto iter = hashmap.find(key);
if (iter != hashmap.end()) {
@out(idx) = iter->second;
} else @out(idx) = 0;
}
"""
)
output = (output - 1).view(*q_size)
return output
# return indices
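`spquery` looks every query hash up among the reference hashes and returns the matching reference index, or -1 on a miss (a cuckoo hash table on GPU, `google::dense_hash_map` on CPU). A small usage sketch combining it with `sphash`; note the CPU path needs the dense_hash_map header referenced above:
```
import jittor as jt
from JSparse.nn import functional as F

# Two reference coordinates and two queries; the second query has no match.
ref = jt.int32([[0, 0, 0, 0], [0, 1, 1, 1]])
qry = jt.int32([[0, 1, 1, 1], [0, 2, 2, 2]])
idx = F.spquery(F.sphash(qry), F.sphash(ref))
print(idx)   # expected: [1, -1]
```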

@@ -1,114 +0,0 @@
import jittor as jt
from jittor import Function
from JSparse import SparseTensor
__all__ = ['spvoxelize']
class Voxelize(Function):
def execute(
self,
values: jt.Var,
idx_query: jt.Var,
counts: jt.Var
) -> jt.Var:
# N = values_shape0
# c = values_shape1
# N1 = counts_shape0
# out: N1 x c
output = jt.code((counts.shape[0], values.shape[1]), "float32", [values, idx_query, counts],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
""",
cuda_src="""
__global__ void voxelize_forward_kernel(@ARGS_DEF) {
@PRECALC
@alias(values, in0)
@alias(idx_query, in1)
@alias(counts, in2)
int index = blockDim.x * blockIdx.x + threadIdx.x;
int c = values_shape1;
int i = index / c;
int j = index % c;
if (i < values_shape0) {
int pos = @idx_query(i);
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) return;
atomicAdd(&@out(pos, j), @values(i, j) / (float)(@counts(pos)));
}
}
@alias(values, in0)
voxelize_forward_kernel<<< values_shape0, values_shape1 >>>(@ARGS);
""",
cpu_src="""
@alias(values, in0)
@alias(idx_query, in1)
@alias(counts, in2)
for (int i = 0; i < values_shape0; ++ i ) {
int pos = @idx_query(i);
                if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0)
                    continue;
#pragma omp parallel for
for (int j = 0; j < values_shape1; ++ j ) {
#pragma omp atomic
@out(pos, j) += @values(i, j) / (float)@counts(pos);
}
}
"""
)
self.save_vars = idx_query, counts, values.shape[0]
return output
def grad(self, grad_output: jt.Var):
idx_query, counts, input_size = self.save_vars
grad_values = jt.code((input_size, grad_output.shape[1]), jt.float32, [idx_query, counts, grad_output],
cuda_header="""
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
""",
cuda_src="""
__global__ void voxelize_backward_kernel(@ARGS_DEF) {
@PRECALC
@alias(idx_query, in0)
@alias(counts, in1)
@alias(grad_output, in2)
int index = blockDim.x * blockIdx.x + threadIdx.x;
int i = index / grad_output_shape1;
int j = index % grad_output_shape1;
if (i < out_shape0) {
int pos = @idx_query(i);
if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) return;
                    @out(i, j) = @grad_output(pos, j) / (float)@counts(pos);
}
}
voxelize_backward_kernel<<<out_shape0, out_shape1>>>(@ARGS);
""",
cpu_src="""
@alias(idx_query, in0)
@alias(counts, in1)
@alias(grad_output, in2)
for (int i = 0; i < out_shape0; ++ i ) {
int pos = @idx_query(i);
                if (pos < 0 || pos >= counts_shape0 || @counts(pos) == 0) continue;
#pragma omp parallel for
for (int j = 0; j < grad_output_shape1; ++ j ) {
@out(i, j) = @grad_output(pos, j) / (float)@counts(pos);
}
}
"""
)
return grad_values, None, None
def spvoxelize(
values: jt.Var,
idx_query: jt.Var,
counts: jt.Var
) -> jt.Var:
return Voxelize.apply(values, idx_query, counts)
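`spvoxelize` scatter-averages point features into voxels: output row `p` is the mean of all values whose `idx_query` entry equals `p`. A tiny sketch:
```
import jittor as jt
from JSparse.nn import functional as F

values = jt.Var([[1.0], [3.0], [5.0]])   # three points, one channel
idx_query = jt.int32([0, 0, 1])          # points 0 and 1 share voxel 0
counts = jt.int32([2, 1])                # points per voxel
print(F.spvoxelize(values, idx_query, counts))   # [[2.], [5.]]
```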

@@ -1,6 +0,0 @@
from .activation import *
# from .bev import *
from .conv import *
# from .crop import *
from .norm import *
from .pooling import *

@@ -1,21 +0,0 @@
import jittor as jt
from jittor import nn
from JSparse import SparseTensor
from JSparse.nn.functional import relu, leaky_relu
# from nn.utils import fapply
__all__ = ['ReLU', 'LeakyReLU']
# class ReLU(nn.ReLU):
# def execute(self, input: SparseTensor) -> SparseTensor:
# return fapply(input, super().execute)
# class LeakyReLU(nn.LeakyReLU):
# def execute(self, input: SparseTensor) -> SparseTensor:
# return fapply(input, super().execute)
Relu = jt.make_module(relu)
ReLU = Relu
Leaky_relu = jt.make_module(leaky_relu, 2)
LeakyReLU = Leaky_relu

@@ -1,85 +0,0 @@
import math
from typing import List, Tuple, Union
import numpy as np
import jittor as jt
from jittor import nn
from jittor import init
from jittor.misc import _pair, _triple
from JSparse import SparseTensor
from JSparse.nn import functional as F
# from utils import make_ntuple
__all__ = ['Conv3d']
class Conv3d(nn.Module):
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, ...]] = 3,
stride: Union[int, Tuple[int, ...]] = 1,
dilation: int = 1,
groups: int = 1,
bias: bool = False,
transposed: bool = False) -> None:
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
# self.kernel_size = make_ntuple(kernel_size, ndim=3)
# self.stride = make_ntuple(stride, ndim=3)
# self.dilation = dilation
self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size, kernel_size)
self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation, dilation)
self.groups = groups
assert in_channels % groups == 0, 'in_channels must be divisible by groups'
assert out_channels % groups == 0, 'out_channels must be divisible by groups'
self.transposed = transposed
self.kernel_volume = int(np.prod(self.kernel_size))
# if self.kernel_volume > 1:
# self.kernel = nn.Parameter(
# jt.zeros(self.kernel_volume, in_channels, out_channels))
# else:
# self.kernel = nn.Parameter(jt.zeros(in_channels, out_channels))
# if bias:
# self.bias = nn.Parameter(jt.zeros(out_channels))
# else:
# self.register_parameter('bias', None)
# self.reset_parameters()
fan = (self.out_channels if self.transposed else self.in_channels) * self.kernel_volume
std = 1 / math.sqrt(fan)
if self.kernel_volume > 1:
self.weight = init.uniform([self.kernel_volume, in_channels, out_channels], 'float32', -std, std)
else:
self.weight = init.uniform([in_channels, out_channels], 'float32')
if bias:
self.bias = init.uniform([out_channels], "float32", -std, std)
else:
self.bias = None
# self.reset_parameters()
def execute(self, input: SparseTensor) -> SparseTensor:
return F.conv3d(input,
weight=self.weight,
kernel_size=self.kernel_size,
bias=self.bias,
stride=self.stride,
dilation=self.dilation,
groups=self.groups,
transposed=self.transposed)
# def set_parameters(self) -> None:
# std = 1 / math.sqrt(
# (self.out_channels if self.transposed else self.in_channels)
# * self.kernel_volume)
# self.weight *= std
# if self.bias is not None:
# self.bias *= std

@@ -1,34 +0,0 @@
import jittor as jt
from jittor import nn
from JSparse import SparseTensor
from JSparse.nn.utils import fapply
__all__ = ['BatchNorm', 'GroupNorm']
class BatchNorm(nn.BatchNorm):
def execute(self, input: SparseTensor) -> SparseTensor:
return fapply(input, super().execute)
class GroupNorm(nn.GroupNorm):
def execute(self, input: SparseTensor) -> SparseTensor:
indices, values, stride, size = input.indices, input.values, input.stride, input.size
batch_size = jt.max(indices[:, 0]).item() + 1
num_channels = values.shape[1]
n_values = jt.zeros_like(values)
for k in range(batch_size):
idx = indices[:, 0] == k
b_values = values[idx]
b_values = b_values.t().reshape(1, num_channels, -1)
b_values = super().execute(b_values)
b_values = b_values.reshape(num_channels, -1).t()
n_values[idx] = b_values
output = SparseTensor(indices=indices, values=n_values, stride=stride, size=size)
output.cmaps = input.cmaps
output.kmaps = input.kmaps
return output

@@ -1,11 +0,0 @@
import jittor as jt
from jittor import nn
from JSparse import SparseTensor
from JSparse.nn.functional import global_avg_pool, global_max_pool
__all__ = ['GlobalAvgPool', 'GlobalMaxPool']
GlobalAvgPool = jt.make_module(global_avg_pool)
GlobalMaxPool = jt.make_module(global_max_pool)

@@ -1,2 +0,0 @@
from .apply import *
from .kernel import *

@@ -1,15 +0,0 @@
from typing import Callable
import jittor as jt
from JSparse import SparseTensor
__all__ = ['fapply']
def fapply(input: SparseTensor, fn: Callable[..., jt.Var], *args,
**kwargs) -> SparseTensor:
values = fn(input.values, *args, **kwargs)
output = SparseTensor(indices=input.indices, values=values, stride=input.stride, size=input.size)
output.cmaps = input.cmaps
output.kmaps = input.kmaps
return output
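`fapply` lifts a per-value Jittor op to a `SparseTensor`, reusing the indices and the cached coordinate/kernel maps. A hypothetical helper built on it (not part of the library):
```
from JSparse import SparseTensor
from JSparse.nn.utils import fapply

def scale_values(x: SparseTensor, factor: float = 2.0) -> SparseTensor:
    # Scales the stored values; indices, cmaps and kmaps are reused as-is.
    return fapply(x, lambda v: v * factor)
```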

@@ -1,28 +0,0 @@
from typing import Tuple, Union
import numpy as np
import jittor as jt
from JSparse.utils import make_ntuple, trunc
__all__ = ['get_kernel_offsets']
def get_kernel_offsets(kernel_size: Union[int, Tuple[int, ...]],
stride: Union[int, Tuple[int, ...]] = 1,
dilation: Union[int, Tuple[int, ...]] = 1) -> jt.Var:
kernel_size = make_ntuple(kernel_size, ndim=3)
stride = make_ntuple(stride, ndim=3)
dilation = make_ntuple(dilation, ndim=3)
offsets = [(np.arange(-kernel_size[k] // 2 + 1, kernel_size[k] // 2 + 1) * stride[k]
* dilation[k]) for k in range(3)]
if np.prod(kernel_size) % 2 == 1:
offsets = [[x, y, z] for z in offsets[2] for y in offsets[1]
for x in offsets[0]]
else:
offsets = [[x, y, z] for x in offsets[0] for y in offsets[1]
for z in offsets[2]]
offsets = jt.int32(offsets)
return offsets
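For an odd kernel the offsets are centred around zero, e.g. `kernel_size=3` gives the 27 offsets of `{-1, 0, 1}^3` scaled by `stride * dilation`. A quick check:
```
from JSparse.nn.utils import get_kernel_offsets

offsets = get_kernel_offsets(kernel_size=3, stride=1, dilation=1)
print(offsets.shape)   # [27, 3]
print(offsets[:3])     # [[-1,-1,-1], [0,-1,-1], [1,-1,-1]]; x varies fastest
```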

@@ -1,125 +0,0 @@
from itertools import count
import numpy as np
import jittor as jt
from jittor.misc import _pair, _triple
from typing import Any, Dict, Tuple, Union
from type_check import type_check
from JSparse.utils import make_ntuple, sparse_quantize, set_hash
# from .utils.quantize import sparse_quantize
# from indice_manager import IndiceManager
class SparseTensor:
@type_check
def __init__(
self,
indices: jt.Var,
values: jt.Var,
stride: Union[int, Tuple[int, ...]],
size=None,
quantize=True,
voxel_size=1,
coalesce_mode:str='sum',
indice_manager=None,
device=None,
):
assert (values.ndim == 2)
# self.indices = indices
# self.values = values
self.size = size
self.ndim = indices.shape[1] - 1
        self.stride = _triple(stride)
self.voxel_size = voxel_size
self.coalesce_mode = coalesce_mode
self.cmaps = {}
self.kmaps = {}
##########################
# Setup CoordsManager
##########################
# if indice_manager is None:
# self.indice_manager = IndiceManager(
# ndim=self.ndim,
# )
##########################
# Initialize coords
##########################
if quantize:
self.seed = 1
for i in range(len(self.stride)):
self.seed += i
self.seed *= self.stride[i]
self.hash_multiplier = set_hash(self.ndim, self.seed)
self.hash_num, self.indices, mapping, inverse_mapping, count = sparse_quantize(indices, self.hash_multiplier, self.voxel_size, return_index=True, return_inverse=True, return_count=True)
self.inverse_mapping = inverse_mapping
if len(values.shape) == 1:
out_size = (self.indices.shape[0], )
elif len(values.shape) == 2:
out_size = (self.indices.shape[0], values.shape[-1])
if self.coalesce_mode == 'sum':
out_size = (self.indices.shape[0], values.shape[-1])
self.values = jt.zeros(out_size, dtype=values.dtype).scatter_(0, inverse_mapping, values, reduce='add')
elif self.coalesce_mode == 'average':
out_size = (self.indices.shape[0], values.shape[-1])
self.values = jt.zeros(out_size, dtype=values.dtype).scatter_(0, inverse_mapping, values, reduce='add')
self.values /= count
elif self.coalesce_mode == 'sample':
self.values = values[self.indices]
else:
self.indices = indices
self.values = values
# if indice_manager is None:
# # TODO If set to share the indices man, use the global indices man
# # init the indices
# indice_manager = Indice
def _indices(self):
return self.indices
def _values(self):
return self.values
def _binary_operation(self, other, _binary_op):
assert isinstance(other, self.__class__)
return
# TODO set up the indices dict
        # so that we do not need to merge the indice group
# which has already been merged
# if the indices of self and other should be merged
class PointTensor:
def __init__(self, values, indices, idx_query=None, weights=None):
self.values = values
self.indices = indices
self.idx_query = idx_query if idx_query is not None else {}
self.weights = weights if weights is not None else {}
self.additional_values = {}
self.additional_values['idx_query'] = {}
self.additional_values['counts'] = {}
def detach(self):
self.values = self.values.detach()
self.indices = self.indices.detach()
return self
def __add__(self, other):
pt = PointTensor(self.values + other.values, self.indices, self.idx_query,
self.weights)
pt.additional_values = self.additional_values
return pt

@@ -1,42 +0,0 @@
import jittor as jt
from jittor import Function
def spmm(
rows: jt.Var,
cols: jt.Var,
vals: jt.Var,
size: jt.NanoVector,
mat: jt.Var,
spmm_mode='scatter',
is_sorted: bool = False,
cuda_spmm_alg: int = 1,
) -> jt.Var:
assert len(rows) == len(cols), "Invalid length"
assert len(rows) == len(vals), "Invalid length"
assert vals.dtype == mat.dtype, "dtype mismatch"
if jt.flags.use_cuda > 1:
assert jt.has_cuda == 1, "No GPUs available"
rows = rows.int()
cols = cols.int()
'''
TODO: Using the coo_spmm of cuSPARSE on GPU
result = coo_spmm_int32(
rows, cols, vals, size[0], size[1], mat, cuda_spmm_alg, is_sorted
)
'''
else:
if (spmm_mode == 'scatter'):
class SPMM(Function):

@@ -1,2 +0,0 @@
from .quantize import *
from .utils import *

@@ -1,58 +0,0 @@
from itertools import repeat
from typing import List, Tuple, Union
import jittor as jt
import numpy as np
from .utils import unique1d
__all__ = ['sparse_quantize', 'set_hash']
def set_hash(ndim, seed, low=100, high=1000):
jt.set_seed(seed)
return jt.randint(low, high, shape=(ndim + 1,), dtype='uint64')
def hash(x: np.ndarray, multiplier: np.ndarray) -> jt.Var:
assert x.ndim == 2, x.shape
x = x - x.min(dim=0)
x = x.uint64()
h = jt.zeros(x.shape[0], dtype='uint64')
for k in range(x.shape[1] - 1):
h += x[:, k]
h *= multiplier[k]
h += x[:, -1]
return h
def sparse_quantize(indices,
hash_multiplier,
voxel_size: Union[float, Tuple[float, ...]] = 1,
*,
return_index: bool = False,
return_inverse: bool = False,
return_count: bool = False) -> List[np.ndarray]:
if indices.dtype.is_int() and voxel_size == 1:
pass
else:
if isinstance(voxel_size, (float, int)):
voxel_size = tuple(repeat(voxel_size, 3))
assert isinstance(voxel_size, tuple) and len(voxel_size) == 3
voxel_size = jt.Var(voxel_size)
indices[:, 1:] /= voxel_size
indices = jt.floor(indices).astype(jt.int32)
hash_num, mapping, inverse_mapping, count = unique1d(hash(indices, hash_multiplier))
indices = indices[mapping]
outputs = [hash_num, indices]
if return_index:
outputs += [mapping]
if return_inverse:
outputs += [inverse_mapping]
if return_count:
outputs += [count]
return outputs[0] if len(outputs) == 1 else outputs

@@ -1,43 +0,0 @@
from itertools import repeat
from typing import List, Tuple, Union
import jittor as jt
__all__ = ['make_ntuple', 'trunc', 'unique1d']
def make_ntuple(x: Union[int, List[int], Tuple[int, ...], jt.Var],
ndim: int) -> Tuple[int, ...]:
if isinstance(x, int):
x = tuple(repeat(x, ndim))
elif isinstance(x, list):
x = tuple(x)
elif isinstance(x, jt.Var):
x = tuple(x.view(-1).numpy().tolist())
assert isinstance(x, tuple) and len(x) == ndim, x
return x
def trunc(x: jt.Var):
if x >= 0:
return jt.floor(x)
else:
return jt.ceil(x)
def unique1d(var):
assert len(var.shape) == 1
perm, aux = jt.argsort(var)
mask = jt.empty(aux.shape, dtype='bool')
mask[:1] = True
mask[1:] = aux[1:] != aux[:-1]
ret = (aux[mask],)
ret += (perm[mask],)
imask = jt.cumsum(mask.astype(perm.dtype)) - 1
inv_idx = jt.empty(mask.shape, dtype=perm.dtype)
inv_idx[perm] = imask
ret += (inv_idx,)
idx = jt.concat([jt.nonzero(mask).view(-1), jt.Var(mask.shape[0])])
ret += (idx[1:] - idx[:-1],)
return ret
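`unique1d` mirrors `np.unique(..., return_index=True, return_inverse=True, return_counts=True)` for 1-D Vars. A short sketch of the four return values:
```
import jittor as jt
from JSparse.utils import unique1d

vals, index, inverse, counts = unique1d(jt.Var([3, 1, 3, 2, 1]))
# vals    -> [1, 2, 3]   unique values in sorted order
# index   -> position of one occurrence of each unique value in the input
# inverse -> for every input element, its slot in vals
# counts  -> [2, 1, 2]
```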

@@ -1,15 +0,0 @@
from setuptools import find_packages, setup
with open("../README.md", "r") as file:
description = file.read()
print(find_packages())
setup(
name='JSparse',
version='0.1',
description=description,
packages=find_packages(),
install_requires=["jittor", "type_check"]
)

@@ -1,96 +0,0 @@
import time
import math
import numpy as np
import torch
import jittor as jt
import jittor.nn as nn
from jittor import init
from jittor.misc import _pair, _triple
from itertools import repeat
from typing import List, Tuple, Union
from JSparse import SparseTensor
from JSparse import PointTensor
from JSparse.utils import make_ntuple
from JSparse.nn import functional as F
from JSparse.nn.utils import get_kernel_offsets
from JSparse.nn.functional import Convolution
import torchsparse
from torchsparse import nn as spnn
jt.flags.use_cuda = 1
in_channels = 3
out_channels = 64
kernel_size = 3
stride = 1
dilation = 1
groups = 1
bias = False
transposed = False
kernel_size = _triple(kernel_size)
stride = _triple(stride)
dilation = _triple(dilation)
kernel_volume = int(np.prod(kernel_size))
N = 10
coords = np.random.uniform(0, 10, size=(N, 4))
feats = np.random.randn(N, 3)
labels = np.random.choice(5, N)
print(coords.shape)
print(feats.shape)
coo = jt.Var(coords)
val = jt.Var(feats)
size = (10, 10, 10)
fan = (out_channels if transposed else in_channels) * kernel_volume
std = 1 / math.sqrt(fan)
if kernel_volume > 1:
weight = init.uniform([kernel_volume, in_channels, out_channels], 'float32', -std, std)
else:
weight = init.uniform([in_channels, out_channels], 'float32')
if bias:
bias = init.uniform([out_channels], "float32", -std, std)
else:
bias = None
x = SparseTensor(coo, val, 1, size)
z = PointTensor(x.values, x.indices.float())
pc_hash = F.sphash(
jt.concat([
z.indices[:, 0].int().view(-1, 1),
jt.floor(z.indices[:, 1:] / x.stride[0]).int() * x.stride[0]
], 1))
sparse_hash = F.sphash(x.indices)
idx_query = F.spquery(pc_hash, sparse_hash).int()
counts = F.spcount(idx_query, x.indices.shape[0])
z.additional_values['idx_query'][x.stride] = idx_query
z.additional_values['counts'][x.stride] = counts
inserted_values = F.spvoxelize(z.values, idx_query, counts)
new_tensor = SparseTensor(inserted_values, x.indices, x.stride, x.size, False)
new_tensor.cmaps = x.cmaps
new_tensor.kmaps = x.kmaps
print(inserted_values)
offsets = get_kernel_offsets(kernel_size=2, stride=x.stride, dilation=1)
old_hash = F.sphash(
jt.concat([
z.indices[:, 0].int().view(-1, 1),
jt.floor(z.indices[:, 1:] / x.stride[0]).int() * x.stride[0]
], 1), offsets)
pc_hash = F.sphash(x.indices)
idx_query = F.spquery(old_hash, pc_hash).int()
weights = F.calc_ti_weights(z.indices, idx_query,
scale=x.stride[0]).t()
idx_query = idx_query.t()
new_values = F.spdevoxelize(x.values, idx_query, weights)
print(jt.grad(new_values, x.values))