forked from liucheng/DeepBurning-MixQ
update readme, add debug on cifar
This commit is contained in:
parent
87f943e14a
commit
0b983a1223
|
@ -9,3 +9,4 @@ results
|
|||
!*/hls/config_simd_pe.txt
|
||||
localconfig.py
|
||||
/*.txt
|
||||
_logs
|
||||
|
|
|
@ -252,7 +252,11 @@ class QuantActivLinear(nn.Module):
|
|||
tmp = torch.tensor(input.shape[1] * 1e-3, dtype=torch.float)
|
||||
self.memory_size.copy_(tmp)
|
||||
out = self.activ(input)
|
||||
## print('ii',input[0,0,:,0]/self.activ.step)
|
||||
## print('lineari', torch.round(out[0,:]/self.activ.step).int())
|
||||
## wstd = self.linear.weight.std()
|
||||
out = self.linear(out)
|
||||
## print('linearo', torch.round(out[0,:]/(self.activ.step*self.linear.step*wstd)).int())
|
||||
return out
|
||||
|
||||
|
||||
|
|
|
@ -98,7 +98,7 @@ def extract_model(in_shape):
|
|||
assert sub_module.bias is None, 'inner conv has no bias in this model'
|
||||
if isinstance(sub_module, QuantConv2d): # New quant
|
||||
conv_cur.wbit = sub_module.bit
|
||||
conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step becuause of alpha
|
||||
conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantConv2d.step because of alpha
|
||||
else:
|
||||
raise NotImplementedError(sub_module)
|
||||
print(', ich {ich}, och {och}, irow {irow}, icol {icol}, ksp {k}{s}{p}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur)))
|
||||
|
@ -124,7 +124,7 @@ def extract_model(in_shape):
|
|||
|
||||
if isinstance(sub_module, QuantLinear): # New quant
|
||||
conv_cur.wbit = sub_module.bit
|
||||
conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantLinear.step becuause of alpha
|
||||
conv_cur.w, conv_cur.wstep = sub_module.export_quant() # wstep is not QuantLinear.step because of alpha
|
||||
|
||||
print(', ich {ich}, och {och}, wbit {wbit}, wstep {wstep}'.format(**vars(conv_cur)))
|
||||
|
||||
|
@ -388,7 +388,7 @@ if __name__=='__main__':
|
|||
# load model and state_dict
|
||||
ptfile:Dict = torch.load('weights/' + opt.weight + '.pt', map_location='cpu')
|
||||
model = getattr(models, opt.model)(**ptfile.setdefault('model_params', {}))
|
||||
model.load_state_dict(ptfile['model'])
|
||||
model.load_state_dict(ptfile['model'], strict = False)
|
||||
|
||||
# processs
|
||||
model_param = extract_model([1, 32, 32])
|
||||
|
|
|
@ -13,8 +13,8 @@ class QConvLayer:
|
|||
self.conv = conv_param
|
||||
self.w = torch.tensor(self.conv.w, dtype = torch.int64)
|
||||
|
||||
def __call__(self, x: torch.Tensor):
|
||||
if self.conv.icol < x.shape[-1]: # maxpool
|
||||
def __call__(self, x: torch.Tensor, downsampling):
|
||||
if self.conv.icol < x.shape[-1]: # Maxpool. Note: Order of Maxpool and BN is IMPORTANT when BN.inc can be negative
|
||||
assert self.conv.irow*2, self.conv.icol*2 == x.shape[2:]
|
||||
x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64)
|
||||
|
||||
|
@ -27,6 +27,8 @@ class QConvLayer:
|
|||
|
||||
x = F.conv2d(x, self.w, bias=None, stride=self.conv.s, padding=self.conv.p) # [N, OCH, OROW, OCOL]
|
||||
# print('convo', self.conv.n, x[0,0,:,:])
|
||||
#if downsampling: # Maxpool
|
||||
# x = F.max_pool2d(x.float(), kernel_size = 2, stride = 2).to(dtype=torch.int64)
|
||||
och = x.shape[1]
|
||||
if True:
|
||||
if self.conv.inc is not None:
|
||||
|
@ -48,13 +50,11 @@ class QConvLayer:
|
|||
if hasattr(self.conv, 'bias'):
|
||||
bias_ch = self.conv.bias_raw.reshape((1, och, 1, 1))
|
||||
x += bias_ch
|
||||
|
||||
# print('biaso', self.conv.n, x[0,0,:,:])
|
||||
x = torch.round(x).to(dtype = torch.int64)
|
||||
|
||||
if hasattr(self.conv, 'obit'):
|
||||
x.clip_(0, 2**(self.conv.obit)-1)
|
||||
|
||||
return x
|
||||
|
||||
class HWModel:
|
||||
|
@ -69,7 +69,7 @@ class HWModel:
|
|||
x=x>>(8-self.layers[0].conv.abit)
|
||||
|
||||
for i, layer in enumerate(self.layers):
|
||||
x = layer(x)
|
||||
x = layer(x, self.layers[i+1].conv.icol<layer.conv.icol if i+1<len(self.layers) else False)
|
||||
|
||||
x = x.float() / self.layers[-1].conv.div
|
||||
return x
|
||||
|
|
|
@ -68,7 +68,7 @@ if __name__ == '__main__':
|
|||
ptfile: Dict = torch.load('weights/' + opt.weight+'.pt', map_location=device)
|
||||
model_params = ptfile.setdefault('model_params', {})
|
||||
model = getattr(models, opt.model)(**model_params).to(device)
|
||||
model.load_state_dict(ptfile['model'])
|
||||
model.load_state_dict(ptfile['model'], strict = False)
|
||||
|
||||
# Test
|
||||
res = test(model, device, batch_size=opt.batch_size, num_batch=opt.num_batch)
|
||||
|
|
136
readme.md
136
readme.md
|
@ -1,16 +1,126 @@
|
|||
## train
|
||||
- python train.py --multi-scale --img-size 320 --multi-scale --batch-size 32
|
||||
# AnyPackingNet
|
||||
|
||||
## reference
|
||||
- https://github.com/ultralytics/yolov3.git
|
||||
This is the training model source code of the paper:
|
||||
|
||||
## 22修改
|
||||
- python train22.py 训练非量化模型时不用加其他参数
|
||||
- train22.py 80行改使用的模型,初始训练使用UltraNet_Float或RepVGG
|
||||
- QAT时把模型改为量化模型例如UltraNet_ismart,并且用--weights载入全精度权重
|
||||
- 测试精度使用python test.py --weights <权重路径> --model <模型类名称>
|
||||
Mixed-precision Neural Network Quantization and
|
||||
Implementation Co-optimization for FPGAs
|
||||
|
||||
### 注意
|
||||
- 余弦学习率衰减最好跑满全部epoch
|
||||
- RepVGG全精度训练后量化训练精度更好
|
||||
- quant_dorefa.py 第18行改权重量化为[-8,7]还是[-7,7]
|
||||
## Classification Model
|
||||
|
||||
### Usage
|
||||
```bash
|
||||
cd cifar/
|
||||
|
||||
# 1. Hardware-aware Mixed Precision NAS
|
||||
python search_train.py --cd 3e-5 --name mix_vggtiny_cifar_cd3e5
|
||||
# Params:
|
||||
# --cd Stands for complexity decay
|
||||
# --name Stands for checkpoint .pt and .log filename
|
||||
# --model Mixed precision supernet model, default is `VGGtiny_MixQ`
|
||||
# Then, the optimal bit width of each layer will converge after dozens of epochs, for example bitw={8,2,2,2,2,2}, bita = {8,3,3,3,6,3}
|
||||
|
||||
|
||||
# 2. Main train
|
||||
python main_train.py --bitw 822222 --bita 833363 --name vggtiny_cifar_cd3e5
|
||||
# Trained weights are under weights/tiny_cifar_cd3e5.pt
|
||||
|
||||
|
||||
# 3. Test model
|
||||
python test_acc.py
|
||||
# You can choose tiny_cifar_cd3e5.pt for the test if nothing went wrong
|
||||
|
||||
# 4. HLS code generation:
|
||||
# The HLS configuration header and weight file can now be exported directly from the .pt weight file.
|
||||
# First, adjust the `simd, pe` parallelization factors of each layer.
|
||||
vim hls/config_simd_pe.txt
|
||||
# Export `config.h` and `weights.hpp` to /hls/tiny_cifar_cd3e5/
|
||||
python export_hls.py
|
||||
|
||||
|
||||
# 5. Model-Level Hardware Simulation
|
||||
# simulate_hls.py requires /hls/tiny_cifar_cd3e5/model_param.pkl file generated by export_hls.py
|
||||
python simulate_hls.py
|
||||
# This output should be consistent with the hardware output or the HLS C-level simulation
|
||||
|
||||
```
|
||||
|
||||
## DAC-SDC Object Detection Model
|
||||
|
||||
The DAC System Design Contest focused on low-power object detection on an embedded FPGA system: https://www.dac.com/Conference/System-Design-Contest.
|
||||
|
||||
The goal of this contest is to optimize the performance of designs in terms of accuracy and power on an Ultra96 v2 FPGA board. The contest was held five times, from 2018 to 2022, and the performance of the best design over these years increased from 30 fps to thousands of fps.
|
||||
|
||||
Base models for anypacking bitwidth search:
|
||||
|
||||
- UltraNet: https://github.com/heheda365/ultra_net by the BJUT_runner team, 1st place of the 2020 DAC-SDC contest. UltraNet is a VGGNet-like model with far fewer parameters. UltraNet_iSmart is the 2nd-place design of the 2021 DAC-SDC contest by the UIUC iSmart team, which achieves much better throughput through fixed packing optimization.
|
||||
- UltraNet_Bypass: https://github.com/heymesut/SJTU_microe 21' SJTU, 3rd place of the 2021 DAC-SDC contest. A variant of UltraNet with bypass connections. Bypass connections increase model accuracy, but make the design of a pipeline-based NN accelerator more difficult.
|
||||
- SkyNet: https://github.com/jiangwx/SkrSkr SkrSkr by SHTECH, 1st place of 2021 DAC-SDC contest. SkyNet is a MobileNet-like lightweight model.
|
||||
- SkyNetk5: SkyNet with a 5x5 depthwise convolution kernel. Since dwconv requires far fewer computations than pwconv, a larger kernel brings higher accuracy at a slight cost.
|
||||
|
||||
Dataset: See https://byuccl.github.io/dac_sdc_2022/info/.
|
||||
|
||||
**Usage**: First `cd dacsdc/`, then follow next steps.
|
||||
|
||||
### 1) Hardware-aware Mixed Precision NAS for bit width
|
||||
|
||||
```bash
|
||||
# For UltraNet with mixed precision:
|
||||
python search_train.py --cd 1e-5 --name mix_ultranet_cd1e5
|
||||
|
||||
# UltraNet with Bypass:
|
||||
python search_train.py --cd 1e-5 --name mix_ultranet_bypass_cd1e5 --model UltraNetBypass_MixQ
|
||||
|
||||
# SkyNet/SkyNetk5
|
||||
python search_train.py --cd 1e-5 --name mix_skynet_cd1e5 --model [SkyNet_MixQ | SkyNetk5_MixQ]
|
||||
```
|
||||
|
||||
### 2) Main Train
|
||||
|
||||
For UltraNet:
|
||||
```bash
|
||||
# UltraNet_BJTU uses full 4-bit weight quantization
|
||||
python main_train.py --bitw 444444444 --bita 844444444 --name ultranet_BJTU
|
||||
|
||||
# UltraNet_iSmart uses 4-8 bit mixed quantization for weights
|
||||
python main_train.py --bitw 844444448 --bita 844444444 --name ultranet_iSmart
|
||||
|
||||
# Or use searched bitw, bita from search_train.py
|
||||
python main_train.py --bitw <bitw> --bita <bita> --name ultranet_anypacking
|
||||
```
|
||||
For UltraNet_Bypass/SkyNet/SkyNetk5
|
||||
```bash
|
||||
python main_train.py --bitw <bitw> --bita <bita> --name <ckptname> --model [UltraNet_Bypass | SkyNet | SkyNetk5]
|
||||
```
|
||||
|
||||
### 3) Test model
|
||||
|
||||
```bash
|
||||
python test.py [--model [UltraNet_Bypass_FixQ | SkyNet_FixQ | SkyNetk5_FixQ]]
|
||||
```
|
||||
|
||||
### 4) HLS export
|
||||
```bash
|
||||
# For Ultranet or Ultranet_Bypass
|
||||
python export_hls.py [--model UltraNet_Bypass_FixQ]
|
||||
# For SkyNet or SkyNetk5
|
||||
python export_hls.py [--model SkyNetk5_FixQ]
|
||||
```
|
||||
|
||||
### 5) Model-Level Hardware Simulation
|
||||
```bash
|
||||
python simulate_hls.py [--model [UltraNet_Bypass_FixQ | SkyNet_FixQ | SkyNetk5_FixQ]]
|
||||
```
|
||||
|
||||
## Reference
|
||||
- https://github.com/zhaoweicai/EdMIPS EdMIPS: Rethinking Differentiable Search for Mixed-Precision Neural Networks
|
||||
- https://github.com/kuangliu/pytorch-cifar Smaller models for cifar dataset
|
||||
- https://github.com/ultralytics/yolov3.git yolov3 training framework
|
||||
- https://github.com/jiangwx/SkyNet SkyNet by SHTECH, winner of 2019 DAC-SDC contest
|
||||
- https://github.com/jgoeders/dac_sdc_2020_designs Winner designs of 2020 DAC-SDC contest
|
||||
- https://github.com/heheda365/ultra_net BJUT_runner team, 1st place of 2020 DAC-SDC contest, UltraNet
|
||||
- https://github.com/jgoeders/dac_sdc_2021_designs Winner designs of 2021 DAC-SDC contest
|
||||
- https://github.com/jiangwx/SkrSkr SkrSkr by SHTECH, 1st place of 2021 DAC-SDC contest, SkyNet
|
||||
- https://github.com/xliu0709/DACSDC2021 iSmart team, 2nd place of 2021 DAC-SDC design, UltraNet with optimized packing method
|
||||
- https://github.com/heymesut/SJTU_microe 3rd place of 2021 DAC-SDC design by SJTU, a variant of UltraNet with bypass
|
||||
- https://github.com/jgoeders/dac_sdc_2022_designs Winner designs of 2022 DAC-SDC contest
|
||||
- https://github.com/MatthewLuo7/InvolutionNet 3rd place of 2022 DAC-SDC design (ours), without anypacking design
|
||||
|
|
|
@ -93,3 +93,19 @@ def load_classifier(name='resnet101', n=2):
|
|||
model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters))
|
||||
model.last_linear.out_features = n
|
||||
return model
|
||||
|
||||
# Running index of the next debug dump; incremented once per loglayer() call.
log_layerid = 0


def loglayer(x):
    """Dump an integer tensor to ``_logs/test<N>.txt`` for layer-by-layer debugging.

    ``<N>`` is the module-level counter ``log_layerid``, which is incremented
    after every call so successive calls write successive files.

    Args:
        x: a tensor with an integer dtype, indexed as ``x[i, j, k]`` (three
            dimensions). Each index ``i`` is written as a ``C <i>`` header
            line followed by one comma-separated text line per ``j``.

    Raises:
        AssertionError: if ``x`` does not have an integer dtype.
    """
    global log_layerid
    import os

    import numpy as np

    # detach()/cpu() so tensors that track gradients or live on an accelerator
    # can be converted; a plain ``x.numpy()`` raises for both.
    x = x.detach().cpu().numpy()
    # ``np.int`` was removed in NumPy 1.24; accept any integer dtype instead.
    assert np.issubdtype(x.dtype, np.integer), x.dtype

    # Create the dump directory on first use instead of failing in open().
    os.makedirs('_logs', exist_ok=True)
    with open('_logs/test%d.txt' % log_layerid, 'w') as f:
        for i, plane in enumerate(x):
            print('C', i, file=f)
            for row in plane:
                for value in row:
                    print('%3d' % value, end=',', file=f)
                # Terminate the text line for this row.
                print(file=f)

    log_layerid += 1
|
||||
|
|
Loading…
Reference in New Issue