# mmpose/configs/body_2d_keypoint/rtmo/body7/rtmo-t_8xb32-600e_body7-416...
#
# 530 lines
# 13 KiB
# Python

_base_ = ['../../../_base_/default_runtime.py']

# runtime
train_cfg = dict(
    max_epochs=600,
    val_interval=20,
    # Single (milestone, interval) pair. Presumably switches validation to
    # every epoch from epoch 580 on -- confirm against the training loop docs.
    dynamic_intervals=[(580, 1)],
)
auto_scale_lr = dict(base_batch_size=256)
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        interval=40,
        max_keep_ckpts=3,
    ),
)
# Optimizer wrapper: AdamW with gradient clipping, built through the
# 'ForceDefaultOptimWrapperConstructor' constructor.
optim_wrapper = dict(
    type='OptimWrapper',
    constructor='ForceDefaultOptimWrapperConstructor',
    optimizer=dict(type='AdamW', lr=0.004, weight_decay=0.05),
    paramwise_cfg=dict(
        norm_decay_mult=0,
        bias_decay_mult=0,
        bypass_duplicate=True,
        force_default_settings=True,
        # Per-module override keyed by parameter-name prefix.
        custom_keys={'neck.encoder': dict(lr_mult=0.05)},
    ),
    clip_grad=dict(max_norm=0.1, norm_type=2),
)
# Learning-rate schedule, listed in epoch order.
param_scheduler = [
    # epochs 0-5: quadratic warm-up
    dict(
        type='QuadraticWarmupLR',
        by_epoch=True,
        begin=0,
        end=5,
        convert_to_iter_based=True,
    ),
    # epochs 5-280: cosine annealing down to eta_min
    dict(
        type='CosineAnnealingLR',
        by_epoch=True,
        begin=5,
        end=280,
        T_max=280,
        eta_min=0.0002,
        convert_to_iter_based=True,
    ),
    # this scheduler is used to increase the lr from 2e-4 to 5e-4
    dict(type='ConstantLR', by_epoch=True, begin=280, end=281, factor=2.5),
    # epochs 281-580: second cosine annealing phase
    dict(
        type='CosineAnnealingLR',
        by_epoch=True,
        begin=281,
        end=580,
        T_max=300,
        eta_min=0.0002,
        convert_to_iter_based=True,
    ),
    # epochs 580-600: constant factor of 1
    dict(type='ConstantLR', by_epoch=True, begin=580, end=600, factor=1),
]
# data
# Network input resolution. The codec and the Mosaic / BottomupRandomAffine
# transforms in both training pipelines reuse this one value so that changing
# the resolution cannot leave some of them at a stale size.
input_size = (416, 416)
metafile = 'configs/_base_/datasets/coco.py'
codec = dict(type='YOLOXPoseAnnotationProcessor', input_size=input_size)

# Stage 1: pipeline with Mosaic and randomized affine augmentation. It is the
# pipeline assigned to the combined training dataset.
train_pipeline_stage1 = [
    dict(type='LoadImage', backend_args=None),
    dict(
        type='Mosaic',
        img_scale=input_size,
        pad_val=114.0,
        pre_transform=[dict(type='LoadImage', backend_args=None)],
    ),
    dict(
        type='BottomupRandomAffine',
        input_size=input_size,
        shift_factor=0.1,
        rotate_factor=10,
        scale_factor=(0.75, 1.0),
        pad_val=114,
        distribution='uniform',
        transform_mode='perspective',
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]

# Stage 2: no Mosaic, and the affine shift/rotate/scale probabilities are all
# zero. Passed to YOLOXPoseModeSwitchHook as the replacement pipeline.
train_pipeline_stage2 = [
    dict(type='LoadImage'),
    dict(
        type='BottomupRandomAffine',
        input_size=input_size,
        shift_prob=0,
        rotate_prob=0,
        scale_prob=0,
        scale_type='long',
        pad_val=(114, 114, 114),
        bbox_keep_corner=False,
        clip_border=True,
    ),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip'),
    dict(type='BottomupGetHeatmapMask', get_invalid=True),
    dict(type='FilterAnnotations', by_kpt=True, by_box=True, keep_empty=False),
    dict(type='GenerateTarget', encoder=codec),
    dict(type='PackPoseInputs'),
]
# data settings
# `data_root` is handed to every dataset below as its `data_root`, and is
# string-prefixed onto the evaluator annotation path.
data_root = 'data/'
# `data_mode` is handed to every dataset below as its `data_mode`.
data_mode = 'bottomup'
# mapping
# Keypoint index pairs passed to KeypointConverter for the AIC dataset.
# Presumably (source index, COCO index) -- confirm against KeypointConverter.
# First elements are 0..11 in order; second elements are listed explicitly.
aic_coco = list(
    zip(
        range(12),
        (6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15),
    ))
# Keypoint index pairs for CrowdPose: first index k in 0..11 is paired with
# k + 5, i.e. (0, 5) through (11, 16).
crowdpose_coco = [(kpt, kpt + 5) for kpt in range(12)]
# Keypoint index pairs for MPII. First indices 6..9 have no pair, so the first
# elements are listed explicitly alongside their partners.
mpii_coco = list(
    zip(
        (0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15),
        (16, 14, 12, 11, 13, 15, 10, 8, 6, 5, 7, 9),
    ))
# Keypoint index pairs for JHMDB. First indices run 3..14 in order; indices
# 0..2 have no pair.
jhmdb_coco = list(
    zip(
        range(3, 15),
        (6, 5, 12, 11, 8, 7, 14, 13, 10, 9, 16, 15),
    ))
# Keypoint index pairs for Halpe: identity over indices 0..16.
halpe_coco = [(kpt, kpt) for kpt in range(17)]
# Keypoint index pairs for OCHuman: identity over indices 0..16.
ochuman_coco = [(kpt, kpt) for kpt in range(17)]
# Keypoint index pairs for PoseTrack: identity over 0..16, except that
# indices 1 and 2 have no pair.
posetrack_coco = [(kpt, kpt) for kpt in range(17) if kpt not in (1, 2)]
# train datasets
# COCO train2017 person keypoints. The converter pairs each of the 17
# keypoint indices with itself.
dataset_coco = dict(
    type='CocoDataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='coco/annotations/person_keypoints_train2017.json',
    data_prefix=dict(img='coco/train2017/'),
    pipeline=[
        dict(
            type='KeypointConverter',
            num_keypoints=17,
            mapping=[(kpt, kpt) for kpt in range(17)],
        ),
    ],
)
# AIC training set, converted to 17 keypoints through `aic_coco`.
dataset_aic = dict(
    type='AicDataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='aic/annotations/aic_train.json',
    data_prefix=dict(
        img='pose/ai_challenge/ai_challenger_keypoint'
        '_train_20170902/keypoint_train_images_20170902/'),
    pipeline=[
        dict(
            type='KeypointConverter',
            num_keypoints=17,
            mapping=aic_coco,
        ),
    ],
)
# CrowdPose trainval set, converted to 17 keypoints through `crowdpose_coco`.
dataset_crowdpose = dict(
    type='CrowdPoseDataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='crowdpose/annotations/mmpose_crowdpose_trainval.json',
    data_prefix=dict(img='pose/CrowdPose/images/'),
    pipeline=[
        dict(
            type='KeypointConverter',
            num_keypoints=17,
            mapping=crowdpose_coco,
        ),
    ],
)
# MPII training set, converted to 17 keypoints through `mpii_coco`.
dataset_mpii = dict(
    type='MpiiDataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='mpii/annotations/mpii_train.json',
    data_prefix=dict(img='pose/MPI/images/'),
    pipeline=[
        dict(
            type='KeypointConverter',
            num_keypoints=17,
            mapping=mpii_coco,
        ),
    ],
)
# JHMDB training annotations (Sub1 file), converted to 17 keypoints through
# `jhmdb_coco`.
dataset_jhmdb = dict(
    type='JhmdbDataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='jhmdb/annotations/Sub1_train.json',
    data_prefix=dict(img='pose/JHMDB/'),
    pipeline=[
        dict(
            type='KeypointConverter',
            num_keypoints=17,
            mapping=jhmdb_coco,
        ),
    ],
)
# Halpe training set, converted to 17 keypoints through `halpe_coco`.
dataset_halpe = dict(
    type='HalpeDataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='halpe/annotations/halpe_train_v1.json',
    data_prefix=dict(img='pose/Halpe/hico_20160224_det/images/train2015'),
    pipeline=[
        dict(
            type='KeypointConverter',
            num_keypoints=17,
            mapping=halpe_coco,
        ),
    ],
)
# PoseTrack18 training set, converted to 17 keypoints through
# `posetrack_coco`.
dataset_posetrack = dict(
    type='PoseTrack18Dataset',
    data_root=data_root,
    data_mode=data_mode,
    ann_file='posetrack18/annotations/posetrack18_train.json',
    data_prefix=dict(img='pose/PoseChallenge2018/'),
    pipeline=[
        dict(
            type='KeypointConverter',
            num_keypoints=17,
            mapping=posetrack_coco,
        ),
    ],
)
# Combination of the seven training datasets, run through the stage-1
# pipeline with dataset meta information loaded from `metafile`.
train_dataset = dict(
    type='CombinedDataset',
    metainfo=dict(from_file=metafile),
    datasets=[
        dataset_coco,
        dataset_aic,
        dataset_crowdpose,
        dataset_mpii,
        dataset_jhmdb,
        dataset_halpe,
        dataset_posetrack,
    ],
    # Seven factors for seven datasets; presumably aligned by position with
    # `datasets` -- confirm against CombinedDataset.
    sample_ratio_factor=[1, 0.3, 0.5, 0.3, 0.3, 0.4, 0.3],
    test_mode=False,
    pipeline=train_pipeline_stage1,
)
# Training dataloader: shuffled sampling over `train_dataset`, batch size 32.
train_dataloader = dict(
    batch_size=32,
    num_workers=8,
    persistent_workers=True,
    pin_memory=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=train_dataset,
)
# val datasets
# Validation pipeline: load, resize to `input_size` with a (114, 114, 114)
# pad value, then pack with the listed meta keys.
val_pipeline = [
    dict(type='LoadImage'),
    dict(
        type='BottomupResize',
        input_size=input_size,
        pad_val=(114, 114, 114),
    ),
    dict(
        type='PackPoseInputs',
        meta_keys=(
            'id',
            'img_id',
            'img_path',
            'ori_shape',
            'img_shape',
            'input_size',
            'input_center',
            'input_scale',
        ),
    ),
]
# Validation dataloader over COCO val2017: batch size 1, no shuffling.
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    pin_memory=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
    dataset=dict(
        type='CocoDataset',
        data_root=data_root,
        data_mode=data_mode,
        ann_file='coco/annotations/person_keypoints_val2017.json',
        data_prefix=dict(img='coco/val2017/'),
        test_mode=True,
        pipeline=val_pipeline,
    ),
)
# Testing reuses the very same dataloader config object as validation.
test_dataloader = val_dataloader
# evaluators
# CocoMetric over the val2017 annotations. Unlike the dataset `ann_file`
# entries, this path is prefixed with `data_root` explicitly here.
val_evaluator = dict(
    type='CocoMetric',
    ann_file=data_root + 'coco/annotations/person_keypoints_val2017.json',
    score_mode='bbox',
    nms_mode='none',
)
# Testing reuses the very same evaluator config object as validation.
test_evaluator = val_evaluator
# hooks
custom_hooks = [
    # Receives the COCO-only dataset and the stage-2 pipeline as replacements.
    # num_last_epochs=20 presumably means the final 20 of the 600 epochs --
    # confirm against the hook implementation.
    dict(
        type='YOLOXPoseModeSwitchHook',
        num_last_epochs=20,
        new_train_dataset=dataset_coco,
        new_train_pipeline=train_pipeline_stage2,
        priority=48,
    ),
    # Attribute overrides keyed by epoch number; the single key, 280, matches
    # the epoch where param_scheduler applies its factor-2.5 ConstantLR step.
    dict(
        type='RTMOModeSwitchHook',
        epoch_attributes={
            280: {
                'proxy_target_cc': True,
                'loss_mle.loss_weight': 5.0,
                'loss_oks.loss_weight': 10.0,
            },
        },
        priority=48,
    ),
    dict(type='SyncNormHook', priority=48),
    # EMA hook, registered with a different priority value (49) from the
    # three hooks above (48).
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0002,
        update_buffers=True,
        strict_load=False,
        priority=49,
    ),
]
# model
# Scaling factors shared by the backbone, neck and head configs below.
widen_factor = 0.375
deepen_factor = 0.33

model = dict(
    type='BottomupPoseEstimator',
    # Default initialisation applied to Conv2d layers.
    init_cfg=dict(
        type='Kaiming',
        layer='Conv2d',
        a=2.23606797749979,  # numerically sqrt(5)
        distribution='uniform',
        mode='fan_in',
        nonlinearity='leaky_relu',
    ),
    data_preprocessor=dict(
        type='PoseDataPreprocessor',
        pad_size_divisor=32,
        mean=[0, 0, 0],
        std=[1, 1, 1],
        batch_augments=[
            dict(
                type='BatchSyncRandomResize',
                random_size_range=(320, 640),
                size_divisor=32,
                interval=1,
            ),
        ],
    ),
    backbone=dict(
        type='CSPDarknet',
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        out_indices=(2, 3, 4),
        # NOTE(review): the 'kernal' spelling is the keyword as written in
        # this config; presumably it matches the backbone's parameter name,
        # so do not "correct" it without checking the CSPDarknet signature.
        spp_kernal_sizes=(5, 9, 13),
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='Swish'),
        # Pretrained weights; only keys under the 'backbone.' prefix are
        # requested.
        init_cfg=dict(
            type='Pretrained',
            checkpoint='https://download.openmmlab.com/mmdetection/v2.0/'
            'yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_'
            '20211124_171234-b4047906.pth',
            prefix='backbone.',
        ),
    ),
    neck=dict(
        type='HybridEncoder',
        in_channels=[96, 192, 384],
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        hidden_dim=256,
        output_indices=[1, 2],
        encoder_cfg=dict(
            self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0),
            ffn_cfg=dict(
                embed_dims=256,
                feedforward_channels=1024,
                ffn_drop=0.0,
                act_cfg=dict(type='GELU'),
            ),
        ),
        # Maps the two 256-channel outputs to 192 channels each.
        projector=dict(
            type='ChannelMapper',
            in_channels=[256, 256],
            kernel_size=1,
            out_channels=192,
            act_cfg=None,
            norm_cfg=dict(type='BN'),
            num_outs=2,
        ),
    ),
    head=dict(
        type='RTMOHead',
        num_keypoints=17,
        featmap_strides=(16, 32),
        head_module_cfg=dict(
            num_classes=1,
            in_channels=256,
            cls_feat_channels=256,
            channels_per_group=36,
            pose_vec_channels=192,
            widen_factor=widen_factor,
            stacked_convs=2,
            norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg=dict(type='Swish'),
        ),
        assigner=dict(
            type='SimOTAAssigner',
            dynamic_k_indicator='oks',
            oks_calculator=dict(type='PoseOKS', metainfo=metafile),
            use_keypoints_for_center=True,
        ),
        prior_generator=dict(
            type='MlvlPointGenerator',
            centralize_points=True,
            strides=[16, 32],
        ),
        dcc_cfg=dict(
            in_channels=192,
            feat_channels=128,
            num_bins=(192, 256),
            spe_channels=128,
            gau_cfg=dict(
                s=128,
                expansion_factor=2,
                dropout_rate=0.0,
                drop_path=0.0,
                act_fn='SiLU',
                pos_enc='add',
            ),
        ),
        overlaps_power=0.5,
        # Loss terms. RTMOModeSwitchHook in custom_hooks lists overrides for
        # 'loss_mle.loss_weight' and 'loss_oks.loss_weight' under epoch 280.
        loss_cls=dict(
            type='VariFocalLoss',
            reduction='sum',
            use_target_weight=True,
            loss_weight=1.0,
        ),
        loss_bbox=dict(
            type='IoULoss',
            mode='square',
            eps=1e-16,
            reduction='sum',
            loss_weight=5.0,
        ),
        loss_oks=dict(
            type='OKSLoss',
            reduction='none',
            metainfo=metafile,
            loss_weight=30.0,
        ),
        loss_vis=dict(
            type='BCELoss',
            use_target_weight=True,
            reduction='mean',
            loss_weight=1.0,
        ),
        loss_mle=dict(
            type='MLECCLoss',
            use_target_weight=True,
            loss_weight=1.0,
        ),
        loss_bbox_aux=dict(type='L1Loss', reduction='sum', loss_weight=1.0),
    ),
    test_cfg=dict(
        input_size=input_size,
        score_thr=0.1,
        nms_thr=0.65,
    ),
)