# mmpose/tools/dataset_converters/ubody_smplx_to_coco.py
# Copyright (c) OpenMMLab. All rights reserved.
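"""Convert UBody SMPL-X annotations into COCO-style 3D keypoint files.

For every scene folder under ``<data-root>/annotations``, this script runs
the neutral SMPL-X layer on the stored parameters, projects the resulting
joints into the image plane, and writes
``train_3dkeypoint_annotation.json`` and ``val_3dkeypoint_annotation.json``
next to the source annotations.

Usage (paths follow the argparse defaults at the bottom of the file)::

    python tools/dataset_converters/ubody_smplx_to_coco.py --data-root data/UBody --human-model-path data/SMPLX
"""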
import argparse
import json
import os
import os.path as osp
from functools import partial
from typing import Dict

import mmengine
import numpy as np
import smplx
import torch
from pycocotools.coco import COCO

class SMPLX(object):
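    """Neutral SMPL-X layer plus the UBody joint metadata.

    Holds the original 53-joint SMPL-X parameterization and the remapped
    137-joint set (25 body + 40 hand + 72 face) used for supervision,
    together with flip pairs and per-part index ranges.
    """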
    def __init__(self, human_model_path):
        self.human_model_path = human_model_path
        self.layer_args = {
            'create_global_orient': False,
            'create_body_pose': False,
            'create_left_hand_pose': False,
            'create_right_hand_pose': False,
            'create_jaw_pose': False,
            'create_leye_pose': False,
            'create_reye_pose': False,
            'create_betas': False,
            'create_expression': False,
            'create_transl': False,
        }
        self.neutral_model = smplx.create(
            self.human_model_path,
            'smplx',
            gender='NEUTRAL',
            use_pca=False,
            use_face_contour=True,
            **self.layer_args)
        if torch.cuda.is_available():
            self.neutral_model = self.neutral_model.to('cuda:0')
        self.vertex_num = 10475
        self.face = self.neutral_model.faces
        self.shape_param_dim = 10
        self.expr_code_dim = 10
        # 22 (body joints) + 30 (hand joints) + 1 (face jaw joint)
        self.orig_joint_num = 53
        # yapf: disable
        self.orig_joints_name = (
            # 22 body joints
            'Pelvis', 'L_Hip', 'R_Hip', 'Spine_1', 'L_Knee', 'R_Knee',
            'Spine_2', 'L_Ankle', 'R_Ankle', 'Spine_3', 'L_Foot', 'R_Foot',
            'Neck', 'L_Collar', 'R_Collar', 'Head', 'L_Shoulder',
            'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist',
            # left hand joints
            'L_Index_1', 'L_Index_2', 'L_Index_3', 'L_Middle_1', 'L_Middle_2',
            'L_Middle_3', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Ring_1',
            'L_Ring_2', 'L_Ring_3', 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3',
            # right hand joints
            'R_Index_1', 'R_Index_2', 'R_Index_3', 'R_Middle_1', 'R_Middle_2',
            'R_Middle_3', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Ring_1',
            'R_Ring_2', 'R_Ring_3', 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3',
            # 1 face jaw joint
            'Jaw',
        )
        self.orig_flip_pairs = (
            # body joints
            (1, 2), (4, 5), (7, 8), (10, 11), (13, 14), (16, 17), (18, 19),
            (20, 21),
            # hand joints
            (22, 37), (23, 38), (24, 39), (25, 40), (26, 41), (27, 42),
            (28, 43), (29, 44), (30, 45), (31, 46), (32, 47), (33, 48),
            (34, 49), (35, 50), (36, 51),
        )
        # yapf: enable
        self.orig_root_joint_idx = self.orig_joints_name.index('Pelvis')
        self.orig_joint_part = {
            'body':
            range(
                self.orig_joints_name.index('Pelvis'),
                self.orig_joints_name.index('R_Wrist') + 1),
            'lhand':
            range(
                self.orig_joints_name.index('L_Index_1'),
                self.orig_joints_name.index('L_Thumb_3') + 1),
            'rhand':
            range(
                self.orig_joints_name.index('R_Index_1'),
                self.orig_joints_name.index('R_Thumb_3') + 1),
            'face':
            range(
                self.orig_joints_name.index('Jaw'),
                self.orig_joints_name.index('Jaw') + 1)
        }
        # changed SMPLX joint set for the supervision
        # 25 (body joints) + 40 (hand joints) + 72 (face keypoints)
        self.joint_num = 137
        # yapf: disable
        self.joints_name = (
            # 25 body joints
            'Pelvis', 'L_Hip', 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle',
            'R_Ankle', 'Neck', 'L_Shoulder', 'R_Shoulder', 'L_Elbow',
            'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Big_toe', 'L_Small_toe',
            'L_Heel', 'R_Big_toe', 'R_Small_toe', 'R_Heel', 'L_Ear', 'R_Ear',
            'L_Eye', 'R_Eye', 'Nose',
            # left hand joints
            'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', 'L_Thumb_4', 'L_Index_1',
            'L_Index_2', 'L_Index_3', 'L_Index_4', 'L_Middle_1', 'L_Middle_2',
            'L_Middle_3', 'L_Middle_4', 'L_Ring_1', 'L_Ring_2', 'L_Ring_3',
            'L_Ring_4', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Pinky_4',
            # right hand joints
            'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', 'R_Thumb_4', 'R_Index_1',
            'R_Index_2', 'R_Index_3', 'R_Index_4', 'R_Middle_1', 'R_Middle_2',
            'R_Middle_3', 'R_Middle_4', 'R_Ring_1', 'R_Ring_2', 'R_Ring_3',
            'R_Ring_4', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Pinky_4',
            # 72 face keypoints
            *[f'Face_{i}' for i in range(1, 73)],
        )
        self.root_joint_idx = self.joints_name.index('Pelvis')
        self.lwrist_idx = self.joints_name.index('L_Wrist')
        self.rwrist_idx = self.joints_name.index('R_Wrist')
        self.neck_idx = self.joints_name.index('Neck')
        self.flip_pairs = (
            # body joints
            (1, 2), (3, 4), (5, 6), (8, 9), (10, 11), (12, 13), (14, 17),
            (15, 18), (16, 19), (20, 21), (22, 23),
            # hand joints
            (25, 45), (26, 46), (27, 47), (28, 48), (29, 49), (30, 50),
            (31, 51), (32, 52), (33, 53), (34, 54), (35, 55), (36, 56),
            (37, 57), (38, 58), (39, 59), (40, 60), (41, 61), (42, 62),
            (43, 63), (44, 64),
            # face eyebrow
            (67, 68), (69, 78), (70, 77), (71, 76), (72, 75), (73, 74),
            # face below nose
            (83, 87), (84, 86),
            # face eyes
            (88, 97), (89, 96), (90, 95), (91, 94), (92, 99), (93, 98),
            # face mouth
            (100, 106), (101, 105), (102, 104), (107, 111), (108, 110),
            # face lip
            (112, 116), (113, 115), (117, 119),
            # face contours
            (120, 136), (121, 135), (122, 134), (123, 133), (124, 132),
            (125, 131), (126, 130), (127, 129)
        )
        self.joint_idx = (
            0, 1, 2, 4, 5, 7, 8, 12, 16, 17, 18, 19, 20, 21, 60, 61, 62, 63,
            64, 65, 59, 58, 57, 56, 55,  # body joints
            37, 38, 39, 66, 25, 26, 27, 67, 28, 29, 30, 68, 34, 35, 36, 69,
            31, 32, 33, 70,  # left hand joints
            52, 53, 54, 71, 40, 41, 42, 72, 43, 44, 45, 73, 49, 50, 51, 74,
            46, 47, 48, 75,  # right hand joints
            22, 15,  # jaw, head
            57, 56,  # eyeballs
            76, 77, 78, 79, 80, 81, 82, 83, 84, 85,  # eyebrow
            86, 87, 88, 89,  # nose
            90, 91, 92, 93, 94,  # below nose
            95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,  # eyes
            107,  # right mouth
            108, 109, 110, 111, 112,  # upper mouth
            113,  # left mouth
            114, 115, 116, 117, 118,  # lower mouth
            119,  # right lip
            120, 121, 122,  # upper lip
            123,  # left lip
            124, 125, 126,  # lower lip
            127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
            140, 141, 142, 143,  # face contour
        )
        # yapf: enable
        self.joint_part = {
            'body':
            range(
                self.joints_name.index('Pelvis'),
                self.joints_name.index('Nose') + 1),
            'lhand':
            range(
                self.joints_name.index('L_Thumb_1'),
                self.joints_name.index('L_Pinky_4') + 1),
            'rhand':
            range(
                self.joints_name.index('R_Thumb_1'),
                self.joints_name.index('R_Pinky_4') + 1),
            'hand':
            range(
                self.joints_name.index('L_Thumb_1'),
                self.joints_name.index('R_Pinky_4') + 1),
            'face':
            range(
                self.joints_name.index('Face_1'),
                self.joints_name.index('Face_72') + 1)
        }


def read_annotation_file(annotation_file: str) -> Dict:
    """Load the per-scene SMPL-X annotation JSON, keyed by annotation id."""
    with open(annotation_file, 'r') as f:
        annotations = json.load(f)
    return annotations


def cam2pixel(cam_coord, f, c):
    """Project camera-frame points to pixels with intrinsics f=(fx, fy) and
    c=(cx, cy), keeping the depth z as the third coordinate."""
    x = cam_coord[:, 0] / cam_coord[:, 2] * f[0] + c[0]
    y = cam_coord[:, 1] / cam_coord[:, 2] * f[1] + c[1]
    z = cam_coord[:, 2]
    return np.stack((x, y, z), 1)


def process_scene_anno(scene: str, annotation_root: str, splits: np.ndarray,
                       human_model_path: str):
    annos = read_annotation_file(
        osp.join(annotation_root, scene, 'smplx_annotation.json'))
    keypoint_annos = COCO(
        osp.join(annotation_root, scene, 'keypoint_annotation.json'))
    human_model = SMPLX(human_model_path)

    train_annos = []
    val_annos = []
    train_imgs = []
    val_imgs = []
    progress_bar = mmengine.ProgressBar(len(keypoint_annos.anns.keys()))
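    # For every keypoint annotation: run the SMPL-X layer on the stored
    # parameters, project the joints into the image, and route the result
    # to the train or val split depending on the video name.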
    for aid in keypoint_annos.anns.keys():
        ann = keypoint_annos.anns[aid]
        img = keypoint_annos.loadImgs(ann['image_id'])[0]

        if img['file_name'].startswith('/'):
            file_name = img['file_name'][1:]
        else:
            file_name = img['file_name']
        video_name = file_name.split('/')[-2]
        if 'Trim' in video_name:
            video_name = video_name.split('_Trim')[0]
        img_path = os.path.join(
            annotation_root.replace('annotations', 'images'), scene,
            file_name)
        if not os.path.exists(img_path):
            progress_bar.update()
            continue
        if str(aid) not in annos:
            progress_bar.update()
            continue

        smplx_param = annos[str(aid)]
        human_model_param = smplx_param['smplx_param']
        cam_param = smplx_param['cam_param']
        if 'lhand_valid' not in human_model_param:
            human_model_param['lhand_valid'] = ann['lefthand_valid']
            human_model_param['rhand_valid'] = ann['righthand_valid']
            human_model_param['face_valid'] = ann['face_valid']
        rotation_valid = np.ones((human_model.orig_joint_num),
                                 dtype=np.float32)
        coord_valid = np.ones((human_model.joint_num), dtype=np.float32)
        root_pose = human_model_param['root_pose']
        body_pose = human_model_param['body_pose']
        shape = human_model_param['shape']
        trans = human_model_param['trans']
        if 'lhand_pose' in human_model_param and human_model_param.get(
                'lhand_valid', False):
            lhand_pose = human_model_param['lhand_pose']
        else:
            lhand_pose = np.zeros(
                (3 * len(human_model.orig_joint_part['lhand'])),
                dtype=np.float32)
            rotation_valid[human_model.orig_joint_part['lhand']] = 0
            coord_valid[human_model.orig_joint_part['lhand']] = 0
        if 'rhand_pose' in human_model_param and human_model_param.get(
                'rhand_valid', False):
            rhand_pose = human_model_param['rhand_pose']
        else:
            rhand_pose = np.zeros(
                (3 * len(human_model.orig_joint_part['rhand'])),
                dtype=np.float32)
            rotation_valid[human_model.orig_joint_part['rhand']] = 0
            coord_valid[human_model.orig_joint_part['rhand']] = 0
        if 'jaw_pose' in human_model_param and \
                'expr' in human_model_param and \
                human_model_param.get('face_valid', False):
            jaw_pose = human_model_param['jaw_pose']
            expr = human_model_param['expr']
        else:
            jaw_pose = np.zeros((3), dtype=np.float32)
            expr = np.zeros((human_model.expr_code_dim), dtype=np.float32)
            rotation_valid[human_model.orig_joint_part['face']] = 0
            coord_valid[human_model.orig_joint_part['face']] = 0
        # init human model inputs
        device = torch.device(
            'cuda:0') if torch.cuda.is_available() else torch.device('cpu')
        root_pose = torch.FloatTensor(root_pose).to(device).view(1, 3)
        body_pose = torch.FloatTensor(body_pose).to(device).view(-1, 3)
        lhand_pose = torch.FloatTensor(lhand_pose).to(device).view(-1, 3)
        rhand_pose = torch.FloatTensor(rhand_pose).to(device).view(-1, 3)
        jaw_pose = torch.FloatTensor(jaw_pose).to(device).view(-1, 3)
        shape = torch.FloatTensor(shape).to(device).view(1, -1)
        expr = torch.FloatTensor(expr).to(device).view(1, -1)
        trans = torch.FloatTensor(trans).to(device).view(1, -1)
        zero_pose = torch.zeros((1, 3), dtype=torch.float32, device=device)

        with torch.no_grad():
            output = human_model.neutral_model(
                betas=shape,
                body_pose=body_pose.view(1, -1),
                global_orient=root_pose,
                transl=trans,
                left_hand_pose=lhand_pose.view(1, -1),
                right_hand_pose=rhand_pose.view(1, -1),
                jaw_pose=jaw_pose.view(1, -1),
                leye_pose=zero_pose,
                reye_pose=zero_pose,
                expression=expr)
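        # output.joints holds every joint/landmark the SMPL-X layer produces
        # (144 when use_face_contour=True); joint_idx selects the 137
        # supervised keypoints defined above.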
        joint_cam = output.joints[0].cpu().numpy()[human_model.joint_idx, :]
        joint_img = cam2pixel(joint_cam, cam_param['focal'],
                              cam_param['princpt'])

        joint_cam = (joint_cam -
                     joint_cam[human_model.root_joint_idx, None, :]
                     )  # root-relative
        joint_cam[human_model.joint_part['lhand'], :] = (
            joint_cam[human_model.joint_part['lhand'], :] -
            joint_cam[human_model.lwrist_idx, None, :]
        )  # left hand root-relative
        joint_cam[human_model.joint_part['rhand'], :] = (
            joint_cam[human_model.joint_part['rhand'], :] -
            joint_cam[human_model.rwrist_idx, None, :]
        )  # right hand root-relative
        joint_cam[human_model.joint_part['face'], :] = (
            joint_cam[human_model.joint_part['face'], :] -
            joint_cam[human_model.neck_idx, None, :])  # face root-relative
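        # Normalize each part's root-relative depth from roughly [-1, 1]
        # into [0, output_hm_shape[0]) heatmap bins.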
        body_3d_size = 2
        output_hm_shape = (16, 16, 12)
        for part in ('body', 'lhand', 'rhand', 'face'):
            part_idx = human_model.joint_part[part]
            joint_img[part_idx, 2] = (
                (joint_cam[part_idx, 2].copy() / (body_3d_size / 2) + 1) /
                2.0 * output_hm_shape[0])
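        # keypoints: 2D pixel coordinates; keypoints_3d: pixel x/y plus the
        # discretized depth; keypoints_valid: per-joint validity mask.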
        keypoints_2d = joint_img[:, :2].copy()
        keypoints_3d = joint_img.copy()
        keypoints_valid = coord_valid.reshape((-1, 1))
        ann['keypoints'] = keypoints_2d.tolist()
        ann['keypoints_3d'] = keypoints_3d.tolist()
        ann['keypoints_valid'] = keypoints_valid.tolist()
        ann['camera_param'] = cam_param
        img['file_name'] = os.path.join(scene, file_name)

        if video_name in splits:
            val_annos.append(ann)
            val_imgs.append(img)
        else:
            train_annos.append(ann)
            train_imgs.append(img)
        progress_bar.update()
    categories = [{
        'supercategory': 'person',
        'id': 1,
        'name': 'person',
        'keypoints': human_model.joints_name,
        'skeleton': human_model.flip_pairs
    }]
    train_data = {
        'images': train_imgs,
        'annotations': train_annos,
        'categories': categories
    }
    val_data = {
        'images': val_imgs,
        'annotations': val_annos,
        'categories': categories
    }
    mmengine.dump(
        train_data,
        osp.join(annotation_root, scene, 'train_3dkeypoint_annotation.json'))
    mmengine.dump(
        val_data,
        osp.join(annotation_root, scene, 'val_3dkeypoint_annotation.json'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-root', type=str, default='data/UBody')
    parser.add_argument('--human-model-path', type=str, default='data/SMPLX')
    parser.add_argument(
        '--nproc', default=8, type=int, help='number of processes')
    args = parser.parse_args()

    split_path = f'{args.data_root}/splits/intra_scene_test_list.npy'
    annotation_path = f'{args.data_root}/annotations'
    folders = os.listdir(annotation_path)
    folders = [f for f in folders if osp.isdir(osp.join(annotation_path, f))]
    human_model_path = args.human_model_path
    splits = np.load(split_path)
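    # Videos named in the intra-scene test split go to the val set; all
    # other videos go to the train set (see process_scene_anno above).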
    if args.nproc > 1:
        mmengine.track_parallel_progress(
            partial(
                process_scene_anno,
                annotation_root=annotation_path,
                splits=splits,
                human_model_path=human_model_path), folders, args.nproc)
    else:
        mmengine.track_progress(
            partial(
                process_scene_anno,
                annotation_root=annotation_path,
                splits=splits,
                human_model_path=human_model_path), folders)