img2pose: Face Alignment and Detection via 6DoF, Face Pose Estimation (Code Walkthrough)

import argparse
import os
import sys
import time

import numpy as np
from PIL import Image, ImageOps
from torchvision import transforms
from tqdm import tqdm
import cv2

sys.path.append("./")
from img2pose import img2poseModel
from model_loader import load_model


def init_model(pretrained_path, threed_68_points, pose_stddev, pose_mean, depth):
    # depth picks the ResNet backbone (18/50/101); 200 and 1400 are the
    # min/max sizes images are rescaled to inside the detector
    img2pose_model = img2poseModel(
        depth,
        200,
        1400,
        pose_mean=np.load(pose_mean),
        pose_stddev=np.load(pose_stddev),
        threed_68_points=np.load(threed_68_points),
    )
    load_model(
        img2pose_model.fpn_model,
        pretrained_path,
        cpu_mode=str(img2pose_model.device) == "cpu",
        model_only=True,
    )
    # put the model into inference mode
    img2pose_model.evaluate()

    return img2pose_model


def bbox_voting(bboxes, iou_thresh=0.6):
    # bboxes: an N x 5 numpy array of N boxes; each row is [x1, y1, x2, y2, score]
    # iou_thresh: boxes are grouped and merged when their IoU reaches this threshold
    bboxes = np.asarray(bboxes)
    # process boxes in descending score order
    order = bboxes[:, 4].ravel().argsort()[::-1]
    bboxes = bboxes[order, :]
    areas = (bboxes[:, 2] - bboxes[:, 0] + 1) * (bboxes[:, 3] - bboxes[:, 1] + 1)
    voted_bboxes = np.zeros([0, 5])
    while bboxes.shape[0] > 0:
        # IoU between the current top-scoring box and every remaining box
        xx1 = np.maximum(bboxes[0, 0], bboxes[:, 0])
        yy1 = np.maximum(bboxes[0, 1], bboxes[:, 1])
        xx2 = np.minimum(bboxes[0, 2], bboxes[:, 2])
        yy2 = np.minimum(bboxes[0, 3], bboxes[:, 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        overlaps = inter / (areas[0] + areas - inter)
        merge_indexs = np.where(overlaps >= iou_thresh)[0]
        if merge_indexs.shape[0] == 0:
            bboxes = np.delete(bboxes, np.array([0]), 0)
            areas = np.delete(areas, np.array([0]), 0)
            continue
        bboxes_accu = bboxes[merge_indexs, :]
        bboxes = np.delete(bboxes, merge_indexs, 0)
        areas = np.delete(areas, merge_indexs, 0)
        # generate a new box by score voting and box voting
        bbox = np.zeros((1, 5))
        box_weights = (bboxes_accu[:, -1] / max(bboxes_accu[:, -1])) * overlaps[
            merge_indexs
        ]
        bboxes_accu[:, 0:4] = bboxes_accu[:, 0:4] * np.tile(
            box_weights.reshape((-1, 1)), (1, 4)
        )
        bbox[:, 0:4] = np.sum(bboxes_accu[:, 0:4], axis=0) / (np.sum(box_weights))
        bbox[0, 4] = np.sum(bboxes_accu[:, 4] * box_weights)
        voted_bboxes = np.row_stack((voted_bboxes, bbox))

    return voted_bboxes
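
A quick sanity check with made-up coordinates (hypothetical values, not from the repo): two heavily overlapping detections, IoU ≈ 0.92, collapse into one score-weighted box.

demo_boxes = np.array([[10, 10, 110, 110, 0.9],
                       [12, 12, 112, 112, 0.8]])
print(bbox_voting(demo_boxes, iou_thresh=0.6))  # -> a single merged row [x1, y1, x2, y2, s]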


def get_M_homo(R, t):
    # Assemble the 4x4 homogeneous transform [R | t; 0 0 0 1], invert it, and
    # return the inverted pose as a Rodrigues rotation vector and a translation.
    M = np.array([[R[0][0], R[0][1], R[0][2], t[0]],
                  [R[1][0], R[1][1], R[1][2], t[1]],
                  [R[2][0], R[2][1], R[2][2], t[2]],
                  [0,       0,       0,       1]])

    M = np.linalg.inv(M)

    R_1X3, _ = cv2.Rodrigues(M[:3, :3])  # rotation matrix -> rotation vector
    t = M[:3, 3]

    return R_1X3, t
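
Since M is a rigid transform, the general-purpose np.linalg.inv can be avoided: the inverse of [R | t] is [Rᵀ | -Rᵀt]. A minimal equivalent sketch (invert_pose is a hypothetical name, not part of the repo):

def invert_pose(R, t):
    # closed-form inverse of a rigid transform: R^-1 = R^T, t^-1 = -R^T t
    R_inv = np.asarray(R).T
    t_inv = -R_inv @ np.asarray(t, dtype=np.float64).reshape(3)
    rvec, _ = cv2.Rodrigues(R_inv)  # rotation matrix -> Rodrigues vector
    return rvec, t_inv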


def show_xyz(img, point_3d, rotation_vector, translation_vector, ipm, dpv):
    # convert the regressed rotation vector to a 3x3 matrix, then invert the
    # whole pose via get_M_homo before projecting the axes
    R_3X3, _ = cv2.Rodrigues(rotation_vector)
    rotation_vector, translation_vector = get_M_homo(R_3X3, translation_vector)

    # endpoints one unit along each axis from the given 3D origin
    gap = 1
    x_3d = np.array([point_3d[0] + gap, point_3d[1], point_3d[2]], dtype=np.float64)
    y_3d = np.array([point_3d[0], point_3d[1] + gap, point_3d[2]], dtype=np.float64)
    z_3d = np.array([point_3d[0], point_3d[1], point_3d[2] + gap], dtype=np.float64)

    # project the axis endpoints and the origin into the image plane
    (x_3d, _) = cv2.projectPoints(x_3d, rotation_vector, translation_vector, ipm, dpv)
    (y_3d, _) = cv2.projectPoints(y_3d, rotation_vector, translation_vector, ipm, dpv)
    (z_3d, _) = cv2.projectPoints(z_3d, rotation_vector, translation_vector, ipm, dpv)
    (origin_coordinates, _) = cv2.projectPoints(np.array(point_3d, dtype=np.float64), rotation_vector, translation_vector, ipm, dpv)
    x_coordinates = (int(x_3d[0][0][0]), int(x_3d[0][0][1]))
    y_coordinates = (int(y_3d[0][0][0]), int(y_3d[0][0][1]))
    z_coordinates = (int(z_3d[0][0][0]), int(z_3d[0][0][1]))
    origin_coordinates = (int(origin_coordinates[0][0][0]), int(origin_coordinates[0][0][1]))

    # draw the axes: X in red, Y in green, Z in blue (BGR color order)
    cv2.line(img, origin_coordinates, x_coordinates, (0, 0, 255), 2)
    cv2.putText(img, "X", x_coordinates, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
    cv2.line(img, origin_coordinates, y_coordinates, (0, 255, 0), 2)
    cv2.putText(img, "Y", y_coordinates, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
    cv2.line(img, origin_coordinates, z_coordinates, (255, 0, 0), 2)
    cv2.putText(img, "Z", z_coordinates, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)


def main():
    cap = cv2.VideoCapture("./datasets/123.avi")
    model = init_model(pretrained_path, threed_68_points, pose_stddev, pose_mean, depth)

    # build the tensor transform once, outside the frame loop
    transform = transforms.Compose([transforms.ToTensor()])

    while True:
        success, img_bgr = cap.read()
        if not success:
            break

        count1 = cv2.getTickCount()

        # OpenCV decodes frames as BGR; the model expects RGB. The copy also
        # removes the negative strides left by the channel flip, which
        # transforms.ToTensor() cannot consume.
        img = img_bgr[:, :, ::-1]
        run_img = img.copy()

        res = model.predict([transform(run_img)])
        
        bboxes = []
        poses = []
        # collect boxes, confidence scores, and 6DoF poses
        # (rx, ry, rz, tx, ty, tz) for every detected face
        for i in range(len(res[0]["scores"])):
            bbox = res[0]["boxes"].cpu().numpy()[i].astype("int")
            score = res[0]["scores"].cpu().numpy()[i]
            pose = res[0]["dofs"].cpu().numpy()[i]

            poses.append(pose)
            bboxes.append(np.append(bbox, score))

        # skip frames with no detections
        if len(bboxes) == 0:
            continue
        bboxes = np.asarray(bboxes)  # N x 5 rows of [x1, y1, x2, y2, score]
        # optionally merge overlapping detections:
        # bboxes = bbox_voting(bboxes, 0.6)
        for i, (x1, y1, x2, y2, s) in enumerate(bboxes):
            if s > 0.6:  # confidence threshold
                cv2.rectangle(img_bgr, (int(x1), int(y1)), (int(x2), int(y2)), (255, 0, 0))
                # score drawn scaled by 1000 so it fits as an integer label
                cv2.putText(img_bgr, str(int(s * 1000)), (int(x1), int(y1)), cv2.FONT_HERSHEY_PLAIN, 2.0, (0, 0, 255))
                # first three DoF: rotation vector; last three: translation
                show_xyz(img_bgr, (0, 0, 0), np.array(poses[i][:3], dtype=np.float64),
                         np.array(poses[i][3:], dtype=np.float64), ipm, dpv)

        count2 = cv2.getTickCount()
        fps = cv2.getTickFrequency() / (count2 - count1)
        cv2.putText(img_bgr, "%.1f" % fps, (20, 20), cv2.FONT_HERSHEY_PLAIN, 2.0, (0, 0, 255))

        cv2.imshow('video', img_bgr)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


# ipm: camera intrinsic matrix; dpv: camera distortion coefficients
ipm = np.array([[738.77113867, 0., 309.55582235], [0., 738.46009689, 220.83418211], [0, 0, 1]])
dpv = np.array([0.0573193604, -0.37914004, -0.00216807948, 0.000132759817, 0.923696829])
depth = 18   # ResNet backbone depth: 18, 50, or 101
pose_mean = "./models/WIDER_train_pose_mean_v1.npy"
pose_stddev = "./models/WIDER_train_pose_stddev_v1.npy"
threed_68_points = "./pose_references/reference_3d_68_points_trans.npy"
pretrained_path = "models/img2pose_v1.pth"

if __name__ == "__main__":
    main()
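
To run the demo (assuming the pretrained weights and the .npy reference files have been downloaded into models/ and pose_references/, as laid out in the img2pose repo):

python main.py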

Taking the main.py above as the entry point, the main call flow of the img2pose code is outlined below (it helps to read it alongside the source here).
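
A rough sketch of that flow, reconstructed from the script above (internal details of img2poseModel are not shown):

main()
├── init_model()
│   ├── img2poseModel(depth, 200, 1400, pose_mean, pose_stddev, threed_68_points)
│   └── load_model(img2pose_model.fpn_model, pretrained_path)   # img2pose_v1.pth
└── per frame:
    ├── cap.read() -> BGR to RGB -> transforms.ToTensor()
    ├── model.predict([...]) -> res[0]["boxes"], res[0]["scores"], res[0]["dofs"]
    ├── (optional) bbox_voting()
    └── show_xyz() -> get_M_homo() -> cv2.projectPoints() -> cv2.imshow()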


Reprinted from blog.csdn.net/qq_31112205/article/details/126378846