diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py
index b983125a..b7003809 100644
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -69,11 +69,23 @@ TASK_OUTPUTS = {
     # face 2d keypoint result for single sample
     # {
     #     "keypoints": [
-    #         [x1, y1]*106
+    #         [[x, y]*106],
+    #         [[x, y]*106],
+    #         [[x, y]*106],
     #     ],
-    #     "poses": [pitch, roll, yaw]
+    #     "poses": [
+    #         [pitch, roll, yaw],
+    #         [pitch, roll, yaw],
+    #         [pitch, roll, yaw],
+    #     ],
+    #     "boxes": [
+    #         [x1, y1, x2, y2],
+    #         [x1, y1, x2, y2],
+    #         [x1, y1, x2, y2],
+    #     ]
     # }
-    Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES],
+    Tasks.face_2d_keypoints:
+    [OutputKeys.KEYPOINTS, OutputKeys.POSES, OutputKeys.BOXES],
 
     # face detection result for single sample
     # {
@@ -699,8 +711,9 @@ TASK_OUTPUTS = {
     #     "text_embedding": np.array with shape [1, D],
     #     "caption": "this is an image caption text."
     # }
-    Tasks.generative_multi_modal_embedding:
-    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION],
+    Tasks.generative_multi_modal_embedding: [
+        OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION
+    ],
 
     # multi-modal similarity result for single sample
     # {
diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
index b48d013e..4de5a4f2 100644
--- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
+++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py
@@ -1,9 +1,19 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import copy
+import math
 from typing import Any
 
+import cv2
+import numpy as np
+
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
 from .base import EasyCVPipeline
+
+logger = get_logger()
@@ -29,18 +39,257 @@ class Face2DKeypointsPipeline(EasyCVPipeline):
             *args, **kwargs)
 
+        # face detection pipeline, used to locate faces before alignment
+        det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps'
+        self.face_detection = pipeline(
+            Tasks.face_detection, model=det_model_id)
+
     def show_result(self, img, points, scale=2, save_path=None):
         return self.predict_op.show_result(img, points, scale, save_path)
 
+    def _choose_face(self, det_result, min_face=10):
+        """Keep only faces whose width and height are both >= min_face.
+
+        Args:
+            det_result: output of the face detection pipeline
+            min_face: minimum width/height of a valid face, in pixels
+        """
+        bboxes = np.array(det_result[OutputKeys.BOXES])
+        landmarks = np.array(det_result[OutputKeys.KEYPOINTS])
+        if bboxes.shape[0] == 0:
+            logger.warning('No face detected!')
+            return None
+        # indices of faces that are large enough
+        face_idx = []
+        for i in range(bboxes.shape[0]):
+            box = bboxes[i]
+            if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face:
+                face_idx += [i]
+        if len(face_idx) == 0:
+            logger.warning(
+                f'Face size not enough, less than {min_face}x{min_face}!')
+            return None
+        bboxes = bboxes[face_idx]
+        landmarks = landmarks[face_idx]
+
+        return bboxes, landmarks
+
+    def expand_box(self, box, w, h, scalex=0.3, scaley=0.5):
+        x1 = box[0]
+        y1 = box[1]
+        wb = box[2] - x1
+        hb = box[3] - y1
+        deltax = int(wb * scalex)
+        deltay1 = int(hb * scaley)
+        deltay2 = int(hb * scalex)
+        x1 = x1 - deltax
+        y1 = y1 - deltay1
+        if x1 < 0:
+            deltax = deltax + x1
+            x1 = 0
+        if y1 < 0:
+            deltay1 = deltay1 + y1
+            y1 = 0
+        x2 = x1 + wb + 2 * deltax
+        y2 = y1 + hb + deltay1 + deltay2
+        x2 = np.clip(x2, 0, w - 1)
+        y2 = np.clip(y2, 0, h - 1)
+        return [x1, y1, x2, y2]
+
+    def rotate_point(self, angle, center, landmark):
+        # same matrix as cv2.getRotationMatrix2D(center, angle, 1.0)
+        rad = angle * np.pi / 180.0
+        alpha = np.cos(rad)
+        beta = np.sin(rad)
+        M = np.zeros((2, 3), dtype=np.float32)
+        M[0, 0] = alpha
+        M[0, 1] = beta
+        M[0, 2] = (1 - alpha) * center[0] - beta * center[1]
+        M[1, 0] = -beta
+        M[1, 1] = alpha
+        M[1, 2] = beta * center[0] + (1 - alpha) * center[1]
+
+        landmark_ = np.asarray([(M[0, 0] * x + M[0, 1] * y + M[0, 2],
+                                 M[1, 0] * x + M[1, 1] * y + M[1, 2])
+                                for (x, y) in landmark])
+        return M, landmark_
+
+    def random_normal(self):
+        """
+        Sample N(0, 1) truncated to [-3 * sigma, 3 * sigma] (3-sigma rule),
+        then scale the result into (-1, +1).
+        """
+        mu, sigma = 0, 1
+        while True:
+            s = np.random.normal(mu, sigma)
+            if s < mu - 3 * sigma or s > mu + 3 * sigma:
+                continue
+            return s / (3 * sigma)
+
+    def rotate_crop_img(self, img, pts, M):
+        image_size = 256
+        enlarge_ratio = 1.1
+
+        imgT = cv2.warpAffine(img, M, (int(img.shape[1]), int(img.shape[0])))
+
+        # pts[5] and pts[6] hold the two corners of the expanded face box
+        x1 = pts[5][0]
+        y1 = pts[5][1]
+        x2 = pts[6][0]
+        y2 = pts[6][1]
+        w = x2 - x1 + 1
+        h = y2 - y1 + 1
+        x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w)
+        y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h)
+
+        new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w)
+        new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h)
+        new_x1 = x1 + int(self.random_normal() * image_size * 0.05)
+        new_y1 = y1 + int(self.random_normal() * image_size * 0.05)
+        new_x2 = new_x1 + new_w
+        new_y2 = new_y1 + new_h
+
+        height, width, _ = imgT.shape
+        dx = max(0, -new_x1)
+        dy = max(0, -new_y1)
+        new_x1 = max(0, new_x1)
+        new_y1 = max(0, new_y1)
+
+        edx = max(0, new_x2 - width)
+        edy = max(0, new_y2 - height)
+        new_x2 = min(width, new_x2)
+        new_y2 = min(height, new_y2)
+
+        sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2]
+        if dx > 0 or dy > 0 or edx > 0 or edy > 0:
+            sub_imgT = cv2.copyMakeBorder(
+                sub_imgT,
+                dy,
+                edy,
+                dx,
+                edx,
+                cv2.BORDER_CONSTANT,
+                value=(103.94, 116.78, 123.68))
+
+        return sub_imgT, imgT, [new_x1, new_y1, new_x2,
+                                new_y2], [dx, dy, edx, edy]
+
+    def crop_img(self, imgT, pts):
+        image_size = 256
+        enlarge_ratio = 1.1
+
+        x1 = np.min(pts[:, 0])
+        x2 = np.max(pts[:, 0])
+        y1 = np.min(pts[:, 1])
+        y2 = np.max(pts[:, 1])
+        w = x2 - x1 + 1
+        h = y2 - y1 + 1
+        x1 = int(x1 - (enlarge_ratio - 1.0) / 2.0 * w)
+        y1 = int(y1 - (enlarge_ratio - 1.0) / 2.0 * h)
+
+        new_w = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * w)
+        new_h = int(enlarge_ratio * (1 + self.random_normal() * 0.1) * h)
+        new_x1 = x1 + int(self.random_normal() * image_size * 0.05)
+        new_y1 = y1 + int(self.random_normal() * image_size * 0.05)
+        new_x2 = new_x1 + new_w
+        new_y2 = new_y1 + new_h
+
+        height, width, _ = imgT.shape
+        dx = max(0, -new_x1)
+        dy = max(0, -new_y1)
+        new_x1 = max(0, new_x1)
+        new_y1 = max(0, new_y1)
+
+        edx = max(0, new_x2 - width)
+        edy = max(0, new_y2 - height)
+        new_x2 = min(width, new_x2)
+        new_y2 = min(height, new_y2)
+
+        sub_imgT = imgT[new_y1:new_y2, new_x1:new_x2]
+        if dx > 0 or dy > 0 or edx > 0 or edy > 0:
+            sub_imgT = cv2.copyMakeBorder(
+                sub_imgT,
+                dy,
+                edy,
+                dx,
+                edx,
+                cv2.BORDER_CONSTANT,
+                value=(103.94, 116.78, 123.68))
+
+        return sub_imgT, [new_x1, new_y1, new_x2, new_y2], [dx, dy, edx, edy]
+
     def __call__(self, inputs) -> Any:
-        outputs = self.predict_op(inputs)
+        image_size = 256
+
+        img = LoadImage.convert_to_ndarray(inputs)
+        h, w, c = img.shape
+        # reverse the channel order for the face detection pipeline
+        img_rgb = copy.deepcopy(img)
+        img_rgb = img_rgb[:, :, ::-1]
+        det_result = self.face_detection(img_rgb)
+        faces = self._choose_face(det_result)
+        if faces is None:
+            # no usable face detected: return empty results
+            return {
+                OutputKeys.KEYPOINTS: [],
+                OutputKeys.POSES: [],
+                OutputKeys.BOXES: []
+            }
+        boxes, keypoints = faces
+
+        output_boxes = []
+        output_keypoints = []
+        output_poses = []
+        for box_ori, keypoint in zip(boxes, keypoints):
+            box = self.expand_box(box_ori, w, h, scalex=0.15, scaley=0.15)
+
+            # five detector landmarks plus the two expanded box corners
+            pts = [[keypoint[0], keypoint[1]], [keypoint[2], keypoint[3]],
+                   [keypoint[4], keypoint[5]], [keypoint[6], keypoint[7]],
+                   [keypoint[8], keypoint[9]], [box[0], box[1]],
+                   [box[2], box[3]]]
+            # roll angle of the line between the two eyes, in radians
+            angle = math.atan2((pts[1][1] - pts[0][1]),
+                               (pts[1][0] - pts[0][0]))
+            # converted to degrees
+            theta = angle * (180 / np.pi)
+
+            center = [image_size // 2, image_size // 2]
+            cx, cy = center
+            M, _ = self.rotate_point(theta, (cx, cy), pts)
+            sub_img, imgT, bbox, delta_border = self.rotate_crop_img(
+                img, pts, M)
+
+            outputs = self.predict_op([sub_img])[0]
+            tmp_keypoints = outputs['point']
+
+            # map the keypoints back into the rotated full image
+            for i in range(len(tmp_keypoints)):
+                tmp_keypoints[i][0] += (delta_border[0] + bbox[0])
+                tmp_keypoints[i][1] += (delta_border[1] + bbox[1])
+
+            # refine the prediction on re-cropped patches three times
+            for _ in range(3):
+                sub_img, bbox, delta_border = self.crop_img(
+                    imgT, tmp_keypoints)
+                outputs = self.predict_op([sub_img])[0]
+                tmp_keypoints = outputs['point']
+                for i in range(len(tmp_keypoints)):
+                    tmp_keypoints[i][0] += (delta_border[0] + bbox[0])
+                    tmp_keypoints[i][1] += (delta_border[1] + bbox[1])
+
+            # rotate the keypoints back into the original image
+            _, tmp_keypoints = self.rotate_point(-theta, (cx, cy),
+                                                 tmp_keypoints)
 
-        results = [{
-            OutputKeys.KEYPOINTS: output['point'],
-            OutputKeys.POSES: output['pose']
-        } for output in outputs]
+            output_keypoints.append(np.array(tmp_keypoints))
+            output_poses.append(np.array(outputs['pose']))
+            output_boxes.append(np.array(box_ori))
 
-        if self._is_single_inputs(inputs):
-            results = results[0]
+        results = {
+            OutputKeys.KEYPOINTS: output_keypoints,
+            OutputKeys.POSES: output_poses,
+            OutputKeys.BOXES: output_boxes
+        }
 
         return results
diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py
index 34dc2348..095c36ec 100644
--- a/modelscope/utils/cv/image_utils.py
+++ b/modelscope/utils/cv/image_utils.py
@@ -91,6 +91,71 @@ def draw_keypoints(output, original_image):
     return image
 
 
+def draw_106face_keypoints(in_path,
+                           keypoints,
+                           boxes,
+                           scale=4.0,
+                           save_path=None):
+    face_contour_point_index = [
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+    ]
+    left_eye_brow_point_index = [33, 34, 35, 36, 37, 38, 39, 40, 41, 33]
+    right_eye_brow_point_index = [42, 43, 44, 45, 46, 47, 48, 49, 50, 42]
+    left_eye_point_index = [66, 67, 68, 69, 70, 71, 72, 73, 66]
+    right_eye_point_index = [75, 76, 77, 78, 79, 80, 81, 82, 75]
+    nose_bridge_point_index = [51, 52, 53, 54]
+    nose_contour_point_index = [55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]
+    mouth_outer_point_index = [
+        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 84
+    ]
+    mouth_inner_point_index = [96, 97, 98, 99, 100, 101, 102, 103, 96]
+
+    img = cv2.imread(in_path)
+
+    for i in range(len(boxes)):
+        draw_box(img, np.array(boxes[i]))
+
+    image = cv2.resize(img, dsize=None, fx=scale, fy=scale)
+
+    def draw_line(point_index, image, point):
+        for i in range(len(point_index) - 1):
+            cur_index = point_index[i]
+            next_index = point_index[i + 1]
+            cur_pt = (int(point[cur_index][0] * scale),
+                      int(point[cur_index][1] * scale))
+            next_pt = (int(point[next_index][0] * scale),
+                       int(point[next_index][1] * scale))
+            cv2.line(image, cur_pt, next_pt, (0, 0, 255), thickness=2)
+
+    for i in range(len(keypoints)):
+        points = keypoints[i]
+
+        draw_line(face_contour_point_index, image, points)
+        draw_line(left_eye_brow_point_index, image, points)
+        draw_line(right_eye_brow_point_index, image, points)
+        draw_line(left_eye_point_index, image, points)
+        draw_line(right_eye_point_index, image, points)
+        draw_line(nose_bridge_point_index, image, points)
+        draw_line(nose_contour_point_index, image, points)
+        draw_line(mouth_outer_point_index, image, points)
+        draw_line(mouth_inner_point_index, image, points)
+
+        size = len(points)
+        for j in range(size):
+            x = int(points[j][0])
+            y = int(points[j][1])
+            cv2.putText(image, str(j), (int(x * scale), int(y * scale)),
+                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
+            cv2.circle(image, (int(x * scale), int(y * scale)), 2,
+                       (0, 255, 0), cv2.FILLED)
+
+    if save_path is not None:
+        cv2.imwrite(save_path, image)
+
+    return image
+
+
 def draw_face_detection_no_lm_result(img_path, detection_result):
     bboxes = np.array(detection_result[OutputKeys.BOXES])
     scores = np.array(detection_result[OutputKeys.SCORES])
diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py
index a5e347e8..7ccc8a59 100644
--- a/tests/pipelines/test_face_2d_keypoints.py
+++ b/tests/pipelines/test_face_2d_keypoints.py
@@ -1,11 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import unittest
 
-import cv2
-
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.cv.image_utils import draw_106face_keypoints
 from modelscope.utils.test_utils import test_level
 
 
@@ -13,7 +12,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase):
 
     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
     def test_face_2d_keypoints(self):
-        img_path = 'data/test/images/keypoints_detect/test_img_face_2d_keypoints.png'
+        img_path = 'data/test/images/face_detection.png'
 
         model_id = 'damo/cv_mobilenet_face-2d-keypoints_alignment'
         face_2d_keypoints_align = pipeline(
@@ -21,15 +20,21 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase):
         output = face_2d_keypoints_align(img_path)
 
         output_keypoints = output[OutputKeys.KEYPOINTS]
-        output_pose = output[OutputKeys.POSES]
-
-        img = cv2.imread(img_path)
-        img = face_2d_keypoints_align.show_result(
-            img, output_keypoints, scale=2, save_path='face_keypoints.jpg')
-
-        self.assertEqual(output_keypoints.shape[0], 106)
-        self.assertEqual(output_keypoints.shape[1], 2)
-        self.assertEqual(output_pose.shape[0], 3)
+        output_poses = output[OutputKeys.POSES]
+        output_boxes = output[OutputKeys.BOXES]
+
+        draw_106face_keypoints(
+            img_path,
+            output_keypoints,
+            output_boxes,
+            scale=2,
+            save_path='face_keypoints.jpg')
+
+        for idx in range(len(output_keypoints)):
+            self.assertEqual(output_keypoints[idx].shape[0], 106)
+            self.assertEqual(output_keypoints[idx].shape[1], 2)
+            self.assertEqual(output_poses[idx].shape[0], 3)
+            self.assertEqual(output_boxes[idx].shape[0], 4)
 
 
 if __name__ == '__main__':
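
Usage note: a minimal sketch of how the pipeline is meant to be consumed after this change, mirroring the test above. The model id, image path, and output keys are the ones used in this diff; the output file name is arbitrary.

    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks
    from modelscope.utils.cv.image_utils import draw_106face_keypoints

    img_path = 'data/test/images/face_detection.png'

    # the alignment pipeline now builds its own SCRFD face detector internally
    face_2d_keypoints = pipeline(
        Tasks.face_2d_keypoints,
        model='damo/cv_mobilenet_face-2d-keypoints_alignment')
    output = face_2d_keypoints(img_path)

    # one entry per detected face
    for kpts, pose, box in zip(output[OutputKeys.KEYPOINTS],
                               output[OutputKeys.POSES],
                               output[OutputKeys.BOXES]):
        # kpts: (106, 2) array, pose: [pitch, roll, yaw], box: [x1, y1, x2, y2]
        print(kpts.shape, pose, box)

    draw_106face_keypoints(
        img_path,
        output[OutputKeys.KEYPOINTS],
        output[OutputKeys.BOXES],
        scale=2,
        save_path='face_keypoints.jpg')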
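Design note on rotate_point: the 2x3 matrix it assembles by hand is the standard OpenCV rotation matrix about a point, so it can be cross-checked against cv2.getRotationMatrix2D. A small self-contained sketch (the angle and center values here are arbitrary):

    import cv2
    import numpy as np

    angle, center = 30.0, (128.0, 128.0)

    # the matrix rotate_point builds by hand
    rad = angle * np.pi / 180.0
    alpha, beta = np.cos(rad), np.sin(rad)
    M_manual = np.array(
        [[alpha, beta, (1 - alpha) * center[0] - beta * center[1]],
         [-beta, alpha, beta * center[0] + (1 - alpha) * center[1]]],
        dtype=np.float32)

    # OpenCV's equivalent
    M_cv = cv2.getRotationMatrix2D(center, angle, 1.0)
    assert np.allclose(M_manual, M_cv, atol=1e-4)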