// Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause // pip install paddlepaddle==3.0.0 // pip install paddleocr==3.0.0 // paddlex --install paddle2onnx // paddleocr ocr -i test.png // paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_det --onnx_model_dir PP-OCRv5_mobile_det // paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_rec --onnx_model_dir PP-OCRv5_mobile_rec // pnnx PP-OCRv5_mobile_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256] // pnnx PP-OCRv5_mobile_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256] // pnnx PP-OCRv5_server_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256] fp16=0 // pnnx PP-OCRv5_server_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256] fp16=0 #include "layer.h" #include "net.h" #include #include #include #include #include #include #include "ppocrv5_dict.h" struct Character { int id; float prob; }; struct Object { cv::RotatedRect rrect; int orientation; float prob; std::vector text; }; static double contour_score(const cv::Mat& binary, const std::vector& contour) { cv::Rect rect = cv::boundingRect(contour); if (rect.x < 0) rect.x = 0; if (rect.y < 0) rect.y = 0; if (rect.x + rect.width > binary.cols) rect.width = binary.cols - rect.x; if (rect.y + rect.height > binary.rows) rect.height = binary.rows - rect.y; cv::Mat binROI = binary(rect); cv::Mat mask = cv::Mat::zeros(rect.height, rect.width, CV_8U); std::vector roiContour; for (size_t i = 0; i < contour.size(); i++) { cv::Point pt = cv::Point(contour[i].x - rect.x, contour[i].y - rect.y); roiContour.push_back(pt); } std::vector > roiContours = {roiContour}; cv::fillPoly(mask, roiContours, cv::Scalar(255)); double score = cv::mean(binROI, mask).val[0]; return score / 255.f; } static cv::Mat get_rotate_crop_image(const cv::Mat& bgr, const Object& object) { const int orientation = object.orientation; const float rw = object.rrect.size.width; const float rh = object.rrect.size.height; const int target_height = 48; const float target_width = rh * target_height / rw; // warpperspective shall be used to rotate the image // but actually they are all rectangles, so warpaffine is almost enough :P cv::Mat dst; cv::Point2f corners[4]; object.rrect.points(corners); if (orientation == 0) { // horizontal text // corner points order // 0--------1 // | |rw -> as angle=90 // 3--------2 // rh std::vector src_pts(3); src_pts[0] = corners[0]; src_pts[1] = corners[1]; src_pts[2] = corners[3]; std::vector dst_pts(3); dst_pts[0] = cv::Point2f(0, 0); dst_pts[1] = cv::Point2f(target_width, 0); dst_pts[2] = cv::Point2f(0, target_height); cv::Mat tm = cv::getAffineTransform(src_pts, dst_pts); cv::warpAffine(bgr, dst, tm, cv::Size(target_width, target_height), cv::INTER_LINEAR, cv::BORDER_REPLICATE); } else { // vertial text // corner points order // 1----2 // | | // | | // | |rh -> as angle=0 // | | // | | // 0----3 // rw std::vector src_pts(3); src_pts[0] = corners[2]; src_pts[1] = corners[3]; src_pts[2] = corners[1]; std::vector dst_pts(3); dst_pts[0] = cv::Point2f(0, 0); dst_pts[1] = cv::Point2f(target_width, 0); dst_pts[2] = cv::Point2f(0, target_height); cv::Mat tm = cv::getAffineTransform(src_pts, dst_pts); cv::warpAffine(bgr, dst, tm, cv::Size(target_width, target_height), cv::INTER_LINEAR, cv::BORDER_REPLICATE); } return dst; } class PPOCRv5 { public: void init(); void detect(const cv::Mat& bgr, std::vector& objects); void recognize(const cv::Mat& bgr, Object& object); protected: ncnn::Net ppocrv5_det; ncnn::Net ppocrv5_rec; }; void PPOCRv5::init() { // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models // https://github.com/nihui/ncnn-android-ppocrv5/tree/master/app/src/main/assets ppocrv5_det.opt.use_vulkan_compute = true; // ppocrv5_det.opt.use_bf16_storage = true; // fp16 must be disabled for server model // ppocrv5_det.opt.use_fp16_packed = false; // ppocrv5_det.opt.use_fp16_storage = false; ppocrv5_det.load_param("PP_OCRv5_mobile_det.ncnn.param"); ppocrv5_det.load_model("PP_OCRv5_mobile_det.ncnn.bin"); // ppocrv5_det.load_param("PP_OCRv5_server_det.ncnn.param"); // ppocrv5_det.load_model("PP_OCRv5_server_det.ncnn.bin"); ppocrv5_rec.opt.use_vulkan_compute = true; // ppocrv5_rec.opt.use_bf16_storage = true; // fp16 must be disabled for server model // ppocrv5_rec.opt.use_fp16_packed = false; // ppocrv5_rec.opt.use_fp16_storage = false; ppocrv5_rec.load_param("PP_OCRv5_mobile_rec.ncnn.param"); ppocrv5_rec.load_model("PP_OCRv5_mobile_rec.ncnn.bin"); // ppocrv5_rec.load_param("PP_OCRv5_server_rec.ncnn.param"); // ppocrv5_rec.load_model("PP_OCRv5_server_rec.ncnn.bin"); } void PPOCRv5::detect(const cv::Mat& bgr, std::vector& objects) { const int target_size = 960; int img_w = bgr.cols; int img_h = bgr.rows; const int target_stride = 32; // letterbox pad to multiple of target_stride int w = img_w; int h = img_h; float scale = 1.f; if (std::max(w, h) > target_size) { if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } } ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h); int wpad = (w + target_stride - 1) / target_stride * target_stride - w; int hpad = (h + target_stride - 1) / target_stride * target_stride - h; ncnn::Mat in_pad; ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f); const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f}; const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f}; in_pad.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = ppocrv5_det.create_extractor(); ex.input("in0", in_pad); ncnn::Mat out; ex.extract("out0", out); const float denorm_vals[1] = {255.f}; out.substract_mean_normalize(0, denorm_vals); cv::Mat pred(out.h, out.w, CV_8UC1); out.to_pixels(pred.data, ncnn::Mat::PIXEL_GRAY); // threshold binary cv::Mat bitmap; const float threshold = 0.3f; cv::threshold(pred, bitmap, threshold * 255, 255, cv::THRESH_BINARY); // boxes from bitmap { // should use dbnet post process, but I think unclip process is difficult to write // so simply implement expansion. This may lose detection accuracy // original implementation can be referenced // https://github.com/MhLiao/DB/blob/master/structure/representers/seg_detector_representer.py const float box_thresh = 0.6f; const float enlarge_ratio = 1.95f; const float min_size = 3 * scale; const int max_candidates = 1000; std::vector > contours; std::vector hierarchy; cv::findContours(bitmap, contours, hierarchy, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); contours.resize(std::min(contours.size(), (size_t)max_candidates)); for (size_t i = 0; i < contours.size(); i++) { const std::vector& contour = contours[i]; if (contour.size() <= 2) continue; double score = contour_score(pred, contour); if (score < box_thresh) continue; cv::RotatedRect rrect = cv::minAreaRect(contour); float rrect_maxwh = std::max(rrect.size.width, rrect.size.height); if (rrect_maxwh < min_size) continue; int orientation = 0; if (rrect.angle >= -30 && rrect.angle <= 30 && rrect.size.height > rrect.size.width * 2.7) { // vertical text orientation = 1; } if ((rrect.angle <= -60 || rrect.angle >= 60) && rrect.size.width > rrect.size.height * 2.7) { // vertical text orientation = 1; } if (rrect.angle < -30) { // make orientation from -90 ~ -30 to 90 ~ 150 rrect.angle += 180; } if (orientation == 0 && rrect.angle < 30) { // make it horizontal rrect.angle += 90; std::swap(rrect.size.width, rrect.size.height); } if (orientation == 1 && rrect.angle >= 60) { // make it vertical rrect.angle -= 90; std::swap(rrect.size.width, rrect.size.height); } // enlarge rrect.size.height += rrect.size.width * (enlarge_ratio - 1); rrect.size.width *= enlarge_ratio; // adjust offset to original unpadded rrect.center.x = (rrect.center.x - (wpad / 2)) / scale; rrect.center.y = (rrect.center.y - (hpad / 2)) / scale; rrect.size.width = (rrect.size.width) / scale; rrect.size.height = (rrect.size.height) / scale; Object obj; obj.rrect = rrect; obj.orientation = orientation; obj.prob = score; objects.push_back(obj); } } } void PPOCRv5::recognize(const cv::Mat& bgr, Object& object) { cv::Mat roi = get_rotate_crop_image(bgr, object); ncnn::Mat in = ncnn::Mat::from_pixels(roi.data, ncnn::Mat::PIXEL_BGR, roi.cols, roi.rows); // ~/.paddlex/official_models/PP-OCRv5_mobile_rec/inference.yml const float mean_vals[3] = {127.5, 127.5, 127.5}; const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5}; in.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = ppocrv5_rec.create_extractor(); ex.input("in0", in); ncnn::Mat out; ex.extract("out0", out); // 18385 x len for (int i = 0; i < out.h; i++) { const float* p = out.row(i); int index = 0; float max_score = -9999.f; for (int j = 0; j < out.w; j++) { float score = *p++; if (score > max_score) { max_score = score; index = j; } } if (index <= 0) continue; Character ch; ch.id = index - 1; ch.prob = max_score; object.text.push_back(ch); } } static int detect_ppocrv5(const cv::Mat& bgr, std::vector& objects) { PPOCRv5 ppocrv5; ppocrv5.init(); ppocrv5.detect(bgr, objects); for (size_t i = 0; i < objects.size(); i++) { ppocrv5.recognize(bgr, objects[i]); } return 0; } static int draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const cv::Scalar colors[] = { cv::Scalar(156, 39, 176), cv::Scalar(103, 58, 183), cv::Scalar(63, 81, 181), cv::Scalar(33, 150, 243), cv::Scalar(3, 169, 244), cv::Scalar(0, 188, 212), cv::Scalar(0, 150, 136), cv::Scalar(76, 175, 80), cv::Scalar(139, 195, 74), cv::Scalar(205, 220, 57), cv::Scalar(255, 235, 59), cv::Scalar(255, 193, 7), cv::Scalar(255, 152, 0), cv::Scalar(255, 87, 34), cv::Scalar(121, 85, 72), cv::Scalar(158, 158, 158), cv::Scalar(96, 125, 139) }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 17]; fprintf(stderr, "%s %.5f at %.2f %.2f %.2f x %.2f @ %.2f = ", obj.orientation == 0 ? "H" : "V", obj.prob, obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle); cv::Point2f corners[4]; obj.rrect.points(corners); cv::line(image, corners[0], corners[1], color); cv::line(image, corners[1], corners[2], color); cv::line(image, corners[2], corners[3], color); cv::line(image, corners[3], corners[0], color); std::string text; for (size_t j = 0; j < objects[i].text.size(); j++) { const Character& ch = objects[i].text[j]; if (ch.id >= character_dict_size) continue; text += character_dict[ch.id]; } fprintf(stderr, "%s\n", text.c_str()); } fprintf(stderr, "opencv putText can not draw non-latin characters, you may see question marks instead\n"); fprintf(stderr, "see opencv-mobile for drawing non-latin characters\n"); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; const cv::Scalar& color = colors[i % 17]; std::string text; for (size_t j = 0; j < objects[i].text.size(); j++) { const Character& ch = objects[i].text[j]; if (ch.id >= character_dict_size) continue; if (obj.orientation == 0) { text += character_dict[ch.id]; } else { text += character_dict[ch.id]; if (j + 1 < objects[i].text.size()) text += "\n"; } } int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); int x = obj.rrect.center.x - label_size.width / 2; int y = obj.rrect.center.y - label_size.height / 2 - baseLine; if (y < 0) y = 0; if (y + label_size.height > image.rows) y = image.rows - label_size.height; if (x < 0) x = 0; if (x + label_size.width > image.cols) x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), cv::Scalar(255, 255, 255), -1); if (obj.orientation == 0) { cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } else { cv::putText(image, text, cv::Point(x, y + label_size.width), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); } } cv::imshow("image", image); cv::waitKey(0); return 0; } int main(int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); return -1; } const char* imagepath = argv[1]; cv::Mat m = cv::imread(imagepath, 1); if (m.empty()) { fprintf(stderr, "cv::imread %s failed\n", imagepath); return -1; } std::vector objects; detect_ppocrv5(m, objects); draw_objects(m, objects); return 0; }