You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ppocrv5.cpp 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. // pip install paddlepaddle==3.0.0
  15. // pip install paddleocr==3.0.0
  16. // paddlex --install paddle2onnx
  17. // paddleocr ocr -i test.png
  18. // paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_det --onnx_model_dir PP-OCRv5_mobile_det
  19. // paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_rec --onnx_model_dir PP-OCRv5_mobile_rec
  20. // pnnx PP-OCRv5_mobile_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256]
  21. // pnnx PP-OCRv5_mobile_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256]
  22. // pnnx PP-OCRv5_server_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256] fp16=0
  23. // pnnx PP-OCRv5_server_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256] fp16=0
  24. #include "layer.h"
  25. #include "net.h"
  26. #include <opencv2/core/core.hpp>
  27. #include <opencv2/highgui/highgui.hpp>
  28. #include <opencv2/imgproc/imgproc.hpp>
  29. #include <float.h>
  30. #include <stdio.h>
  31. #include <vector>
  32. #include "ppocrv5_dict.h"
  33. struct Character
  34. {
  35. int id;
  36. float prob;
  37. };
  38. struct Object
  39. {
  40. cv::RotatedRect rrect;
  41. int orientation;
  42. float prob;
  43. std::vector<Character> text;
  44. };
  45. static double contour_score(const cv::Mat& binary, const std::vector<cv::Point>& contour)
  46. {
  47. cv::Rect rect = cv::boundingRect(contour);
  48. if (rect.x < 0)
  49. rect.x = 0;
  50. if (rect.y < 0)
  51. rect.y = 0;
  52. if (rect.x + rect.width > binary.cols)
  53. rect.width = binary.cols - rect.x;
  54. if (rect.y + rect.height > binary.rows)
  55. rect.height = binary.rows - rect.y;
  56. cv::Mat binROI = binary(rect);
  57. cv::Mat mask = cv::Mat::zeros(rect.height, rect.width, CV_8U);
  58. std::vector<cv::Point> roiContour;
  59. for (size_t i = 0; i < contour.size(); i++)
  60. {
  61. cv::Point pt = cv::Point(contour[i].x - rect.x, contour[i].y - rect.y);
  62. roiContour.push_back(pt);
  63. }
  64. std::vector<std::vector<cv::Point> > roiContours = {roiContour};
  65. cv::fillPoly(mask, roiContours, cv::Scalar(255));
  66. double score = cv::mean(binROI, mask).val[0];
  67. return score / 255.f;
  68. }
  69. static cv::Mat get_rotate_crop_image(const cv::Mat& bgr, const Object& object)
  70. {
  71. const int orientation = object.orientation;
  72. const float rw = object.rrect.size.width;
  73. const float rh = object.rrect.size.height;
  74. const int target_height = 48;
  75. const float target_width = rh * target_height / rw;
  76. // warpperspective shall be used to rotate the image
  77. // but actually they are all rectangles, so warpaffine is almost enough :P
  78. cv::Mat dst;
  79. cv::Point2f corners[4];
  80. object.rrect.points(corners);
  81. if (orientation == 0)
  82. {
  83. // horizontal text
  84. // corner points order
  85. // 0--------1
  86. // | |rw -> as angle=90
  87. // 3--------2
  88. // rh
  89. std::vector<cv::Point2f> src_pts(3);
  90. src_pts[0] = corners[0];
  91. src_pts[1] = corners[1];
  92. src_pts[2] = corners[3];
  93. std::vector<cv::Point2f> dst_pts(3);
  94. dst_pts[0] = cv::Point2f(0, 0);
  95. dst_pts[1] = cv::Point2f(target_width, 0);
  96. dst_pts[2] = cv::Point2f(0, target_height);
  97. cv::Mat tm = cv::getAffineTransform(src_pts, dst_pts);
  98. cv::warpAffine(bgr, dst, tm, cv::Size(target_width, target_height), cv::INTER_LINEAR, cv::BORDER_REPLICATE);
  99. }
  100. else
  101. {
  102. // vertial text
  103. // corner points order
  104. // 1----2
  105. // | |
  106. // | |
  107. // | |rh -> as angle=0
  108. // | |
  109. // | |
  110. // 0----3
  111. // rw
  112. std::vector<cv::Point2f> src_pts(3);
  113. src_pts[0] = corners[2];
  114. src_pts[1] = corners[3];
  115. src_pts[2] = corners[1];
  116. std::vector<cv::Point2f> dst_pts(3);
  117. dst_pts[0] = cv::Point2f(0, 0);
  118. dst_pts[1] = cv::Point2f(target_width, 0);
  119. dst_pts[2] = cv::Point2f(0, target_height);
  120. cv::Mat tm = cv::getAffineTransform(src_pts, dst_pts);
  121. cv::warpAffine(bgr, dst, tm, cv::Size(target_width, target_height), cv::INTER_LINEAR, cv::BORDER_REPLICATE);
  122. }
  123. return dst;
  124. }
  125. class PPOCRv5
  126. {
  127. public:
  128. void init();
  129. void detect(const cv::Mat& bgr, std::vector<Object>& objects);
  130. void recognize(const cv::Mat& bgr, Object& object);
  131. protected:
  132. ncnn::Net ppocrv5_det;
  133. ncnn::Net ppocrv5_rec;
  134. };
  135. void PPOCRv5::init()
  136. {
  137. // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
  138. // https://github.com/nihui/ncnn-android-ppocrv5/tree/master/app/src/main/assets
  139. ppocrv5_det.opt.use_vulkan_compute = true;
  140. // ppocrv5_det.opt.use_bf16_storage = true;
  141. // fp16 must be disabled for server model
  142. // ppocrv5_det.opt.use_fp16_packed = false;
  143. // ppocrv5_det.opt.use_fp16_storage = false;
  144. ppocrv5_det.load_param("PP_OCRv5_mobile_det.ncnn.param");
  145. ppocrv5_det.load_model("PP_OCRv5_mobile_det.ncnn.bin");
  146. // ppocrv5_det.load_param("PP_OCRv5_server_det.ncnn.param");
  147. // ppocrv5_det.load_model("PP_OCRv5_server_det.ncnn.bin");
  148. ppocrv5_rec.opt.use_vulkan_compute = true;
  149. // ppocrv5_rec.opt.use_bf16_storage = true;
  150. // fp16 must be disabled for server model
  151. // ppocrv5_rec.opt.use_fp16_packed = false;
  152. // ppocrv5_rec.opt.use_fp16_storage = false;
  153. ppocrv5_rec.load_param("PP_OCRv5_mobile_rec.ncnn.param");
  154. ppocrv5_rec.load_model("PP_OCRv5_mobile_rec.ncnn.bin");
  155. // ppocrv5_rec.load_param("PP_OCRv5_server_rec.ncnn.param");
  156. // ppocrv5_rec.load_model("PP_OCRv5_server_rec.ncnn.bin");
  157. }
  158. void PPOCRv5::detect(const cv::Mat& bgr, std::vector<Object>& objects)
  159. {
  160. const int target_size = 960;
  161. int img_w = bgr.cols;
  162. int img_h = bgr.rows;
  163. const int target_stride = 32;
  164. // letterbox pad to multiple of target_stride
  165. int w = img_w;
  166. int h = img_h;
  167. float scale = 1.f;
  168. if (std::max(w, h) > target_size)
  169. {
  170. if (w > h)
  171. {
  172. scale = (float)target_size / w;
  173. w = target_size;
  174. h = h * scale;
  175. }
  176. else
  177. {
  178. scale = (float)target_size / h;
  179. h = target_size;
  180. w = w * scale;
  181. }
  182. }
  183. ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h);
  184. int wpad = (w + target_stride - 1) / target_stride * target_stride - w;
  185. int hpad = (h + target_stride - 1) / target_stride * target_stride - h;
  186. ncnn::Mat in_pad;
  187. ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
  188. const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f};
  189. const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f};
  190. in_pad.substract_mean_normalize(mean_vals, norm_vals);
  191. ncnn::Extractor ex = ppocrv5_det.create_extractor();
  192. ex.input("in0", in_pad);
  193. ncnn::Mat out;
  194. ex.extract("out0", out);
  195. const float denorm_vals[1] = {255.f};
  196. out.substract_mean_normalize(0, denorm_vals);
  197. cv::Mat pred(out.h, out.w, CV_8UC1);
  198. out.to_pixels(pred.data, ncnn::Mat::PIXEL_GRAY);
  199. // threshold binary
  200. cv::Mat bitmap;
  201. const float threshold = 0.3f;
  202. cv::threshold(pred, bitmap, threshold * 255, 255, cv::THRESH_BINARY);
  203. // boxes from bitmap
  204. {
  205. // should use dbnet post process, but I think unclip process is difficult to write
  206. // so simply implement expansion. This may lose detection accuracy
  207. // original implementation can be referenced
  208. // https://github.com/MhLiao/DB/blob/master/structure/representers/seg_detector_representer.py
  209. const float box_thresh = 0.6f;
  210. const float enlarge_ratio = 1.95f;
  211. const float min_size = 3 * scale;
  212. const int max_candidates = 1000;
  213. std::vector<std::vector<cv::Point> > contours;
  214. std::vector<cv::Vec4i> hierarchy;
  215. cv::findContours(bitmap, contours, hierarchy, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
  216. contours.resize(std::min(contours.size(), (size_t)max_candidates));
  217. for (size_t i = 0; i < contours.size(); i++)
  218. {
  219. const std::vector<cv::Point>& contour = contours[i];
  220. if (contour.size() <= 2)
  221. continue;
  222. double score = contour_score(pred, contour);
  223. if (score < box_thresh)
  224. continue;
  225. cv::RotatedRect rrect = cv::minAreaRect(contour);
  226. float rrect_maxwh = std::max(rrect.size.width, rrect.size.height);
  227. if (rrect_maxwh < min_size)
  228. continue;
  229. int orientation = 0;
  230. if (rrect.angle >= -30 && rrect.angle <= 30 && rrect.size.height > rrect.size.width * 2.7)
  231. {
  232. // vertical text
  233. orientation = 1;
  234. }
  235. if ((rrect.angle <= -60 || rrect.angle >= 60) && rrect.size.width > rrect.size.height * 2.7)
  236. {
  237. // vertical text
  238. orientation = 1;
  239. }
  240. if (rrect.angle < -30)
  241. {
  242. // make orientation from -90 ~ -30 to 90 ~ 150
  243. rrect.angle += 180;
  244. }
  245. if (orientation == 0 && rrect.angle < 30)
  246. {
  247. // make it horizontal
  248. rrect.angle += 90;
  249. std::swap(rrect.size.width, rrect.size.height);
  250. }
  251. if (orientation == 1 && rrect.angle >= 60)
  252. {
  253. // make it vertical
  254. rrect.angle -= 90;
  255. std::swap(rrect.size.width, rrect.size.height);
  256. }
  257. // enlarge
  258. rrect.size.height += rrect.size.width * (enlarge_ratio - 1);
  259. rrect.size.width *= enlarge_ratio;
  260. // adjust offset to original unpadded
  261. rrect.center.x = (rrect.center.x - (wpad / 2)) / scale;
  262. rrect.center.y = (rrect.center.y - (hpad / 2)) / scale;
  263. rrect.size.width = (rrect.size.width) / scale;
  264. rrect.size.height = (rrect.size.height) / scale;
  265. Object obj;
  266. obj.rrect = rrect;
  267. obj.orientation = orientation;
  268. obj.prob = score;
  269. objects.push_back(obj);
  270. }
  271. }
  272. }
  273. void PPOCRv5::recognize(const cv::Mat& bgr, Object& object)
  274. {
  275. cv::Mat roi = get_rotate_crop_image(bgr, object);
  276. ncnn::Mat in = ncnn::Mat::from_pixels(roi.data, ncnn::Mat::PIXEL_BGR, roi.cols, roi.rows);
  277. // ~/.paddlex/official_models/PP-OCRv5_mobile_rec/inference.yml
  278. const float mean_vals[3] = {127.5, 127.5, 127.5};
  279. const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
  280. in.substract_mean_normalize(mean_vals, norm_vals);
  281. ncnn::Extractor ex = ppocrv5_rec.create_extractor();
  282. ex.input("in0", in);
  283. ncnn::Mat out;
  284. ex.extract("out0", out);
  285. // 18385 x len
  286. for (int i = 0; i < out.h; i++)
  287. {
  288. const float* p = out.row(i);
  289. int index = 0;
  290. float max_score = -9999.f;
  291. for (int j = 0; j < out.w; j++)
  292. {
  293. float score = *p++;
  294. if (score > max_score)
  295. {
  296. max_score = score;
  297. index = j;
  298. }
  299. }
  300. if (index <= 0)
  301. continue;
  302. Character ch;
  303. ch.id = index - 1;
  304. ch.prob = max_score;
  305. object.text.push_back(ch);
  306. }
  307. }
  308. static int detect_ppocrv5(const cv::Mat& bgr, std::vector<Object>& objects)
  309. {
  310. PPOCRv5 ppocrv5;
  311. ppocrv5.init();
  312. ppocrv5.detect(bgr, objects);
  313. for (size_t i = 0; i < objects.size(); i++)
  314. {
  315. ppocrv5.recognize(bgr, objects[i]);
  316. }
  317. return 0;
  318. }
  319. static int draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
  320. {
  321. static const cv::Scalar colors[] = {
  322. cv::Scalar(156, 39, 176),
  323. cv::Scalar(103, 58, 183),
  324. cv::Scalar(63, 81, 181),
  325. cv::Scalar(33, 150, 243),
  326. cv::Scalar(3, 169, 244),
  327. cv::Scalar(0, 188, 212),
  328. cv::Scalar(0, 150, 136),
  329. cv::Scalar(76, 175, 80),
  330. cv::Scalar(139, 195, 74),
  331. cv::Scalar(205, 220, 57),
  332. cv::Scalar(255, 235, 59),
  333. cv::Scalar(255, 193, 7),
  334. cv::Scalar(255, 152, 0),
  335. cv::Scalar(255, 87, 34),
  336. cv::Scalar(121, 85, 72),
  337. cv::Scalar(158, 158, 158),
  338. cv::Scalar(96, 125, 139)
  339. };
  340. cv::Mat image = bgr.clone();
  341. for (size_t i = 0; i < objects.size(); i++)
  342. {
  343. const Object& obj = objects[i];
  344. const cv::Scalar& color = colors[i % 17];
  345. fprintf(stderr, "%s %.5f at %.2f %.2f %.2f x %.2f @ %.2f = ", obj.orientation == 0 ? "H" : "V", obj.prob,
  346. obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle);
  347. cv::Point2f corners[4];
  348. obj.rrect.points(corners);
  349. cv::line(image, corners[0], corners[1], color);
  350. cv::line(image, corners[1], corners[2], color);
  351. cv::line(image, corners[2], corners[3], color);
  352. cv::line(image, corners[3], corners[0], color);
  353. std::string text;
  354. for (size_t j = 0; j < objects[i].text.size(); j++)
  355. {
  356. const Character& ch = objects[i].text[j];
  357. if (ch.id >= character_dict_size)
  358. continue;
  359. text += character_dict[ch.id];
  360. }
  361. fprintf(stderr, "%s\n", text.c_str());
  362. }
  363. fprintf(stderr, "opencv putText can not draw non-latin characters, you may see question marks instead\n");
  364. fprintf(stderr, "see opencv-mobile for drawing non-latin characters\n");
  365. for (size_t i = 0; i < objects.size(); i++)
  366. {
  367. const Object& obj = objects[i];
  368. const cv::Scalar& color = colors[i % 17];
  369. std::string text;
  370. for (size_t j = 0; j < objects[i].text.size(); j++)
  371. {
  372. const Character& ch = objects[i].text[j];
  373. if (ch.id >= character_dict_size)
  374. continue;
  375. if (obj.orientation == 0)
  376. {
  377. text += character_dict[ch.id];
  378. }
  379. else
  380. {
  381. text += character_dict[ch.id];
  382. if (j + 1 < objects[i].text.size())
  383. text += "\n";
  384. }
  385. }
  386. int baseLine = 0;
  387. cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
  388. int x = obj.rrect.center.x - label_size.width / 2;
  389. int y = obj.rrect.center.y - label_size.height / 2 - baseLine;
  390. if (y < 0)
  391. y = 0;
  392. if (y + label_size.height > image.rows)
  393. y = image.rows - label_size.height;
  394. if (x < 0)
  395. x = 0;
  396. if (x + label_size.width > image.cols)
  397. x = image.cols - label_size.width;
  398. cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
  399. cv::Scalar(255, 255, 255), -1);
  400. if (obj.orientation == 0)
  401. {
  402. cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  403. }
  404. else
  405. {
  406. cv::putText(image, text, cv::Point(x, y + label_size.width), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
  407. }
  408. }
  409. cv::imshow("image", image);
  410. cv::waitKey(0);
  411. return 0;
  412. }
  413. int main(int argc, char** argv)
  414. {
  415. if (argc != 2)
  416. {
  417. fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
  418. return -1;
  419. }
  420. const char* imagepath = argv[1];
  421. cv::Mat m = cv::imread(imagepath, 1);
  422. if (m.empty())
  423. {
  424. fprintf(stderr, "cv::imread %s failed\n", imagepath);
  425. return -1;
  426. }
  427. std::vector<Object> objects;
  428. detect_ppocrv5(m, objects);
  429. draw_objects(m, objects);
  430. return 0;
  431. }