import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.UnixStyleUsageFormatter;
import org.bytedeco.javacpp.FloatPointer;
import org.bytedeco.javacv.CanvasFrame;
import org.bytedeco.javacv.OpenCVFrameConverter;
import org.bytedeco.opencv.global.opencv_dnn;
import org.bytedeco.opencv.opencv_core.*;
import org.bytedeco.opencv.opencv_dnn.TextDetectionModel_DB;
import org.bytedeco.opencv.opencv_videoio.VideoCapture;

import java.util.AbstractMap;
import java.util.Map;

import static org.bytedeco.opencv.global.opencv_imgcodecs.imwrite;
import static org.bytedeco.opencv.global.opencv_imgproc.*;

public class demo {

    // Valid combinations of backends and targets
    static int[][] backendTargetPairs = {
            {opencv_dnn.DNN_BACKEND_OPENCV, opencv_dnn.DNN_TARGET_CPU},
            {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA},
            {opencv_dnn.DNN_BACKEND_CUDA, opencv_dnn.DNN_TARGET_CUDA_FP16},
            {opencv_dnn.DNN_BACKEND_TIMVX, opencv_dnn.DNN_TARGET_NPU},
            {opencv_dnn.DNN_BACKEND_CANN, opencv_dnn.DNN_TARGET_NPU}
    };

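    // Command-line options, parsed with JCommander.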
    static class Args {
        @Parameter(names = {"--help", "-h"}, order = 0, help = true,
                description = "Print help message.")
        boolean help;
        @Parameter(names = {"--model", "-m"}, order = 1,
                description = "Path to the text detection model file.")
        String model = "text_detection_en_ppocrv3_2023may.onnx";
        @Parameter(names = {"--input", "-i"}, order = 2,
                description = "Path to input image or video file. Skip this argument to capture frames from a camera.")
        String input;
        @Parameter(names = "--width", order = 3,
                description = "Resize input image to a certain width. It should be a multiple of 32.")
        int width = 736;
        @Parameter(names = "--height", order = 4,
                description = "Resize input image to a certain height. It should be a multiple of 32.")
        int height = 736;
        @Parameter(names = "--binary_threshold", order = 5,
                description = "Threshold of the binary map.")
        float binaryThreshold = 0.3f;
        @Parameter(names = "--polygon_threshold", order = 6,
                description = "Threshold of polygons.")
        float polygonThreshold = 0.5f;
        @Parameter(names = "--max_candidates", order = 7,
                description = "Set maximum number of polygon candidates.")
        int maxCandidates = 200;
        @Parameter(names = "--unclip_ratio", order = 8,
                description = "The unclip ratio of the detected text region, which determines the output size.")
        double unclipRatio = 2.0;
        @Parameter(names = {"--save", "-s"}, order = 9, arity = 1,
                description = "Specify to save the result image with detected bounding boxes drawn. Invalid in case of camera input.")
        boolean save = true;
        @Parameter(names = {"--viz", "-v"}, order = 10, arity = 1,
                description = "Specify to open a new window to show results. Invalid in case of camera input.")
        boolean viz = true;
        @Parameter(names = {"--backend", "-bt"}, order = 11,
                description = "Choose one of the computation backends:" +
                        " 0: OpenCV implementation + CPU," +
                        " 1: CUDA + GPU (CUDA)," +
                        " 2: CUDA + GPU (CUDA FP16)," +
                        " 3: TIM-VX + NPU," +
                        " 4: CANN + NPU")
        int backend = 0;
    }

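    /**
     * Thin wrapper around OpenCV's TextDetectionModel_DB: configures the DB text
     * detector once (backend/target, thresholds, input preprocessing) and exposes
     * a single infer() call that returns the detected polygons and confidences.
     */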
    static class PPOCRDet {
        private final TextDetectionModel_DB model;
        private final Size inputSize;

        public PPOCRDet(String modelPath, Size inputSize,
                        float binaryThreshold, float polygonThreshold, int maxCandidates, double unclipRatio,
                        int backendId, int targetId) {
            this.inputSize = inputSize;

            model = new TextDetectionModel_DB(modelPath);
            model.setPreferableBackend(backendId);
            model.setPreferableTarget(targetId);

            model.setBinaryThreshold(binaryThreshold);
            model.setPolygonThreshold(polygonThreshold);
            model.setUnclipRatio(unclipRatio);
            model.setMaxCandidates(maxCandidates);

            // Input preprocessing: per-channel mean subtraction and 1/255 scaling,
            // with R and B channels swapped (swapRB = true) and no center crop.
            model.setInputParams(1.0 / 255.0, inputSize,
                    new Scalar(122.67891434, 116.66876762, 104.00698793, 0), true, false);
        }

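        /** Runs DB text detection on an image already resized to the network input size. */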
        public Map.Entry<PointVectorVector, FloatPointer> infer(Mat image) {
            assert image.rows() == inputSize.height() : "height of input image != net input size";
            assert image.cols() == inputSize.width() : "width of input image != net input size";
            final PointVectorVector pt = new PointVectorVector();
            final FloatPointer confidences = new FloatPointer();
            model.detect(image, pt, confidences);
            return new AbstractMap.SimpleEntry<>(pt, confidences);
        }
    }

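    /**
     * Draws the detected text polygons and an FPS counter on a copy of the input
     * image and returns the annotated copy.
     */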
    static Mat visualize(Mat image, Map.Entry<PointVectorVector, FloatPointer> results, double fps, Scalar boxColor,
                         Scalar textColor, boolean isClosed, int thickness) {
        final Mat output = new Mat();
        image.copyTo(output);
        if (fps > 0) {
            putText(output, String.format("FPS: %.2f", fps), new Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, textColor);
        }
        final PointVectorVector pvv = results.getKey();
        final MatVector matVector = new MatVector();
        for (int i = 0; i < pvv.size(); i++) {
            final PointVector pv = pvv.get(i);
            final Point pts = new Point(pv.size());
            for (int j = 0; j < pv.size(); j++) {
                pts.position(j).x(pv.get(j).x()).y(pv.get(j).y());
            }
            matVector.push_back(new Mat(pts.position(0)));
        }
        polylines(output, matVector, isClosed, boxColor, thickness, LINE_AA, 0);
        matVector.close();
        return output;
    }

    /**
     * Execute:
     * mvn compile exec:java -Dexec.mainClass=demo -q -Dexec.args="--help"
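     * Example with an input image (path/to/image.jpg is a placeholder for a real file):
     * mvn compile exec:java -Dexec.mainClass=demo -q -Dexec.args="--input path/to/image.jpg"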
     */
    public static void main(String[] argv) {
        final Args args = new Args();
        final JCommander jc = JCommander.newBuilder()
                .addObject(args)
                .build();
        jc.setUsageFormatter(new UnixStyleUsageFormatter(jc));
        jc.parse(argv);
        if (args.help) {
            jc.usage();
            return;
        }
        final int[] backendTargetPair = backendTargetPairs[args.backend];
        assert args.model != null && !args.model.isEmpty() : "Model name is empty";
        final Size inpSize = new Size(args.width, args.height);

        final PPOCRDet model = new PPOCRDet(args.model, inpSize,
                args.binaryThreshold, args.polygonThreshold, args.maxCandidates, args.unclipRatio,
                backendTargetPair[0], backendTargetPair[1]);

        // Open the input: a file if --input is given, otherwise the default camera.
        final VideoCapture cap = new VideoCapture();
        if (args.input != null) {
            cap.open(args.input);
        } else {
            cap.open(0);
        }
        assert cap.isOpened() : "Cannot open the camera or the input file";
        Mat originalImage = new Mat();

        // Display window (always created for camera input, optional for file input).
        final OpenCVFrameConverter.ToMat converter = new OpenCVFrameConverter.ToMat();
        CanvasFrame mainframe = null;
        if (args.input == null || args.viz) {
            mainframe = new CanvasFrame(args.model + " Demo", CanvasFrame.getDefaultGamma() / 2.2);
            mainframe.setDefaultCloseOperation(javax.swing.JFrame.EXIT_ON_CLOSE);
            mainframe.setVisible(true);
        }

        final Scalar boxColor = new Scalar(0, 255, 0, 0);
        final Scalar textColor = new Scalar(0, 0, 255, 0);
        final TickMeter tm = new TickMeter();
        while (cap.read(originalImage)) {
            // Resize to the network input size; keep the scale factors to map detections back to the original image.
            final int originalW = originalImage.cols();
            final int originalH = originalImage.rows();
            final double scaleHeight = originalH / (double) inpSize.height();
            final double scaleWidth = originalW / (double) inpSize.width();
            final Mat image = new Mat();
            resize(originalImage, image, inpSize);

            // inference
            tm.start();
            Map.Entry<PointVectorVector, FloatPointer> results = model.infer(image);
            tm.stop();
            // Scale the result bounding boxes back to the original resolution
            final PointVectorVector pvv = results.getKey();
            for (int i = 0; i < pvv.size(); i++) {
                final PointVector pts = pvv.get(i);
                for (int j = 0; j < pts.size(); j++) {
                    pts.get(j).x((int) (pts.get(j).x() * scaleWidth));
                    pts.get(j).y((int) (pts.get(j).y() * scaleHeight));
                }
            }

            originalImage = visualize(originalImage, results, tm.getFPS(), boxColor, textColor, true, 2);
            tm.reset();
            if (args.input != null) {
                if (args.save) {
                    System.out.println("Result image saved to result.jpg");
                    imwrite("result.jpg", originalImage);
                }
                if (args.viz) {
                    mainframe.showImage(converter.convert(originalImage));
                }
            } else {
                mainframe.showImage(converter.convert(originalImage));
            }

            // clear
            pvv.close();
            image.close();
        }
        tm.close();
    }

}