# Imports
import shutil
import sys
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Markdown, display
from PIL import Image
from openvino.runtime import Core
from yaspin import yaspin

sys.path.append("../utils")
from notebook_utils import load_image

# Settings
ie = Core()

model_dir = Path("model")
precision = "FP16"
detection_model = "horizontal-text-detection-0001"
recognition_model = "text-recognition-resnet-fc"
base_model_dir = Path("~/open_model_zoo_models").expanduser()
omz_cache_dir = Path("~/open_model_zoo_cache").expanduser()

model_dir.mkdir(exist_ok=True)

# Download Models
download_command = f"omz_downloader --name {detection_model},{recognition_model} --output_dir {base_model_dir} --cache_dir {omz_cache_dir} --precision {precision}"
display(Markdown(f"Download command: `{download_command}`"))
with yaspin(text=f"Downloading {detection_model}, {recognition_model}") as sp:
    download_result = !$download_command
    print(download_result)
    sp.text = f"Finished downloading {detection_model}, {recognition_model}"
    sp.ok("✔")

# Convert Models
convert_command = f"omz_converter --name {recognition_model} --precisions {precision} --download_dir {base_model_dir} --output_dir {base_model_dir}"
display(Markdown(f"Convert command: `{convert_command}`"))
display(Markdown(f"Converting {recognition_model}..."))
! $convert_command

# Copy Models
models_info_output = %sx omz_info_dumper --name $detection_model,$recognition_model
print(f"omz_info_dumper --name {detection_model},{recognition_model}")
detection_model_info, recognition_model_info = [
    {
        "name": "horizontal-text-detection-0001",
        "composite_model_name": None,
        "description": "Horizontal text detector based on FCOS with light MobileNetV2 backbone",
        "framework": "dldt",
        "license_url": "https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/LICENSE",
        "precisions": ["FP16", "FP16-INT8", "FP32"],
        "quantization_output_precisions": [],
        "subdirectory": "intel/horizontal-text-detection-0001",
        "task_type": "detection",
    },
    {
        "name": "text-recognition-resnet-fc",
        "composite_model_name": None,
        "description": '"text-recognition-resnet-fc" is a simple and preformant scene text recognition model based on ResNet with Fully Connected text recognition head. Source implementation on a PyTorch* framework could be found here <https://github.com/Media-Smart/vedastr>.\nModel is able to recognize alphanumeric text.',
        "framework": "pytorch",
        "license_url": "https://raw.githubusercontent.com/Media-Smart/vedastr/0fd2a0bd7819ae4daa2a161501e9f1c2ac67e96a/LICENSE",
        "precisions": ["FP16", "FP32"],
        "quantization_output_precisions": [],
        "subdirectory": "public/text-recognition-resnet-fc",
        "task_type": "optical_character_recognition",
    },
]

for model_info in (detection_model_info, recognition_model_info):
    omz_dir = Path(model_info["subdirectory"])
    omz_model_dir = base_model_dir / omz_dir / precision
    print(omz_model_dir)
    for model_file in omz_model_dir.iterdir():
        try:
            shutil.copyfile(model_file, model_dir / model_file.name)
        except FileExistsError:
            pass

detection_model_path = (model_dir / detection_model).with_suffix(".xml")
recognition_model_path = (model_dir / recognition_model).with_suffix(".xml")
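# Optional sanity check (not in the original snippet): confirm that both the .xml and .bin
# IR files were copied into `model_dir` before they are read below.
for model_path in (detection_model_path, recognition_model_path):
    assert model_path.exists(), f"{model_path} not found"
    assert model_path.with_suffix(".bin").exists(), f"{model_path.with_suffix('.bin')} not found"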
# Load Detection Model
detection_model = ie.read_model(
    model=detection_model_path, weights=detection_model_path.with_suffix(".bin")
)
detection_compiled_model = ie.compile_model(model=detection_model, device_name="CPU")
detection_input_layer = detection_compiled_model.input(0)

# Load an Image
# image_file can point to a URL or a local image
image_file = "https://github.com/openvinotoolkit/openvino_notebooks/raw/main/notebooks/004-hello-detection/data/intel_rnb.jpg"
image = load_image(image_file)

# N, C, H, W = batch size, number of channels, height, width
N, C, H, W = detection_input_layer.shape

# Resize the image to the network's expected input size
resized_image = cv2.resize(image, (W, H))

# Reshape to the network input shape
input_image = np.expand_dims(resized_image.transpose(2, 0, 1), 0)

plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB));

# Do Inference
output_key = detection_compiled_model.output("boxes")
boxes = detection_compiled_model([input_image])[output_key]

# Remove zero-only boxes
boxes = boxes[~np.all(boxes == 0, axis=1)]

# Get Detection Results
def multiply_by_ratio(ratio_x, ratio_y, box):
    return [
        max(shape * ratio_y, 10) if idx % 2 else shape * ratio_x
        for idx, shape in enumerate(box[:-1])
    ]


def run_preprocesing_on_crop(crop, net_shape):
    temp_img = cv2.resize(crop, net_shape)
    temp_img = temp_img.reshape((1,) * 2 + temp_img.shape)
    return temp_img


def convert_result_to_image(bgr_image, resized_image, boxes, threshold=0.3, conf_labels=True):
    # Define colors for boxes and descriptions
    colors = {"red": (255, 0, 0), "green": (0, 255, 0), "white": (255, 255, 255)}

    # Fetch image shapes to calculate the scaling ratio
    (real_y, real_x), (resized_y, resized_x) = bgr_image.shape[:2], resized_image.shape[:2]
    ratio_x, ratio_y = real_x / resized_x, real_y / resized_y

    # Convert the base image from BGR to RGB format
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # Iterate through non-zero boxes
    for box, annotation in boxes:
        # The confidence factor is in the last place in the array
        conf = box[-1]
        if conf > threshold:
            # Convert floats to ints and scale each box position by the x and y ratio
            (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, box))

            # Draw a box based on the position. Parameters of the rectangle function are:
            # image, start_point, end_point, color, thickness
            cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["green"], 3)

            # Add text to the image based on position and confidence. Parameters of the putText
            # function are: image, text, bottom_left_corner, font, font_scale, color, thickness, line_type
            if conf_labels:
                # Create a background box sized to the annotation text
                (text_w, text_h), _ = cv2.getTextSize(
                    f"{annotation}", cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1
                )
                image_copy = rgb_image.copy()
                cv2.rectangle(
                    image_copy,
                    (x_min, y_min - text_h - 10),
                    (x_min + text_w, y_min - 10),
                    colors["white"],
                    -1,
                )
                # Blend in the image copy with the white box under the text
                cv2.addWeighted(image_copy, 0.4, rgb_image, 0.6, 0, rgb_image)
                cv2.putText(
                    rgb_image,
                    f"{annotation}",
                    (x_min, y_min - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.8,
                    colors["red"],
                    1,
                    cv2.LINE_AA,
                )

    return rgb_image
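# Illustration only (an assumption based on the horizontal-text-detection-0001 documentation,
# not part of the original snippet): each row of `boxes` is
# [x_min, y_min, x_max, y_max, confidence], with coordinates in the resized network input space.
# The boxes could, for example, be filtered by confidence before recognition:
high_confidence_boxes = boxes[boxes[:, -1] > 0.3]
print(f"{len(high_confidence_boxes)} of {len(boxes)} boxes have confidence > 0.3")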
# Load Text Recognition Model
recognition_model = ie.read_model(
    model=recognition_model_path, weights=recognition_model_path.with_suffix(".bin")
)
recognition_compiled_model = ie.compile_model(model=recognition_model, device_name="CPU")

recognition_output_layer = recognition_compiled_model.output(0)
recognition_input_layer = recognition_compiled_model.input(0)

# Get the height and width of the input layer
_, _, H, W = recognition_input_layer.shape

# Do Inference
# Calculate the scale for image resizing
(real_y, real_x), (resized_y, resized_x) = image.shape[:2], resized_image.shape[:2]
ratio_x, ratio_y = real_x / resized_x, real_y / resized_y

# Convert the image to grayscale for the text recognition model
grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Get the dictionary to decode the output, based on the model documentation
letters = "~0123456789abcdefghijklmnopqrstuvwxyz"

# Prepare empty lists for annotations and crops
annotations = list()
cropped_images = list()

# For each crop, based on the boxes given by the detection model, get the annotation
for i, crop in enumerate(boxes):
    # Get the coordinates of the corners of the crop
    (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, crop))
    image_crop = run_preprocesing_on_crop(grayscale_image[y_min:y_max, x_min:x_max], (W, H))

    # Run inference with the recognition model
    result = recognition_compiled_model([image_crop])[recognition_output_layer]

    # Squeeze the output to remove the unnecessary dimension
    recognition_results_test = np.squeeze(result)

    # Read the annotation based on the probabilities from the output layer
    annotation = list()
    for letter in recognition_results_test:
        parsed_letter = letters[letter.argmax()]

        # Returning index 0 from argmax signals the end of the string
        if parsed_letter == letters[0]:
            break
        annotation.append(parsed_letter)
    annotations.append("".join(annotation))
    # Convert the BGR crop to RGB so that PIL displays the colors correctly
    cropped_image = Image.fromarray(cv2.cvtColor(image[y_min:y_max, x_min:x_max], cv2.COLOR_BGR2RGB))
    cropped_images.append(cropped_image)

boxes_with_annotations = list(zip(boxes, annotations))

# Show Detected Text Boxes and OCR Results for the Image
plt.figure(figsize=(12, 12))
plt.imshow(convert_result_to_image(image, resized_image, boxes_with_annotations, conf_labels=True));

# Show the OCR Result per Bounding Box
for cropped_image, annotation in zip(cropped_images, annotations):
    display(cropped_image, Markdown(annotation))

# Print Annotations in Plain Text Format, sorted by distance from the top-left corner
[
    annotation
    for _, annotation in sorted(
        zip(boxes, annotations), key=lambda x: x[0][0] ** 2 + x[0][1] ** 2
    )
]
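# Optional follow-up (not in the original snippet): save the annotated image to disk.
# convert_result_to_image returns an RGB array, so it is converted back to BGR for cv2.imwrite.
# "annotated_text.jpg" is just a placeholder file name.
annotated_image = convert_result_to_image(image, resized_image, boxes_with_annotations, conf_labels=True)
cv2.imwrite("annotated_text.jpg", cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))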