# Imports
import shutil
import sys
from pathlib import Path
import cv2
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Markdown, display
from PIL import Image
from openvino.runtime import Core
from yaspin import yaspin
sys.path.append("../utils")
from notebook_utils import load_image
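# load_image from notebook_utils reads an image from a URL or a local path and returns it as an OpenCV-style BGR ndarray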
# Settings
ie = Core()
model_dir = Path("model")
precision = "FP16"
detection_model = "horizontal-text-detection-0001"
recognition_model = "text-recognition-resnet-fc"
base_model_dir = Path("~/open_model_zoo_models").expanduser()
omz_cache_dir = Path("~/open_model_zoo_cache").expanduser()
model_dir.mkdir(exist_ok=True)
# Download Models
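# omz_downloader fetches the Intel detection model as OpenVINO IR (.xml/.bin) and the public
# recognition model in its original framework format, storing both under base_model_dir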
download_command = f"omz_downloader --name {detection_model},{recognition_model} --output_dir {base_model_dir} --cache_dir {omz_cache_dir} --precision {precision}"
display(Markdown(f"Download command: `{download_command}`"))
with yaspin(text=f"Downloading {detection_model}, {recognition_model}") as sp:
download_result = !$download_command
print(download_result)
sp.text = f"Finished downloading {detection_model}, {recognition_model}"
sp.ok("✔")
# Convert Models
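# omz_converter invokes Model Optimizer to produce OpenVINO IR for the public recognition model;
# the Intel detection model is already distributed as IR and needs no conversion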
convert_command = f"omz_converter --name {recognition_model} --precisions {precision} --download_dir {base_model_dir} --output_dir {base_model_dir}"
display(Markdown(f"Convert command: `{convert_command}`"))
display(Markdown(f"Converting {recognition_model}..."))
! $convert_command
# Copy Models
models_info_output = %sx omz_info_dumper --name $detection_model,$recognition_model
print(f'sx omz_info_dumper --name {detection_model},{recognition_model}')
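# The dictionaries below mirror the JSON that omz_info_dumper prints for these two models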
detection_model_info, recognition_model_info = [
{
"name": "horizontal-text-detection-0001",
"composite_model_name": None,
"description": "Horizontal text detector based on FCOS with light MobileNetV2 backbone",
"framework": "dldt",
"license_url": "https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/LICENSE",
"precisions": [
"FP16",
"FP16-INT8",
"FP32"
],
"quantization_output_precisions": [],
"subdirectory": "intel/horizontal-text-detection-0001",
"task_type": "detection"
},
{
"name": "text-recognition-resnet-fc",
"composite_model_name": None,
"description": "\"text-recognition-resnet-fc\" is a simple and preformant scene text recognition model based on ResNet with Fully Connected text recognition head. Source implementation on a PyTorch* framework could be found here <https://github.com/Media-Smart/vedastr>. Model is able to recognize alphanumeric text.",
"framework": "pytorch",
"license_url": "https://raw.githubusercontent.com/Media-Smart/vedastr/0fd2a0bd7819ae4daa2a161501e9f1c2ac67e96a/LICENSE",
"precisions": [
"FP16",
"FP32"
],
"quantization_output_precisions": [],
"subdirectory": "public/text-recognition-resnet-fc",
"task_type": "optical_character_recognition"
}
]
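# Copy the IR files (.xml and .bin) for the selected precision into the local model directory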
for model_info in (detection_model_info, recognition_model_info):
omz_dir = Path(model_info["subdirectory"])
omz_model_dir = base_model_dir / omz_dir / precision
print(omz_model_dir)
for model_file in omz_model_dir.iterdir():
try:
shutil.copyfile(model_file, model_dir / model_file.name)
except FileExistsError:
pass
detection_model_path = (model_dir / detection_model).with_suffix(".xml")
recognition_model_path = (model_dir / recognition_model).with_suffix(".xml")
# Load Detection Model
detection_model = ie.read_model(
model=detection_model_path, weights=detection_model_path.with_suffix(".bin")
)
detection_compiled_model = ie.compile_model(model=detection_model, device_name="CPU")
detection_input_layer = detection_compiled_model.input(0)
# Load an Image
# image_file can point to a URL or local image
image_file = "https://github.com/openvinotoolkit/openvino_notebooks/raw/main/notebooks/004-hello-detection/data/intel_rnb.jpg"
image = load_image(image_file)
# N,C,H,W = batch size, number of channels, height, width
N, C, H, W = detection_input_layer.shape
# Resize image to meet network expected input sizes
resized_image = cv2.resize(image, (W, H))
# Reshape to network input shape
input_image = np.expand_dims(resized_image.transpose(2, 0, 1), 0)
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB));
# Do Inference
output_key = detection_compiled_model.output("boxes")
boxes = detection_compiled_model([input_image])[output_key]
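# Each row of boxes is [x_min, y_min, x_max, y_max, conf] in the coordinates of the resized input image;
# unused rows are zero-padded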
# Remove zero-only (padding) boxes
boxes = boxes[~np.all(boxes == 0, axis=1)]
# Get Detection Results
def multiply_by_ratio(ratio_x, ratio_y, box):
    # Scale box coordinates (all elements except the trailing confidence) back to the original
    # image size; odd indices are y-coordinates, which are clamped to a minimum of 10 pixels
    return [
        max(shape * ratio_y, 10) if idx % 2 else shape * ratio_x
        for idx, shape in enumerate(box[:-1])
    ]
def run_preprocessing_on_crop(crop, net_shape):
    # Resize the crop to the recognition network's input size and add batch and channel dimensions
    temp_img = cv2.resize(crop, net_shape)
    temp_img = temp_img.reshape((1,) * 2 + temp_img.shape)
    return temp_img
def convert_result_to_image(bgr_image, resized_image, boxes, threshold=0.3, conf_labels=True):
# Define colors for boxes and descriptions
colors = {"red": (255, 0, 0), "green": (0, 255, 0), "white": (255, 255, 255)}
# Fetch image shapes to calculate ratio
    (real_y, real_x), (resized_y, resized_x) = bgr_image.shape[:2], resized_image.shape[:2]
ratio_x, ratio_y = real_x / resized_x, real_y / resized_y
# Convert base image from bgr to rgb format
rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
# Iterate through non-zero boxes
for box, annotation in boxes:
# Pick confidence factor from last place in array
conf = box[-1]
if conf > threshold:
# Convert float to int and multiply position of each box by x and y ratio
(x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, box))
# Draw box based on position, parameters in rectangle function are: image, start_point, end_point, color, thickness
cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["green"], 3)
# Add text to image based on position and confidence, parameters in putText function are: image, text, bottomleft_corner_textfield, font, font_scale, color, thickness, line_type
if conf_labels:
# Create background box based on annotation length
(text_w, text_h), _ = cv2.getTextSize(
f"{annotation}", cv2.FONT_HERSHEY_TRIPLEX, 0.8, 1
)
image_copy = rgb_image.copy()
cv2.rectangle(
image_copy,
(x_min, y_min - text_h - 10),
(x_min + text_w, y_min - 10),
colors["white"],
-1,
)
# Add weighted image copy with white boxes under text
cv2.addWeighted(image_copy, 0.4, rgb_image, 0.6, 0, rgb_image)
cv2.putText(
rgb_image,
f"{annotation}",
(x_min, y_min - 10),
cv2.FONT_HERSHEY_SIMPLEX,
0.8,
colors["red"],
1,
cv2.LINE_AA,
)
return rgb_image
# Load Text Recognition Model
recognition_model = ie.read_model(
model=recognition_model_path, weights=recognition_model_path.with_suffix(".bin")
)
recognition_compiled_model = ie.compile_model(model=recognition_model, device_name="CPU")
recognition_output_layer = recognition_compiled_model.output(0)
recognition_input_layer = recognition_compiled_model.input(0)
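# The recognition model takes a single grayscale text crop with layout [1, 1, H, W]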
# Get height and width of input layer
_, _, H, W = recognition_input_layer.shape
# Do Inference
# Calculate scale for image resizing
(real_y, real_x), (resized_y, resized_x) = image.shape[:2], resized_image.shape[:2]
ratio_x, ratio_y = real_x / resized_x, real_y / resized_y
# Convert image to grayscale for text recognition model
grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Get dictionary to encode output, based on model documentation
letters = "~0123456789abcdefghijklmnopqrstuvwxyz"
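# The "~" character at index 0 acts as the blank/end-of-sequence symbol during decoding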
# Prepare empty list for annotations
annotations = list()
cropped_images = list()
# fig, ax = plt.subplots(len(boxes), 1, figsize=(5,15), sharex=True, sharey=True)
# For each box found by the detection model, crop the image and run the recognition model to get an annotation
for i, crop in enumerate(boxes):
    # Get the corner coordinates of the crop, scaled back to the original image size
(x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, crop))
    image_crop = run_preprocessing_on_crop(grayscale_image[y_min:y_max, x_min:x_max], (W, H))
# Run inference with recognition model
result = recognition_compiled_model([image_crop])[recognition_output_layer]
    # Squeeze the output to remove the unnecessary dimension
recognition_results_test = np.squeeze(result)
# Read annotation based on probabilities from output layer
annotation = list()
for letter in recognition_results_test:
parsed_letter = letters[letter.argmax()]
        # An argmax of 0 (the "~" symbol) signals the end of the string
if parsed_letter == letters[0]:
break
annotation.append(parsed_letter)
annotations.append("".join(annotation))
cropped_image = Image.fromarray(image[y_min:y_max, x_min:x_max])
cropped_images.append(cropped_image)
boxes_with_annotations = list(zip(boxes, annotations))
# Show Detected Text Boxes and OCR Results for the Image
plt.figure(figsize=(12, 12))
plt.imshow(convert_result_to_image(image, resized_image, boxes_with_annotations, conf_labels=True));
# Show the OCR Result per Bounding Box
for cropped_image, annotation in zip(cropped_images, annotations):
    display(cropped_image, Markdown(annotation))
# Print Annotations in Plain Text Format
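# Sort the boxes by the squared distance of their top-left corner from the image origin,
# so the annotations are printed roughly from the top-left of the image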
[
annotation
for _, annotation in sorted(zip(boxes, annotations), key=lambda x: x[0][0] ** 2 + x[0][1] ** 2)
]