# Imports
import shutil
import sys
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Markdown, display
from PIL import Image
from openvino.runtime import Core
from yaspin import yaspin

from notebook_utils import load_image

# Settings
ie = Core()

model_dir = Path("model")
precision = "FP16"
detection_model = "horizontal-text-detection-0001"
recognition_model = "text-recognition-resnet-fc"
base_model_dir = Path("~/open_model_zoo_models").expanduser()
omz_cache_dir = Path("~/open_model_zoo_cache").expanduser()


# Download Models
download_command = f"omz_downloader --name {detection_model},{recognition_model} --output_dir {base_model_dir} --cache_dir {omz_cache_dir} --precision {precision}"
display(Markdown(f"Download command: `{download_command}`"))
with yaspin(text=f"Downloading {detection_model}, {recognition_model}") as sp:
    download_result = !$download_command
    sp.text = f"Finished downloading {detection_model}, {recognition_model}"

# Convert Models
convert_command = f"omz_converter --name {recognition_model} --precisions {precision} --download_dir {base_model_dir} --output_dir {base_model_dir}"
display(Markdown(f"Convert command: `{convert_command}`"))
display(Markdown(f"Converting {recognition_model}..."))
! $convert_command

# Copy Models
models_info_output = %sx omz_info_dumper --name $detection_model,$recognition_model
print(f'sx omz_info_dumper --name {detection_model},{recognition_model}')
detection_model_info, recognition_model_info = [
        "name": "horizontal-text-detection-0001",
        "composite_model_name": None,
        "description": "Horizontal text detector based on FCOS with light MobileNetV2 backbone",
        "framework": "dldt",
        "license_url": "",
        "precisions": [
        "quantization_output_precisions": [],
        "subdirectory": "intel/horizontal-text-detection-0001",
        "task_type": "detection"
        "name": "text-recognition-resnet-fc",
        "composite_model_name": None,
        "description": "\"text-recognition-resnet-fc\" is a simple and preformant scene text recognition model based on ResNet with Fully Connected text recognition head. Source implementation on a PyTorch* framework could be found here <>. Model is able to recognize alphanumeric text.",
        "framework": "pytorch",
        "license_url": "",
        "precisions": [
        "quantization_output_precisions": [],
        "subdirectory": "public/text-recognition-resnet-fc",
        "task_type": "optical_character_recognition"

for model_info in (detection_model_info, recognition_model_info):
    omz_dir = Path(model_info["subdirectory"])
    omz_model_dir = base_model_dir / omz_dir / precision
    for model_file in omz_model_dir.iterdir():
            shutil.copyfile(model_file, model_dir /
        except FileExistsError:

detection_model_path = (model_dir / detection_model).with_suffix(".xml")
recognition_model_path = (model_dir / recognition_model).with_suffix(".xml")

# Load Detection Model
detection_model = ie.read_model(
    model=detection_model_path, weights=detection_model_path.with_suffix(".bin")
detection_compiled_model = ie.compile_model(model=detection_model, device_name="CPU")

detection_input_layer = detection_compiled_model.input(0)

# Load an Image
# image_file can point to a URL or local image
image_file = ""

image = load_image(image_file)

# N,C,H,W = batch size, number of channels, height, width
N, C, H, W = detection_input_layer.shape

# Resize image to meet network expected input sizes
resized_image = cv2.resize(image, (W, H))

# Reshape to network input shape
input_image = np.expand_dims(resized_image.transpose(2, 0, 1), 0)

plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB));

# Do Inference
output_key = detection_compiled_model.output("boxes")
boxes = detection_compiled_model([input_image])[output_key]

# Remove zero only boxes
boxes = boxes[~np.all(boxes == 0, axis=1)]

# Get Detection Results
def multiply_by_ratio(ratio_x, ratio_y, box):
    return [
        max(shape * ratio_y, 10) if idx % 2 else shape * ratio_x
        for idx, shape in enumerate(box[:-1])

def run_preprocesing_on_crop(crop, net_shape):
    temp_img = cv2.resize(crop, net_shape)
    temp_img = temp_img.reshape((1,) * 2 + temp_img.shape)
    return temp_img

def convert_result_to_image(bgr_image, resized_image, boxes, threshold=0.3, conf_labels=True):
    # Define colors for boxes and descriptions
    colors = {"red": (255, 0, 0), "green": (0, 255, 0), "white": (255, 255, 255)}

    # Fetch image shapes to calculate ratio
    (real_y, real_x), (resized_y, resized_x) = image.shape[:2], resized_image.shape[:2]
    ratio_x, ratio_y = real_x / resized_x, real_y / resized_y

    # Convert base image from bgr to rgb format
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # Iterate through non-zero boxes
    for box, annotation in boxes:
        # Pick confidence factor from last place in array
        conf = box[-1]
        if conf > threshold:
            # Convert float to int and multiply position of each box by x and y ratio
            (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, box))

            # Draw box based on position, parameters in rectangle function are: image, start_point, end_point, color, thickness
            cv2.rectangle(rgb_image, (x_min, y_min), (x_max, y_max), colors["green"], 3)

            # Add text to image based on position and confidence, parameters in putText function are: image, text, bottomleft_corner_textfield, font, font_scale, color, thickness, line_type
            if conf_labels:
                # Create background box based on annotation length
                (text_w, text_h), _ = cv2.getTextSize(
                    f"{annotation}", cv2.FONT_HERSHEY_TRIPLEX, 0.8, 1
                image_copy = rgb_image.copy()
                    (x_min, y_min - text_h - 10),
                    (x_min + text_w, y_min - 10),
                # Add weighted image copy with white boxes under text
                cv2.addWeighted(image_copy, 0.4, rgb_image, 0.6, 0, rgb_image)
                    (x_min, y_min - 10),

    return rgb_image

# Load Text Recognition Model
recognition_model = ie.read_model(
    model=recognition_model_path, weights=recognition_model_path.with_suffix(".bin")

recognition_compiled_model = ie.compile_model(model=recognition_model, device_name="CPU")

recognition_output_layer = recognition_compiled_model.output(0)
recognition_input_layer = recognition_compiled_model.input(0)

# Get height and width of input layer
_, _, H, W = recognition_input_layer.shape

# Do Inference
# Calculate scale for image resizing
(real_y, real_x), (resized_y, resized_x) = image.shape[:2], resized_image.shape[:2]
ratio_x, ratio_y = real_x / resized_x, real_y / resized_y

# Convert image to grayscale for text recognition model
grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Get dictionary to encode output, based on model documentation
letters = "~0123456789abcdefghijklmnopqrstuvwxyz"

# Prepare empty list for annotations
annotations = list()
cropped_images = list()
# fig, ax = plt.subplots(len(boxes), 1, figsize=(5,15), sharex=True, sharey=True)
# For each crop, based on boxes given by detection model we want to get annotations
for i, crop in enumerate(boxes):
    # Get coordinates on corners of crop
    (x_min, y_min, x_max, y_max) = map(int, multiply_by_ratio(ratio_x, ratio_y, crop))
    image_crop = run_preprocesing_on_crop(grayscale_image[y_min:y_max, x_min:x_max], (W, H))

    # Run inference with recognition model
    result = recognition_compiled_model([image_crop])[recognition_output_layer]

    # Squeeze output to remove unnececery dimension
    recognition_results_test = np.squeeze(result)

    # Read annotation based on probabilities from output layer
    annotation = list()
    for letter in recognition_results_test:
        parsed_letter = letters[letter.argmax()]

        # Returning 0 index from argmax signalises end of string
        if parsed_letter == letters[0]:
    cropped_image = Image.fromarray(image[y_min:y_max, x_min:x_max])

boxes_with_annotations = list(zip(boxes, annotations))

# Show Detected Text Boxes and OCR Results for the Image
plt.figure(figsize=(12, 12))
plt.imshow(convert_result_to_image(image, resized_image, boxes_with_annotations, conf_labels=True));

# Show the OCR Result per Bounding Box
for cropped_image, annotation in zip(cropped_images, annotations):
    display(cropped_image, Markdown("".join(annotation)))

# Print Annotations in Plain Text Format
    for _, annotation in sorted(zip(boxes, annotations), key=lambda x: x[0][0] ** 2 + x[0][1] ** 2)
downloadDownload PNG downloadDownload JPEG downloadDownload SVG

Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!

Click to optimize width for Twitter