201-vision-monodepth: Monodepth Estimation with OpenVINO
Thu Jun 16 2022 14:41:17 GMT+0000 (Coordinated Universal Time)
Saved by @OpenVINOtoolkit #python #openvino #openvino-notebooks #monodepth-estimation
# Preparation
# Imports
import sys
import time
from pathlib import Path
import cv2
import matplotlib.cm
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import (
HTML,
FileLink,
Pretty,
ProgressBar,
Video,
clear_output,
display,
)
from openvino.runtime import Core
sys.path.append("../utils")
from notebook_utils import load_image
# Settings
DEVICE = "CPU"
MODEL_FILE = "model/MiDaS_small.xml"
model_xml_path = Path(MODEL_FILE)
# Functions
def normalize_minmax(data):
"""Normalizes the values in `data` between 0 and 1"""
return (data - data.min()) / (data.max() - data.min())
def convert_result_to_image(result, colormap="viridis"):
"""
Convert network result of floating point numbers to an RGB image with
integer values from 0-255 by applying a colormap.
`result` is expected to be a single network result in 1,H,W shape
`colormap` is a matplotlib colormap.
See https://matplotlib.org/stable/tutorials/colors/colormaps.html
"""
cmap = matplotlib.cm.get_cmap(colormap)
result = result.squeeze(0)
result = normalize_minmax(result)
result = cmap(result)[:, :, :3] * 255
result = result.astype(np.uint8)
return result
def to_rgb(image_data) -> np.ndarray:
"""
Convert image_data from BGR to RGB
"""
return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
# Load the Model
ie = Core()
model = ie.read_model(model=model_xml_path, weights=model_xml_path.with_suffix(".bin"))
compiled_model = ie.compile_model(model=model, device_name=DEVICE)
input_key = compiled_model.input(0)
output_key = compiled_model.output(0)
network_input_shape = list(input_key.shape)
network_image_height, network_image_width = network_input_shape[2:]
# Load, resize and reshape input image
IMAGE_FILE = "data/coco_bike.jpg"
image = load_image(path=IMAGE_FILE)
# resize to input shape for network
resized_image = cv2.resize(src=image, dsize=(network_image_height, network_image_width))
# reshape image to network input shape NCHW
input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0)
# Do inference on image
result = compiled_model([input_image])[output_key]
# convert network result of disparity map to an image that shows
# distance as colors
result_image = convert_result_to_image(result=result)
# resize back to original image shape. cv2.resize expects shape
# in (width, height), [::-1] reverses the (height, width) shape to match this
result_image = cv2.resize(result_image, image.shape[:2][::-1])
# Display monodepth image
fig, ax = plt.subplots(1, 2, figsize=(20, 15))
ax[0].imshow(to_rgb(image))
ax[1].imshow(result_image);
# Video Settings
# Video source: https://www.youtube.com/watch?v=fu1xcQdJRws (Public Domain)
VIDEO_FILE = "data/Coco Walking in Berkeley.mp4"
# Number of seconds of input video to process. Set to 0 to process
# the full video.
NUM_SECONDS = 4
# Set ADVANCE_FRAMES to 1 to process every frame from the input video
# Set ADVANCE_FRAMES to 2 to process every second frame. This reduces
# the time it takes to process the video
ADVANCE_FRAMES = 2
# Set SCALE_OUTPUT to reduce the size of the result video
# If SCALE_OUTPUT is 0.5, the width and height of the result video
# will be half the width and height of the input video
SCALE_OUTPUT = 0.5
# The format to use for video encoding. vp09 is slow,
# but it works on most systems.
# Try the THEO encoding if you have FFMPEG installed.
# FOURCC = cv2.VideoWriter_fourcc(*"THEO")
FOURCC = cv2.VideoWriter_fourcc(*"vp09")
# Create Path objects for the input video and the resulting video
output_directory = Path("output")
output_directory.mkdir(exist_ok=True)
result_video_path = output_directory / f"{Path(VIDEO_FILE).stem}_monodepth.mp4"
# Load Video
cap = cv2.VideoCapture(str(VIDEO_FILE))
ret, image = cap.read()
if not ret:
raise ValueError(f"The video at {VIDEO_FILE} cannot be read.")
input_fps = cap.get(cv2.CAP_PROP_FPS)
input_video_frame_height, input_video_frame_width = image.shape[:2]
target_fps = input_fps / ADVANCE_FRAMES
target_frame_height = int(input_video_frame_height * SCALE_OUTPUT)
target_frame_width = int(input_video_frame_width * SCALE_OUTPUT)
cap.release()
print(
f"The input video has a frame width of {input_video_frame_width}, "
f"frame height of {input_video_frame_height} and runs at {input_fps:.2f} fps"
)
print(
"The monodepth video will be scaled with a factor "
f"{SCALE_OUTPUT}, have width {target_frame_width}, "
f" height {target_frame_height}, and run at {target_fps:.2f} fps"
)
# Do Inference on a Video and Create Monodepth Video
# Initialize variables
input_video_frame_nr = 0
start_time = time.perf_counter()
total_inference_duration = 0
# Open input video
cap = cv2.VideoCapture(str(VIDEO_FILE))
# Create result video
out_video = cv2.VideoWriter(
str(result_video_path),
FOURCC,
target_fps,
(target_frame_width * 2, target_frame_height),
)
num_frames = int(NUM_SECONDS * input_fps)
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) if num_frames == 0 else num_frames
progress_bar = ProgressBar(total=total_frames)
progress_bar.display()
try:
while cap.isOpened():
ret, image = cap.read()
if not ret:
cap.release()
break
if input_video_frame_nr >= total_frames:
break
# Only process every second frame
# Prepare frame for inference
# resize to input shape for network
resized_image = cv2.resize(src=image, dsize=(network_image_height, network_image_width))
# reshape image to network input shape NCHW
input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0)
# Do inference
inference_start_time = time.perf_counter()
result = compiled_model([input_image])[output_key]
inference_stop_time = time.perf_counter()
inference_duration = inference_stop_time - inference_start_time
total_inference_duration += inference_duration
if input_video_frame_nr % (10 * ADVANCE_FRAMES) == 0:
clear_output(wait=True)
progress_bar.display()
# input_video_frame_nr // ADVANCE_FRAMES gives the number of
# frames that have been processed by the network
display(
Pretty(
f"Processed frame {input_video_frame_nr // ADVANCE_FRAMES}"
f"/{total_frames // ADVANCE_FRAMES}. "
f"Inference time per frame: {inference_duration:.2f} seconds "
f"({1/inference_duration:.2f} FPS)"
)
)
# Transform network result to RGB image
result_frame = to_rgb(convert_result_to_image(result))
# Resize image and result to target frame shape
result_frame = cv2.resize(result_frame, (target_frame_width, target_frame_height))
image = cv2.resize(image, (target_frame_width, target_frame_height))
# Put image and result side by side
stacked_frame = np.hstack((image, result_frame))
# Save frame to video
out_video.write(stacked_frame)
input_video_frame_nr = input_video_frame_nr + ADVANCE_FRAMES
cap.set(1, input_video_frame_nr)
progress_bar.progress = input_video_frame_nr
progress_bar.update()
except KeyboardInterrupt:
print("Processing interrupted.")
finally:
clear_output()
processed_frames = num_frames // ADVANCE_FRAMES
out_video.release()
cap.release()
end_time = time.perf_counter()
duration = end_time - start_time
print(
f"Processed {processed_frames} frames in {duration:.2f} seconds. "
f"Total FPS (including video processing): {processed_frames/duration:.2f}."
f"Inference FPS: {processed_frames/total_inference_duration:.2f} "
)
print(f"Monodepth Video saved to '{str(result_video_path)}'.")
# Display Monodepth Video
video = Video(result_video_path, width=800, embed=True)
if not result_video_path.exists():
plt.imshow(stacked_frame)
raise ValueError("OpenCV was unable to write the video file. Showing one video frame.")
else:
print(f"Showing monodepth video saved at\n{result_video_path.resolve()}")
print(
"If you cannot see the video in your browser, please click on the "
"following link to download the video "
)
video_link = FileLink(result_video_path)
video_link.html_link_str = "<a href='%s' download>%s</a>"
display(HTML(video_link._repr_html_()))
display(video)
Monocular Depth Estimation is the task of estimating scene depth using a single image. It has many potential applications in robotics, 3D reconstruction, medical imaging and autonomous systems. For this demo, we use a neural network model called MiDaS which was developed by the Intelligent Systems Lab at Intel. If you have not yet installed OpenVINO™, please follow the Installation Guide to install all required dependencies. https://github.com/openvinotoolkit/openvino_notebooks/blob/main/README.md#-installation-guide Link to .bin and .xml files: https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/public/midasnet/model.yml
https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/201-vision-monodepth/201-vision-monodepth.ipynb


Comments