201-vision-monodepth: Monodepth Estimation with OpenVINO
Thu Jun 16 2022 14:41:17 GMT+0000 (Coordinated Universal Time)
Saved by @OpenVINOtoolkit #python #openvino #openvino-notebooks #monodepth-estimation
# Preparation
# Imports
import sys
import time
from pathlib import Path

import cv2
import matplotlib.cm
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import (
    HTML,
    FileLink,
    Pretty,
    ProgressBar,
    Video,
    clear_output,
    display,
)
from openvino.runtime import Core

sys.path.append("../utils")
from notebook_utils import load_image

# Settings
DEVICE = "CPU"
MODEL_FILE = "model/MiDaS_small.xml"

model_xml_path = Path(MODEL_FILE)


# Functions
def normalize_minmax(data):
    """Normalizes the values in `data` between 0 and 1"""
    return (data - data.min()) / (data.max() - data.min())


def convert_result_to_image(result, colormap="viridis"):
    """
    Convert network result of floating point numbers to an RGB image with
    integer values from 0-255 by applying a colormap.

    `result` is expected to be a single network result in 1,H,W shape
    `colormap` is a matplotlib colormap.
    See https://matplotlib.org/stable/tutorials/colors/colormaps.html
    """
    cmap = matplotlib.cm.get_cmap(colormap)
    result = result.squeeze(0)
    result = normalize_minmax(result)
    result = cmap(result)[:, :, :3] * 255
    result = result.astype(np.uint8)
    return result


def to_rgb(image_data) -> np.ndarray:
    """
    Convert image_data from BGR to RGB
    """
    return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)


# Load the Model
ie = Core()
model = ie.read_model(model=model_xml_path, weights=model_xml_path.with_suffix(".bin"))
compiled_model = ie.compile_model(model=model, device_name=DEVICE)

input_key = compiled_model.input(0)
output_key = compiled_model.output(0)

network_input_shape = list(input_key.shape)
network_image_height, network_image_width = network_input_shape[2:]

# Load, resize and reshape input image
IMAGE_FILE = "data/coco_bike.jpg"
image = load_image(path=IMAGE_FILE)

# resize to input shape for network
resized_image = cv2.resize(src=image, dsize=(network_image_height, network_image_width))

# reshape image to network input shape NCHW
input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0)

# Do inference on image
result = compiled_model([input_image])[output_key]

# convert network result of disparity map to an image that shows
# distance as colors
result_image = convert_result_to_image(result=result)

# resize back to original image shape. cv2.resize expects shape
# in (width, height), [::-1] reverses the (height, width) shape to match this
result_image = cv2.resize(result_image, image.shape[:2][::-1])

# Display monodepth image
fig, ax = plt.subplots(1, 2, figsize=(20, 15))
ax[0].imshow(to_rgb(image))
ax[1].imshow(result_image);

# Video Settings
# Video source: https://www.youtube.com/watch?v=fu1xcQdJRws (Public Domain)
VIDEO_FILE = "data/Coco Walking in Berkeley.mp4"
# Number of seconds of input video to process. Set to 0 to process
# the full video.
NUM_SECONDS = 4
# Set ADVANCE_FRAMES to 1 to process every frame from the input video
# Set ADVANCE_FRAMES to 2 to process every second frame. This reduces
# the time it takes to process the video
ADVANCE_FRAMES = 2
# Set SCALE_OUTPUT to reduce the size of the result video
# If SCALE_OUTPUT is 0.5, the width and height of the result video
# will be half the width and height of the input video
SCALE_OUTPUT = 0.5
# The format to use for video encoding. vp09 is slow,
# but it works on most systems.
# Try the THEO encoding if you have FFMPEG installed.
# FOURCC = cv2.VideoWriter_fourcc(*"THEO")
FOURCC = cv2.VideoWriter_fourcc(*"vp09")

# Create Path objects for the input video and the resulting video
output_directory = Path("output")
output_directory.mkdir(exist_ok=True)
result_video_path = output_directory / f"{Path(VIDEO_FILE).stem}_monodepth.mp4"

# Load Video
cap = cv2.VideoCapture(str(VIDEO_FILE))
ret, image = cap.read()
if not ret:
    raise ValueError(f"The video at {VIDEO_FILE} cannot be read.")
input_fps = cap.get(cv2.CAP_PROP_FPS)
input_video_frame_height, input_video_frame_width = image.shape[:2]

target_fps = input_fps / ADVANCE_FRAMES
target_frame_height = int(input_video_frame_height * SCALE_OUTPUT)
target_frame_width = int(input_video_frame_width * SCALE_OUTPUT)

cap.release()
print(
    f"The input video has a frame width of {input_video_frame_width}, "
    f"frame height of {input_video_frame_height} and runs at {input_fps:.2f} fps"
)
print(
    "The monodepth video will be scaled with a factor "
    f"{SCALE_OUTPUT}, have width {target_frame_width}, "
    f"height {target_frame_height}, and run at {target_fps:.2f} fps"
)

# Do Inference on a Video and Create Monodepth Video
# Initialize variables
input_video_frame_nr = 0
start_time = time.perf_counter()
total_inference_duration = 0

# Open input video
cap = cv2.VideoCapture(str(VIDEO_FILE))

# Create result video
out_video = cv2.VideoWriter(
    str(result_video_path),
    FOURCC,
    target_fps,
    (target_frame_width * 2, target_frame_height),
)

num_frames = int(NUM_SECONDS * input_fps)
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) if num_frames == 0 else num_frames
progress_bar = ProgressBar(total=total_frames)
progress_bar.display()

try:
    while cap.isOpened():
        ret, image = cap.read()
        if not ret:
            cap.release()
            break

        if input_video_frame_nr >= total_frames:
            break

        # Only process every second frame
        # Prepare frame for inference
        # resize to input shape for network
        resized_image = cv2.resize(src=image, dsize=(network_image_height, network_image_width))
        # reshape image to network input shape NCHW
        input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0)

        # Do inference
        inference_start_time = time.perf_counter()
        result = compiled_model([input_image])[output_key]
        inference_stop_time = time.perf_counter()
        inference_duration = inference_stop_time - inference_start_time
        total_inference_duration += inference_duration

        if input_video_frame_nr % (10 * ADVANCE_FRAMES) == 0:
            clear_output(wait=True)
            progress_bar.display()
            # input_video_frame_nr // ADVANCE_FRAMES gives the number of
            # frames that have been processed by the network
            display(
                Pretty(
                    f"Processed frame {input_video_frame_nr // ADVANCE_FRAMES}"
                    f"/{total_frames // ADVANCE_FRAMES}. "
                    f"Inference time per frame: {inference_duration:.2f} seconds "
                    f"({1/inference_duration:.2f} FPS)"
                )
            )

        # Transform network result to RGB image
        result_frame = to_rgb(convert_result_to_image(result))
        # Resize image and result to target frame shape
        result_frame = cv2.resize(result_frame, (target_frame_width, target_frame_height))
        image = cv2.resize(image, (target_frame_width, target_frame_height))
        # Put image and result side by side
        stacked_frame = np.hstack((image, result_frame))
        # Save frame to video
        out_video.write(stacked_frame)

        input_video_frame_nr = input_video_frame_nr + ADVANCE_FRAMES
        cap.set(cv2.CAP_PROP_POS_FRAMES, input_video_frame_nr)

        progress_bar.progress = input_video_frame_nr
        progress_bar.update()

except KeyboardInterrupt:
    print("Processing interrupted.")
finally:
    clear_output()
    processed_frames = num_frames // ADVANCE_FRAMES
    out_video.release()
    cap.release()
    end_time = time.perf_counter()
    duration = end_time - start_time
    print(
        f"Processed {processed_frames} frames in {duration:.2f} seconds. "
        f"Total FPS (including video processing): {processed_frames/duration:.2f}. "
        f"Inference FPS: {processed_frames/total_inference_duration:.2f}"
    )
    print(f"Monodepth Video saved to '{str(result_video_path)}'.")

# Display Monodepth Video
video = Video(result_video_path, width=800, embed=True)
if not result_video_path.exists():
    plt.imshow(stacked_frame)
    raise ValueError("OpenCV was unable to write the video file. Showing one video frame.")
else:
    print(f"Showing monodepth video saved at\n{result_video_path.resolve()}")
    print(
        "If you cannot see the video in your browser, please click on the "
        "following link to download the video "
    )
    video_link = FileLink(result_video_path)
    video_link.html_link_str = "<a href='%s' download>%s</a>"
    display(HTML(video_link._repr_html_()))
    display(video)
Monocular depth estimation is the task of estimating scene depth from a single image. It has many potential applications in robotics, 3D reconstruction, medical imaging, and autonomous systems. This demo uses a neural network model called MiDaS, developed by the Intelligent Systems Lab at Intel. If you have not yet installed OpenVINO™, follow the Installation Guide to install all required dependencies: https://github.com/openvinotoolkit/openvino_notebooks/blob/main/README.md#-installation-guide
Link to the model's .bin and .xml files: https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/public/midasnet/model.yml
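The code above assumes the IR files already exist at model/MiDaS_small.xml and model/MiDaS_small.bin. If you need to obtain a MiDaS IR yourself, the following is a minimal sketch of one possible way, assuming the openvino-dev package (which provides the omz_downloader and omz_converter console tools) is installed. The model name "midasnet" comes from the Open Model Zoo link above; the base_model_dir path and the resulting IR location are assumptions, so MODEL_FILE would need to be pointed at the converted .xml file.

import subprocess

base_model_dir = "model"      # assumed download/output location
omz_model_name = "midasnet"   # model name from the Open Model Zoo link above

# Download the original MiDaS weights from Open Model Zoo ...
subprocess.run(
    ["omz_downloader", "--name", omz_model_name, "--output_dir", base_model_dir],
    check=True,
)
# ... and convert them to OpenVINO IR (.xml / .bin).
# The converted files typically end up under
# model/public/midasnet/FP16/ (path is an assumption); update MODEL_FILE accordingly.
subprocess.run(
    ["omz_converter", "--name", omz_model_name, "--download_dir", base_model_dir,
     "--output_dir", base_model_dir, "--precisions", "FP16"],
    check=True,
)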
https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/201-vision-monodepth/201-vision-monodepth.ipynb