# How to install the prerequisites:
# Install the OpenVINO™ toolkit Python package
# pip3 install openvino
# Install ONNX Runtime for the OpenVINO™ Execution Provider
# pip3 install onnxruntime-openvino==1.11.0
# Install the remaining dependencies
# pip3 install -r requirements.txt
# How to run the sample:
# python3 tiny_yolov2_obj_detection_sample.py --help
# Running the ONNX Runtime OpenVINO™ Execution Provider sample:
# python3 tiny_yolov2_obj_detection_sample.py --video face-demographics-walking-and-pause.mp4 --model tinyyolov2.onnx --device CPU_FP32
'''
Copyright (C) 2021-2022, Intel Corporation
SPDX-License-Identifier: Apache-2.0
'''
import numpy as np
import onnxruntime as rt
import cv2
import time
import os
import argparse
import platform
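# On Windows the OpenVINO runtime DLLs have to be added to the DLL search path before
# onnxruntime-openvino can load the OpenVINO Execution Provider.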
if platform.system() == "Windows":
from openvino import utils
utils.add_openvino_libs_to_path()
# Color look-up table: one BGR color per detectable class, used when drawing boxes and labels
clut = [(0,0,0),(255,0,0),(255,0,255),(0,0,255),(0,255,0),(0,255,128),
(128,255,0),(128,128,0),(0,128,255),(128,0,128),
(255,0,128),(128,0,255),(255,128,128),(128,255,128),(255,255,0),
(255,128,128),(128,128,255),(255,128,128),(128,255,128),(128,255,128)]
# The 20 Pascal VOC class labels that the Tiny YOLOv2 model can detect, in the order the model emits its class scores
label = ["aeroplane","bicycle","bird","boat","bottle",
"bus","car","cat","chair","cow","diningtable",
"dog","horse","motorbike","person","pottedplant",
"sheep","sofa","train","tvmonitor"]
def parse_arguments():
    parser = argparse.ArgumentParser(description='Object detection with Tiny YOLOv2 in OpenCV using the OpenVINO Execution Provider for ONNX Runtime')
parser.add_argument('--device', default='CPU_FP32', help="Device to perform inference on 'cpu (MLAS)' or on devices supported by OpenVINO-EP [CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16].")
parser.add_argument('--video', help='Path to video file.')
parser.add_argument('--model', help='Path to model.')
args = parser.parse_args()
return args
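# Activation helpers used to decode the raw Tiny YOLOv2 output: sigmoid squashes the box offsets
# and the objectness score into (0, 1), and softmax turns the 20 raw class scores of a box into a
# probability distribution.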
def sigmoid(x, derivative=False):
return x*(1-x) if derivative else 1/(1+np.exp(-x))
def softmax(x):
score_mat_exp = np.exp(np.asarray(x))
return score_mat_exp / score_mat_exp.sum(0)
def check_model_extension(fp):
    # Split the extension from the path and normalise it to lowercase.
    ext = os.path.splitext(fp)[-1].lower()
    if ext != ".onnx":
        raise Exception(fp + " is an unknown file format. Use a model file with the .onnx extension")
    if not os.path.exists(fp):
        raise Exception("[ ERROR ] Path of the onnx model file is invalid")
def check_video_file_extension(fp):
    # Split the extension from the path and normalise it to lowercase.
    ext = os.path.splitext(fp)[-1].lower()
    if ext not in (".mp4", ".avi", ".mov"):
        raise Exception(fp + " is an unknown file format. Use a video file with the .mp4, .avi or .mov extension")
    if not os.path.exists(fp):
        raise Exception("[ ERROR ] Path of the video file is invalid")
def image_preprocess(frame):
in_frame = cv2.resize(frame, (416, 416))
preprocessed_image = np.asarray(in_frame)
preprocessed_image = preprocessed_image.astype(np.float32)
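    # OpenCV delivers the frame as HWC with BGR channel order; the model expects NCHW,
    # so move the channel axis to the front before adding the batch dimension below.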
preprocessed_image = preprocessed_image.transpose(2,0,1)
#Reshaping the input array to align with the input shape of the model
preprocessed_image = preprocessed_image.reshape(1,3,416,416)
return preprocessed_image
def postprocess_output(out, frame, x_scale, y_scale, i):
out = out[0][0]
num_classes = 20
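    # Five anchor boxes as (width, height) pairs measured in grid-cell units, matching the Tiny YOLOv2 (VOC) configuration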
anchors = [1.08, 1.19, 3.42, 4.41, 6.63, 11.38, 9.42, 5.11, 16.62, 10.52]
existing_labels = {l: [] for l in label}
#Inside this loop we compute the bounding box b for grid cell (cy, cx)
for cy in range(0,13):
for cx in range(0,13):
for b in range(0,5):
# First we read the tx, ty, width(tw), and height(th) for the bounding box from the out array, as well as the confidence score
channel = b*(num_classes+5)
tx = out[channel ][cy][cx]
ty = out[channel+1][cy][cx]
tw = out[channel+2][cy][cx]
th = out[channel+3][cy][cx]
tc = out[channel+4][cy][cx]
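                # Decode to pixel coordinates in the 416x416 network input: the 13x13 output grid has a
                # stride of 416/13 = 32 px per cell, box centres are offset inside their cell via sigmoid,
                # and widths/heights scale the per-box anchors by exp(tw)/exp(th).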
x = (float(cx) + sigmoid(tx))*32
y = (float(cy) + sigmoid(ty))*32
w = np.exp(tw) * 32 * anchors[2*b]
h = np.exp(th) * 32 * anchors[2*b+1]
#calculating the confidence score
confidence = sigmoid(tc) # The confidence value for the bounding box is given by tc
classes = np.zeros(num_classes)
for c in range(0,num_classes):
classes[c] = out[channel + 5 +c][cy][cx]
# we take the softmax to turn the array into a probability distribution. And then we pick the class with the largest score as the winner.
classes = softmax(classes)
detected_class = classes.argmax()
# Now we can compute the final score for this bounding box and we only want to keep the ones whose combined score is over a certain threshold
                if classes[detected_class] * confidence > 0.60:
                    color = clut[detected_class]
x = (x - w/2)*x_scale
y = (y - h/2)*y_scale
w *= x_scale
h *= y_scale
labelX = int((x+x+w)/2)
labelY = int((y+y+h)/2)
addLabel = True
lab_threshold = 100
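                        # Suppress duplicate detections: skip drawing if a box of the same class was already
                        # labelled within lab_threshold (100 px) of this box's label position.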
for point in existing_labels[label[detected_class]]:
if labelX < point[0] + lab_threshold and labelX > point[0] - lab_threshold and \
labelY < point[1] + lab_threshold and labelY > point[1] - lab_threshold:
addLabel = False
#Adding class labels to the output of the frame and also drawing a rectangular bounding box around the object detected.
if addLabel:
cv2.rectangle(frame, (int(x),int(y)),(int(x+w),int(y+h)),color,2)
cv2.rectangle(frame, (int(x),int(y-13)),(int(x)+9*len(label[detected_class]),int(y)),color,-1)
cv2.putText(frame,label[detected_class],(int(x)+2,int(y)-3),cv2.FONT_HERSHEY_COMPLEX,0.4,(255,255,255),1)
existing_labels[label[detected_class]].append((labelX,labelY))
print('{} detected in frame {}'.format(label[detected_class],i))
def show_bbox(device, frame, inference_time):
cv2.putText(frame,device,(10,20),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
    cv2.putText(frame,'FPS: {:.2f}'.format(1.0/inference_time),(10,40),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,255,255),1)
    # cv2.imshow expects BGR data, which is what the frame already is, so no colour conversion is needed before display
    cv2.imshow('frame',frame)
def main():
# Process arguments
args = parse_arguments()
# Validate model file path
check_model_extension(args.model)
so = rt.SessionOptions()
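    # log_severity_level 3 limits ONNX Runtime's own logging to errors and above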
so.log_severity_level = 3
if (args.device == 'cpu'):
print("Device type selected is 'cpu' which is the default CPU Execution Provider (MLAS)")
#Specify the path to the ONNX model on your machine and register the CPU EP
sess = rt.InferenceSession(args.model, so, providers=['CPUExecutionProvider'])
    elif args.device in ('CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16'):
#Specify the path to the ONNX model on your machine and register the OpenVINO EP
sess = rt.InferenceSession(args.model, so, providers=['OpenVINOExecutionProvider'], provider_options=[{'device_type' : args.device}])
print("Device type selected is: " + args.device + " using the OpenVINO Execution Provider")
        '''
        Other 'device_type' options (any of these hardware targets can be used if you have access to it):
        'CPU_FP32', 'GPU_FP32', 'GPU_FP16', 'MYRIAD_FP16', 'VAD-M_FP16'
        '''
else:
raise Exception("Device type selected is not [cpu, CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VADM_FP16]")
# Get the input name of the model
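    # sess.run is fed a dict keyed by input tensor name, so read the name from the loaded model rather than hard-coding it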
input_name = sess.get_inputs()[0].name
#validate video file input path
check_video_file_extension(args.video)
#Path to video file has to be provided
cap = cv2.VideoCapture(args.video)
# capturing different metrics of the image from the video
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    x_scale = float(width)/416.0  # Tiny YOLOv2 takes a (1,3,416,416) input, so these factors map 416x416 coordinates back to the original frame size
    y_scale = float(height)/416.0
    # writing the inference output as a video to the local disk
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    output_video_name = args.device + "_output.avi"
    # The writer's frame size must match the frames passed to write(), so use the capture's dimensions rather than a hard-coded size
    output_video = cv2.VideoWriter(output_video_name, fourcc, fps if fps > 0 else 17.0, (width, height))
# capturing one frame at a time from the video feed and performing the inference
i = 0
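    # cv2.waitKey(1) returns -1 while no key is pressed, so the loop runs until a key press or until the video ends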
while cv2.waitKey(1) < 0:
l_start = time.time()
ret, frame = cap.read()
if not ret:
break
        initial_w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
        initial_h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
        # preprocessing the input frame and reshaping it.
        # Tiny YOLOv2 expects an input of shape (1,3,416,416), so the frame is resized and reshaped to match that size.
preprocessed_image = image_preprocess(frame)
start = time.time()
#Running the session by passing in the input data of the model
out = sess.run(None, {input_name: preprocessed_image})
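        # out[0] is the model's single output tensor, expected to be of shape (1, 125, 13, 13):
        # 5 boxes per 13x13 grid cell, each with 4 box offsets, an objectness score and 20 class scores (5 * 25 = 125 channels)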
end = time.time()
inference_time = end - start
#Get the output
postprocess_output(out, frame, x_scale, y_scale, i)
#Show the Output
output_video.write(frame)
show_bbox(args.device, frame, inference_time)
        # Pressing any key while the preview window has focus stops the loop early (see the cv2.waitKey check above)
print('Processed Frame {}'.format(i))
i += 1
l_end = time.time()
print('Loop Time = {}'.format(l_end - l_start))
    # Release the capture and the writer, then close the preview window
    cap.release()
    output_video.release()
    cv2.destroyAllWindows()
if __name__ == "__main__":
main()