211-speech-to-text: Speech to Text with OpenVINO

PHOTO

Fri Jun 17 2022 04:58:24 GMT+0000 (Coordinated Universal Time)

Saved by @OpenVINOtoolkit #python #openvino #openvino-notebooks #deeplearning #accelerated-inference ##nlp ##speechtotext

# Imports
from pathlib import Path

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import scipy
from openvino.runtime import Core

# Settings
model_folder = "model"
download_folder = "output"
data_folder = "data"

precision = "FP16"
model_name = "quartznet-15x5-en"

# Download Model
# Check if model is already downloaded in download directory
path_to_model_weights = Path(f'{download_folder}/public/{model_name}/models')
downloaded_model_file = list(path_to_model_weights.glob('*.pth'))

if not path_to_model_weights.is_dir() or len(downloaded_model_file) == 0:
    download_command = f"omz_downloader --name {model_name} --output_dir {download_folder} --precision {precision}"
    ! $download_command

# Convert Model
# Check if model is already converted in model directory
path_to_converted_weights = Path(f'{model_folder}/public/{model_name}/{precision}/{model_name}.bin')

if not path_to_converted_weights.is_file():
    convert_command = f"omz_converter --name {model_name} --precisions {precision} --download_dir {download_folder} --output_dir {model_folder}"
    ! $convert_command

# Defining constants
audio_file_name = "edge_to_cloud.ogg"
alphabet = " abcdefghijklmnopqrstuvwxyz'~"

# Load Audio File
audio, sampling_rate = librosa.load(path=f'{data_folder}/{audio_file_name}', sr=16000)
ipd.Audio(audio, rate=sampling_rate)

# Visualise Audio File
plt.figure()
librosa.display.waveplot(y=audio, sr=sampling_rate, max_points=50000.0, x_axis='time', offset=0.0, max_sr=1000);
plt.show()
specto_audio = librosa.stft(audio)
specto_audio = librosa.amplitude_to_db(np.abs(specto_audio), ref=np.max)
print(specto_audio.shape)
librosa.display.specshow(specto_audio, sr=sampling_rate, x_axis='time', y_axis='hz');

# Change Type of Data
if max(np.abs(audio)) <= 1:
    audio = (audio * (2**15 - 1))
audio = audio.astype(np.int16)

# Convert Audio to Mel Spectrum
def audio_to_mel(audio, sampling_rate):
    assert sampling_rate == 16000, "Only 16 KHz audio supported"
    preemph = 0.97
    preemphased = np.concatenate([audio[:1], audio[1:] - preemph * audio[:-1].astype(np.float32)])

    # Calculate window length
    win_length = round(sampling_rate * 0.02)

    # Based on previously calculated window length run short-time Fourier transform
    spec = np.abs(librosa.core.spectrum.stft(preemphased, n_fft=512, hop_length=round(sampling_rate * 0.01),
                  win_length=win_length, center=True, window=scipy.signal.windows.hann(win_length), pad_mode='reflect'))

    # Create mel filter-bank, produce transformation matrix to project current values onto Mel-frequency bins
    mel_basis = librosa.filters.mel(sampling_rate, 512, n_mels=64, fmin=0.0, fmax=8000.0, htk=False)
    return mel_basis, spec


def mel_to_input(mel_basis, spec, padding=16):
    # Convert to logarithmic scale
    log_melspectrum = np.log(np.dot(mel_basis, np.power(spec, 2)) + 2 ** -24)

    # Normalize output
    normalized = (log_melspectrum - log_melspectrum.mean(1)[:, None]) / (log_melspectrum.std(1)[:, None] + 1e-5)

    # Calculate padding
    remainder = normalized.shape[1] % padding
    if remainder != 0:
        return np.pad(normalized, ((0, 0), (0, padding - remainder)))[None]
    return normalized[None]

# Run Conversion from Audio to Mel Format
mel_basis, spec = audio_to_mel(audio=audio.flatten(), sampling_rate=sampling_rate)

# Visualise Mel Spectogram
librosa.display.specshow(data=spec, sr=sampling_rate, x_axis='time', y_axis='log');
plt.show();
librosa.display.specshow(data=mel_basis, sr=sampling_rate, x_axis='linear');
plt.ylabel('Mel filter');

# Adjust Mel scale to Input
audio = mel_to_input(mel_basis=mel_basis, spec=spec)

# Load Model
ie = Core()

model = ie.read_model(
    model=f"{model_folder}/public/{model_name}/{precision}/{model_name}.xml"
)
model_input_layer = model.input(0)
shape = model_input_layer.partial_shape
shape[2] = -1
model.reshape({model_input_layer: shape})
compiled_model = ie.compile_model(model=model, device_name="CPU")

# Do Inference
output_layer_ir = compiled_model.output(0)

character_probabilities = compiled_model([audio])[output_layer_ir]

# Read Output
# Remove unnececery dimension
character_probabilities = np.squeeze(character_probabilities)

# Run argmax to pick most possible symbols
character_probabilities = np.argmax(character_probabilities, axis=1)

# Implementation of Decoding
def ctc_greedy_decode(predictions):
    previous_letter_id = blank_id = len(alphabet) - 1
    transcription = list()
    for letter_index in predictions:
        if previous_letter_id != letter_index != blank_id:
            transcription.append(alphabet[letter_index])
        previous_letter_id = letter_index
    return ''.join(transcription)

# Run Decoding and Print Output
transcription = ctc_greedy_decode(character_probabilities)
print(transcription)

COPY

This tutorial demonstrates speech-to-text recognition with OpenVINO. For this tutorial, we use the quartznet 15x5 model. QuartzNet performs automatic speech recognition. Its design is based on the Jasper architecture, which is a convolutional model trained with Connectionist Temporal Classification (CTC) loss. The model is available from Open Model Zoo. If you have not yet installed OpenVINO™, please follow the Installation Guide to install all required dependencies. https://github.com/openvinotoolkit/openvino_notebooks/blob/main/README.md#-installation-guide Link to .bin and .xml files: https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/public/quartznet-15x5-en/model.yml

https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/211-speech-to-text/211-speech-to-text.ipynb

Save snippets that work from anywhere online with our extensions

Comments

OpenVINO™ notebooks

@OpenVINOtoolkit

001-hello-world: Hello Image Classification using OpenVINO™ toolkit 002-openvino-api: OpenVINO API tutorial 003-hello-segmentation: Introduction to Segmentation in OpenVINO 004-hello-detection: Introduction to Detection in OpenVINO 101-tensorflow-to-openvino: TensorFlow to OpenVINO Model Conversion Tutorial 102-pytorch-onnx-to-openvino: PyTorch to ONNX and OpenVINO IR Tutorial 103-paddle-onnx-to-openvino: Convert a PaddlePaddle Model to ONNX and OpenVINO IR 104-model-tools: Working with Open Model Zoo Models 210-ct-scan-live-inference: Live Inference and Benchmark CT-scan Data with OpenVINO 201-vision-monodepth: Monodepth Estimation with OpenVINO 210-ct-scan-live-inference: Live Inference and Benchmark CT-scan Data with OpenVINO 401-object-detection-webcam: Live Object Detection with OpenVINO 402-pose-estimation-webcam: Live Human Pose Estimation with OpenVINO 403-action-recognition-webcam: Human Action Recognition with OpenVINO 211-speech-to-text: Speech to Text with OpenVINO 213-question-answering: Interactive Question Answering with OpenVINO 208-optical-character-recognition: Optical Character Recognition (OCR) with OpenVINO 209-handwritten-ocr: Handwritten Chinese and Japanese OCR 405-paddle-ocr-webcam: PaddleOCR with OpenVINO 305-tensorflow-quantization-aware-training: Optimizing TensorFlow models with Neural Network Compression Framework of OpenVINO by 8-bit quantization 302-pytorch-quantization-aware-training: Optimizing PyTorch models with Neural Network Compression Framework of OpenVINO by 8-bit quantization 301-tensorflow-training-openvino: Post-Training Quantization with TensorFlow Classification Model 301-tensorflow-training-openvino: From Training to Deployment with TensorFlow and OpenVINO 204-named-entity-recognition: Named Entity Recognition with OpenVINO

#python

Importing images from a directory (Python) to list or dictionary

from PIL import Image
import glob
image_list = []
for filename in glob.glob('yourpath/*.gif'): #assuming gif
    im=Image.open(filename)
    image_list.append(im)

#python

python - Find out the percentage of missing values in each column in the given dataset - Stack Overflow

percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

#python #python #loops #whileloop

Print the name of 7 days in a week - by using while loop

days = 0
week = [‘Monday’, ‘Tuesday’, ‘Wednesday’, ‘Thursday’, ‘Friday’, ‘Saturday’, 3.‘Sunday’]
while day < 7:
print(“Today is” + week[days])
days += 1

#python

Getting the index of an item in a list containing it in Python

>>> ["foo", "bar", "baz"].index("bar")
1

#javascript #python #search #historicalcode #google #algorithms

Google’s PageRank Algorithm from 1996 - the origin of internet search

import numpy as np

def pagerank(M, num_iterations=100, d=0.85):
    N = M.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    iteration = 0
    while iteration < num_iterations:
        iteration += 1
        v = d * np.matmul(M, v) + (1 - d) / N
    return v

#python #python #strings #vowels #function

Get vowels in strings

This method gets vowels (‘a’, ‘e’, ‘i’, ‘o’, ‘u’) found in a string.
   
#make a function:
def get_vowels(string):

#return is the keyword which means function have to return value: 
 return [each for each in string if each in 'aeiou']


#assign the words and function will return vowels words.
get_vowels('foobar') # ['o', 'o', 'a']


get_vowels('gym') # []

#python

Could not build wheels for tokenizers which use PEP 517 and cannot be installed directly

https://github.com/pydata/bottleneck/issues/281

#python

How To Bypass Cloudflare Bot Protection In Selenium - CodingTutz

options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(options=options)

#python

Python Loop through Excel sheets, place into one df - Stack Overflow

import pandas as pd

sheets_dict = pd.read_excel('Book1.xlsx', sheetname=None)

full_table = pd.DataFrame()
for name, sheet in sheets_dict.items():
    sheet['sheet'] = name
    sheet = sheet.rename(columns=lambda x: x.split('\n')[-1])
    full_table = full_table.append(sheet)

full_table.reset_index(inplace=True, drop=True)

print full_table

#python #dates #functions #python3.8

How to parse a String into Datetime in Python

from datetime import datetime

datetime_object = datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')

#python

python - Way to change Google Chrome user agent in Selenium? - Stack Overflow

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

options = Options()
ua = UserAgent()
userAgent = ua.random
print(userAgent)
options.add_argument(f'user-agent={userAgent}')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
driver.get("https://www.google.co.in")
driver.quit()

#python

python - How to see the progress bar of read_csv - Stack Overflow

def read_csv_pgbar(csv_path, chunksize, usecols, dtype=object):


    # print('Getting row count of csv file')

    rows = sum(1 for _ in open(csv_path, 'r')) - 1 # minus the header
    # chunks = rows//chunksize + 1
    # print('Reading csv file')
    chunk_list = []

    with tqdm(total=rows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, chunksize=chunksize, usecols=usecols, dtype=dtype):
            chunk_list.append(chunk)
            bar.update(len(chunk))

    df = pd.concat((f for f in chunk_list), axis=0)
    print('Finish reading csv file')

    return df

#python #python #lists #dictionary

Convert two lists into a dictionary

keys, values)) # {'a': 2, 'c': 4, 'b': 3}
 
 
#make a function: def is the keyword for the function:
def to_dictionary(keys, values):
 
 
#return is the keyword that tells program that function has to return value   
return dict(zip(keys, values))
 
  
 
# keys and values are the lists:
 
keys = ["a", "b", "c"]   
 
values = [2, 3, 4]

#python #interesting #arrays #sorting #interviewquestions

Sorting an array without changing position of negative numbers

# Python3 implementation of the approach 

# Function to sort the array such that 
# negative values do not get affected 
def sortArray(a, n): 

	# Store all non-negative values 
	ans=[] 
	for i in range(n): 
		if (a[i] >= 0): 
			ans.append(a[i]) 

	# Sort non-negative values 
	ans = sorted(ans) 

	j = 0
	for i in range(n): 

		# If current element is non-negative then 
		# update it such that all the 
		# non-negative values are sorted 
		if (a[i] >= 0): 
			a[i] = ans[j] 
			j += 1

	# Print the sorted array 
	for i in range(n): 
		print(a[i],end = " ") 


# Driver code 

arr = [2, -6, -3, 8, 4, 1] 

n = len(arr) 

sortArray(arr, n)

#python ##python #strings #comments

Create simple string along with variables

#assign a value to a variable:
types_of_people = 10 
# make a string using variable name:
X = f “there are {types_of_people} types of people.”

Output:
There are 10 types of people

211-speech-to-text: Speech to Text with OpenVINO

Save snippets that work from anywhere online with our extensions

Comments

More like this

OpenVINO™ notebooks

Browse more snippets >>

211-speech-to-text: Speech to Text with OpenVINO

Save snippets that work from anywhere online with our extensions

Comments

More like this

OpenVINO™ notebooks

Browse more snippets >>

Embed code snippet