211-speech-to-text: Speech to Text with OpenVINO


Fri Jun 17 2022 04:58:24 GMT+0000 (Coordinated Universal Time)

Saved by @OpenVINOtoolkit #python #openvino #openvino-notebooks #deeplearning #nlp #speechtotext

# Imports
from pathlib import Path

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import scipy
from openvino.runtime import Core

# Settings
model_folder = "model"
download_folder = "output"
data_folder = "data"

precision = "FP16"
model_name = "quartznet-15x5-en"

# Download Model
# Check if model is already downloaded in download directory
path_to_model_weights = Path(f'{download_folder}/public/{model_name}/models')
downloaded_model_file = list(path_to_model_weights.glob('*.pth'))

if not path_to_model_weights.is_dir() or len(downloaded_model_file) == 0:
    download_command = f"omz_downloader --name {model_name} --output_dir {download_folder} --precision {precision}"
    ! $download_command

# Convert Model
# Check if model is already converted in model directory
path_to_converted_weights = Path(f'{model_folder}/public/{model_name}/{precision}/{model_name}.bin')

if not path_to_converted_weights.is_file():
    convert_command = f"omz_converter --name {model_name} --precisions {precision} --download_dir {download_folder} --output_dir {model_folder}"
    ! $convert_command

# Define Constants
audio_file_name = "edge_to_cloud.ogg"
alphabet = " abcdefghijklmnopqrstuvwxyz'~"

# Load Audio File
audio, sampling_rate = librosa.load(path=f'{data_folder}/{audio_file_name}', sr=16000)
ipd.Audio(audio, rate=sampling_rate)

# Visualise Audio File
plt.figure()
# Note: `waveplot` was removed in librosa 0.10; with newer versions use `librosa.display.waveshow`.
librosa.display.waveplot(y=audio, sr=sampling_rate, max_points=50000.0, x_axis='time', offset=0.0, max_sr=1000);
plt.show()
specto_audio = librosa.stft(audio)
specto_audio = librosa.amplitude_to_db(np.abs(specto_audio), ref=np.max)
print(specto_audio.shape)
librosa.display.specshow(specto_audio, sr=sampling_rate, x_axis='time', y_axis='hz');

# Change Type of Data
# librosa loads audio as float values in [-1, 1]; scale to the int16 range expected by the preprocessing
if max(np.abs(audio)) <= 1:
    audio = (audio * (2**15 - 1))
audio = audio.astype(np.int16)

# Convert Audio to Mel Spectrum
def audio_to_mel(audio, sampling_rate):
    assert sampling_rate == 16000, "Only 16 kHz audio is supported"
    preemph = 0.97
    preemphased = np.concatenate([audio[:1], audio[1:] - preemph * audio[:-1].astype(np.float32)])

    # Calculate window length
    win_length = round(sampling_rate * 0.02)

    # Based on previously calculated window length run short-time Fourier transform
    spec = np.abs(librosa.core.spectrum.stft(preemphased, n_fft=512, hop_length=round(sampling_rate * 0.01),
                  win_length=win_length, center=True, window=scipy.signal.windows.hann(win_length), pad_mode='reflect'))

    # Create mel filter-bank, produce transformation matrix to project current values onto Mel-frequency bins
    mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=512, n_mels=64, fmin=0.0, fmax=8000.0, htk=False)
    return mel_basis, spec


def mel_to_input(mel_basis, spec, padding=16):
    # Convert to logarithmic scale
    log_melspectrum = np.log(np.dot(mel_basis, np.power(spec, 2)) + 2 ** -24)

    # Normalize output
    normalized = (log_melspectrum - log_melspectrum.mean(1)[:, None]) / (log_melspectrum.std(1)[:, None] + 1e-5)

    # Calculate padding
    remainder = normalized.shape[1] % padding
    if remainder != 0:
        return np.pad(normalized, ((0, 0), (0, padding - remainder)))[None]
    return normalized[None]

# Run Conversion from Audio to Mel Format
mel_basis, spec = audio_to_mel(audio=audio.flatten(), sampling_rate=sampling_rate)
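
# Quick sanity check (added for illustration, not part of the original notebook): with n_fft=512
# the STFT has 257 frequency bins, so mel_basis should be (64, 257) and spec (257, num_frames).
print(f"mel_basis: {mel_basis.shape}, spec: {spec.shape}")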

# Visualise Mel Spectrogram
librosa.display.specshow(data=spec, sr=sampling_rate, x_axis='time', y_axis='log');
plt.show();
librosa.display.specshow(data=mel_basis, sr=sampling_rate, x_axis='linear');
plt.ylabel('Mel filter');

# Adjust Mel scale to Input
audio = mel_to_input(mel_basis=mel_basis, spec=spec)
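
# Optional shape check (added for illustration): mel_to_input adds a batch dimension and pads
# the time axis to a multiple of 16, so the network input should be (1, 64, padded_frames).
print(audio.shape)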

# Load Model
ie = Core()

model = ie.read_model(
    model=f"{model_folder}/public/{model_name}/{precision}/{model_name}.xml"
)
model_input_layer = model.input(0)
shape = model_input_layer.partial_shape
# Make the time dimension dynamic so audio of any length can be processed
shape[2] = -1
model.reshape({model_input_layer: shape})
compiled_model = ie.compile_model(model=model, device_name="CPU")
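
# Optional (not in the original notebook): "CPU" above is one choice of target. Query the
# Core object to list the devices available on this machine and pass any returned name
# (for example "GPU") to compile_model instead.
print(ie.available_devices)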

# Do Inference
output_layer_ir = compiled_model.output(0)

character_probabilities = compiled_model([audio])[output_layer_ir]

# Read Output
# Remove unnecessary dimension
character_probabilities = np.squeeze(character_probabilities)

# Run argmax to pick the most probable symbols
character_probabilities = np.argmax(character_probabilities, axis=1)

# Implementation of Decoding
def ctc_greedy_decode(predictions):
    previous_letter_id = blank_id = len(alphabet) - 1
    transcription = list()
    for letter_index in predictions:
        if previous_letter_id != letter_index != blank_id:
            transcription.append(alphabet[letter_index])
        previous_letter_id = letter_index
    return ''.join(transcription)

# Run Decoding and Print Output
transcription = ctc_greedy_decode(character_probabilities)
print(transcription)
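
# Toy check of the decoder (added for illustration, not part of the original notebook):
# index 28 is the CTC blank ('~'); repeats are collapsed and blanks dropped, so the
# sequence below should decode to "hello".
demo_indices = [8, 8, 5, 28, 12, 12, 28, 12, 15]
print(ctc_greedy_decode(demo_indices))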

This tutorial demonstrates speech-to-text recognition with OpenVINO. It uses the QuartzNet 15x5 model, which performs automatic speech recognition. Its design is based on the Jasper architecture, a convolutional model trained with Connectionist Temporal Classification (CTC) loss. The model is available from the Open Model Zoo. If you have not yet installed OpenVINO™, follow the Installation Guide to install all required dependencies: https://github.com/openvinotoolkit/openvino_notebooks/blob/main/README.md#-installation-guide
Model definition (.bin and .xml files): https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/public/quartznet-15x5-en/model.yml

https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/211-speech-to-text/211-speech-to-text.ipynb