# Speech-to-text demo: transcribe an audio file with the QuartzNet 15x5 model on OpenVINO.
# NOTE: the download/convert commands are executed through IPython, so run this
# in a Jupyter/IPython session.

# Imports
from itertools import groupby
from pathlib import Path

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.signal  # explicit: `import scipy` alone does not guarantee the submodule is loaded
from IPython import get_ipython
from openvino.runtime import Core

# Settings
model_folder = "model"
download_folder = "output"
data_folder = "data"
precision = "FP16"
model_name = "quartznet-15x5-en"

# Download Model
# Skip the download when the weights (*.pth) are already in the download directory
path_to_model_weights = Path(f'{download_folder}/public/{model_name}/models')
downloaded_model_file = list(path_to_model_weights.glob('*.pth'))
if not path_to_model_weights.is_dir() or not downloaded_model_file:
    download_command = f"omz_downloader --name {model_name} --output_dir {download_folder} --precision {precision}"
    # Plain-Python spelling of the notebook shell magic `! $download_command`
    get_ipython().system(download_command)

# Convert Model
# Skip the conversion when the converted weights (*.bin) are already in the model directory
path_to_converted_weights = Path(f'{model_folder}/public/{model_name}/{precision}/{model_name}.bin')
if not path_to_converted_weights.is_file():
    convert_command = f"omz_converter --name {model_name} --precisions {precision} --download_dir {download_folder} --output_dir {model_folder}"
    # Plain-Python spelling of the notebook shell magic `! $convert_command`
    get_ipython().system(convert_command)

# Defining constants
audio_file_name = "edge_to_cloud.ogg"
# Output symbols of the model; the last one ('~') is used as the CTC blank by the decoder below
alphabet = " abcdefghijklmnopqrstuvwxyz'~"

# Load Audio File (resampled to 16 kHz, the only rate `audio_to_mel` accepts)
audio, sampling_rate = librosa.load(path=f'{data_folder}/{audio_file_name}', sr=16000)
# Explicit display: a bare expression only renders when it is the last line of a notebook cell
ipd.display(ipd.Audio(audio, rate=sampling_rate))

# Visualise Audio File
plt.figure()
if hasattr(librosa.display, "waveplot"):
    librosa.display.waveplot(y=audio, sr=sampling_rate, max_points=50000.0, x_axis='time', offset=0.0, max_sr=1000)
else:
    # `waveplot` was removed in librosa 0.10; `waveshow` is its replacement
    # (time axis is its default, and it requires an integer `max_points`)
    librosa.display.waveshow(audio, sr=sampling_rate, max_points=50000, offset=0.0)
plt.show()

specto_audio = librosa.stft(audio)
specto_audio = librosa.amplitude_to_db(np.abs(specto_audio), ref=np.max)
print(specto_audio.shape)
librosa.display.specshow(specto_audio, sr=sampling_rate, x_axis='time', y_axis='hz')

# Change Type of Data
# Samples within [-1, 1] are scaled to the int16 range before the cast
if np.abs(audio).max() <= 1:
    audio = (audio * (2**15 - 1))
audio = audio.astype(np.int16)


# Convert Audio to Mel Spectrum
def audio_to_mel(audio, sampling_rate):
    """Compute the magnitude spectrogram of `audio` and a matching mel filter bank.

    Args:
        audio: 1-D array of audio samples.
        sampling_rate: Sampling rate of `audio` in Hz; must be 16000.

    Returns:
        A `(mel_basis, spec)` tuple: the 64-band mel filter bank and the
        magnitude of the short-time Fourier transform of the pre-emphasised
        signal. The filter bank is NOT applied here; see `mel_to_input`.

    Raises:
        AssertionError: If `sampling_rate` is not 16000.
    """
    assert sampling_rate == 16000, "Only 16 KHz audio supported"

    # Pre-emphasis: subtract 0.97 times the previous sample; the first sample is kept as is
    preemph = 0.97
    preemphased = np.concatenate([audio[:1], audio[1:] - preemph * audio[:-1].astype(np.float32)])

    # Calculate window length: 20 ms of audio (320 samples at 16 kHz)
    win_length = round(sampling_rate * 0.02)

    # Based on previously calculated window length run short-time Fourier transform
    # with a 10 ms hop (160 samples at 16 kHz)
    spec = np.abs(
        librosa.core.spectrum.stft(
            preemphased,
            n_fft=512,
            hop_length=round(sampling_rate * 0.01),
            win_length=win_length,
            center=True,
            window=scipy.signal.windows.hann(win_length),
            pad_mode='reflect',
        )
    )

    # Create mel filter-bank, produce transformation matrix to project current values onto Mel-frequency bins.
    # `sr` and `n_fft` are passed by keyword: librosa >= 0.10 rejects them as positional arguments.
    mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=512, n_mels=64, fmin=0.0, fmax=8000.0, htk=False)
    return mel_basis, spec


def mel_to_input(mel_basis, spec, padding=16):
    """Turn a magnitude spectrogram into a normalised, padded log-mel model input.

    Args:
        mel_basis: Mel filter bank matrix, as returned by `audio_to_mel`.
        spec: Magnitude spectrogram, as returned by `audio_to_mel`.
        padding: The second (time) axis is zero-padded up to a multiple of this value.

    Returns:
        The log-mel spectrum, normalised per mel band (row), with a new
        leading axis of size 1 added in front.
    """
    # Convert to logarithmic scale; 2 ** -24 keeps the logarithm finite for zero power
    log_melspectrum = np.log(np.dot(mel_basis, np.power(spec, 2)) + 2 ** -24)

    # Normalize output: zero mean / unit deviation per row; 1e-5 guards against division by zero
    normalized = (log_melspectrum - log_melspectrum.mean(1)[:, None]) / (log_melspectrum.std(1)[:, None] + 1e-5)

    # Calculate padding
    remainder = normalized.shape[1] % padding
    if remainder != 0:
        return np.pad(normalized, ((0, 0), (0, padding - remainder)))[None]
    return normalized[None]


# Run Conversion from Audio to Mel Format
mel_basis, spec = audio_to_mel(audio=audio.flatten(), sampling_rate=sampling_rate)

# Visualise Mel Spectrogram
librosa.display.specshow(data=spec, sr=sampling_rate, x_axis='time', y_axis='log')
plt.show()
librosa.display.specshow(data=mel_basis, sr=sampling_rate, x_axis='linear')
plt.ylabel('Mel filter')

# Adjust Mel scale to Input
# NOTE: from here on `audio` holds the model input tensor, not the raw waveform
audio = mel_to_input(mel_basis=mel_basis, spec=spec)

# Load Model
ie = Core()
model = ie.read_model(
    model=f"{model_folder}/public/{model_name}/{precision}/{model_name}.xml"
)
model_input_layer = model.input(0)
shape = model_input_layer.partial_shape
# -1 marks the third input dimension as dynamic before the model is reshaped and compiled
shape[2] = -1
model.reshape({model_input_layer: shape})
compiled_model = ie.compile_model(model=model, device_name="CPU")

# Do Inference
output_layer_ir = compiled_model.output(0)
character_probabilities = compiled_model([audio])[output_layer_ir]

# Read Output
# Remove unnecessary dimension
character_probabilities = np.squeeze(character_probabilities)
# Run argmax to pick the most probable symbol for every step
character_probabilities = np.argmax(character_probabilities, axis=1)


# Implementation of Decoding
def ctc_greedy_decode(predictions):
    """Decode a sequence of symbol indices into text with greedy CTC rules.

    Consecutive repeats of the same index are collapsed into one, then every
    occurrence of the blank index (the last position of the module-level
    `alphabet`) is dropped.

    Args:
        predictions: Iterable of integer indices into `alphabet`.

    Returns:
        The decoded transcription as a string.
    """
    blank_id = len(alphabet) - 1
    # groupby yields one key per run of equal neighbouring indices
    return ''.join(
        alphabet[letter_index]
        for letter_index, _ in groupby(predictions)
        if letter_index != blank_id
    )


# Run Decoding and Print Output
transcription = ctc_greedy_decode(character_probabilities)
print(transcription)
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter