import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize'] = [15,10]
import librosa
import librosa.display
import IPython.display as ipd
from pydub import AudioSegment
from pydub.utils import mediainfo
# Path to arctic_a0005: 'will we ever forget it'
speech_path = '../../../Data/speech/cmu_us_rms_arctic/wav/arctic_a0005.wav'
sound = AudioSegment.from_wav(speech_path)  # Read audio file
sound_samples = sound.get_array_of_samples()  # Extract signal samples
samp_freq = sound.frame_rate  # Sampling frequency in Hz
# Normalize to a peak magnitude of 1.  Divide by the maximum *absolute*
# value: for int16 audio the largest excursion is often negative
# (e.g. -32768), and dividing by np.max() alone would push samples
# outside [-1, 1] (or flip the waveform's sign).
speech_samples = np.array(sound_samples)
speech_samples_norm = speech_samples / np.max(np.abs(speech_samples))
strt_samp = 0                          # first sample of the analysis span
end_samp = len(speech_samples_norm)    # one past the last sample
end_ms = len(speech_samples_norm)/samp_freq  # total duration in *seconds* (name kept for compatibility)
# Time axis in seconds, one point per sample, for plotting
xrange = np.linspace(0, end_ms, end_samp-strt_samp)
# Plot the normalized speech waveform and save it as speech.jpg
fg1 = plt.figure(figsize=(18, 8))
ax = fg1.gca()
ax.plot(xrange, speech_samples_norm)
ax.set_xlabel('Time in seconds')
ax.set_ylabel('Amplitude')
ax.axis('tight')
fg1.savefig('speech.jpg')
winlen = int(samp_freq*.03)  # Analysis window of 30 ms, in samples
# Hop size must be an *integer* number of samples: librosa's
# specshow rejects/mis-scales a float hop_length (winlen/4 was a float).
hop = winlen // 4
# Short-time Fourier transform of the selected speech span; pass the
# hop explicitly so the display below uses the same time resolution.
X = librosa.stft(np.array(speech_samples_norm[strt_samp:end_samp]),
                 win_length=winlen, hop_length=hop)
# Magnitude spectrogram in dB (np.abs handles the complex array)
Xdb = librosa.amplitude_to_db(np.abs(X))
fg2 = plt.figure(figsize=(18, 8))
librosa.display.specshow(Xdb, sr=samp_freq, x_axis='time',
                         y_axis='hz', hop_length=hop)
fg2.savefig('specgram.jpg')
# Read the phoneme label (.lab) file.  Each useful line has the form
# "<end_time> <score> <phoneme>"; shorter lines (header '#', blanks)
# are skipped.  'with' guarantees the file handle is closed (the
# original left it open).
phonemes = []
with open('../../../Data/speech/cmu_us_rms_arctic/lab/arctic_a0005.lab') as fil:
    for sent in fil:
        wrds = sent.split()
        # Need at least 3 tokens: the original tested len > 1 but then
        # indexed wrds[2], which would raise IndexError on 2-token lines.
        if len(wrds) > 2:
            phonemes.append([float(wrds[0]), wrds[2]])
# Phoneme list and their ending point
# NOTE(review): the two lines below are bare expressions pasted from an
# interactive notebook session (the cell input `phonemes` followed by
# its printed output).  In a plain script they evaluate and discard
# their values — no effect.  The literal documents the expected
# contents of `phonemes` for arctic_a0005: [end_time_sec, phoneme].
phonemes
[[0.11, 'pau'], [0.23, 'w'], [0.245, 'ih'], [0.375, 'l'], [0.395, 'w'], [0.52, 'iy'], [0.59, 'eh'], [0.65, 'v'], [0.735, 'er'], [0.845, 'f'], [0.9, 'er'], [0.975, 'g'], [1.09, 'eh'], [1.115, 't'], [1.22, 'ih'], [1.28, 't'], [1.325, 'pau'], [1.405, 'pau']]
# Plot the first nophn+1 phoneme segments of the waveform in alternating
# colours, label each segment with its phoneme, and mark segment ends
# with stems at height 1.1.  Saved as willphn.jpg.
nophn = 2
col = ['r', 'b']
fg1 = plt.figure(figsize=(14, 5))
seg_ends = []    # x positions of the boundary stems
seg_tops = []    # constant stem height (1.1)
for i in range(nophn + 1):
    seg_start = phonemes[i][0]
    seg_end, label = phonemes[i + 1]
    lo = int(seg_start * samp_freq)
    hi = int(seg_end * samp_freq)
    # Time axis for this segment, one point per sample
    xrange = np.linspace(seg_start, seg_end, hi - lo)
    # Alternate red/blue per segment (same effect as the ind toggle)
    plt.plot(xrange, speech_samples_norm[lo:hi], col[i % 2])
    seg_ends.append(seg_end)
    seg_tops.append(1.1)
    plt.text((seg_start + seg_end) / 2, 1, label, fontsize=12)
plt.stem(seg_ends, seg_tops, 'k')
plt.grid()
plt.xlabel('Time in seconds')
plt.ylabel('Amplitude')
fg1.savefig('willphn.jpg')
# Read noise file (NOISEX-92 factory noise)
s = '../../../Data/speech/Noisex92/factory1.wav'
sound = AudioSegment.from_wav(s)
soundall = sound.get_array_of_samples()
# Sampling rate of noise (may differ from the speech sampling rate)
samp_freq_nois = sound.frame_rate
# Noise samples normalized to a peak magnitude of 1.  As with the
# speech signal, divide by the maximum *absolute* value so a negative
# peak (e.g. -32768 for int16) cannot push samples outside [-1, 1].
nois_samples = np.array(soundall)
soundallnois = nois_samples / np.max(np.abs(nois_samples))
# Play speech file
# NOTE(review): ipd.Audio(...) as a bare expression only renders in a
# notebook; in a plain script these lines have no effect.
ipd.Audio(speech_samples_norm, rate=int(samp_freq))
# Play noise file before resampling (still at its native rate)
ipd.Audio(soundallnois, rate=int(samp_freq_nois))