#!/usr/bin/env python # coding: utf-8 # # Audio signal analysis with python #
# # ## Phd. Jose Ricardo Zapata #

# PyCON 2018 - The Art of Coding #
# Medellin - Colombia # **Note:** This is a Jupyter notebook slideshow for a better visualization of the presentation I highly recommend to install RISE (Reveal.js - Jupyter/IPython Slideshow Extension)- turns the Jupyter Notebooks into a live presentation, then use a full screen view in the web browser (press F11) # # https://github.com/damianavila/RISE # # About me (https://joserzapata.github.io/) # # Professor and researcher at Universidad Pontificia Bolivariana (UPB) # - Phd. Sound and music computing, Universitat Pompeu Fabra, Barcelona - España # - MEng. Telecommunications and BSc in Electronic Engineering, UPB, Medellin - Colombia # # # About me (https://joserzapata.github.io/) # ## Interests # - Music Information Retrieval # - Audio signal processing # - Data science # # What is Audio Analysis # Audio analysis refers to the extraction of information, description and meaning from audio signals for: # - Signal Analysis -> Ex: Voice diseases detection # - Musical Analysis -> Ex: Extract Rhythm, Melody and Harmony # - Classification -> Ex: Genre classification, mood recognition # # What is Audio Analysis # - Storage -> Ex: Music compression (mp3, mp4) # - Retrieval -> Ex: Query by humming # - Music Recommendation # - Synthesis, etc. # # Outline # # 1. Python libraries for audio # 1. Reading, writing and visualizing Audio signals in python # 1. Frequency representation of audio signals # 1. Applications -> Music Information Retrieval # # Python libraries for Audio Analysis # # # - **Scipy** ([scipy.signal](https://docs.scipy.org/doc/scipy/reference/tutorial/signal.html), [scipy.fftpack](https://docs.scipy.org/doc/scipy/reference/tutorial/fftpack.html), [scipy.io](https://docs.scipy.org/doc/scipy/reference/tutorial/io.html) ) # - **Essentia** ([http://essentia.upf.edu](http://essentia.upf.edu)) # - **Librosa** ([https://librosa.github.io/](https://librosa.github.io/)) # - **Madmon** ([https://github.com/CPJKU/madmom](https://github.com/CPJKU/madmom)) # - **PyAudioAnalysis** ([https://github.com/tyiannak/pyAudioAnalysis](https://github.com/tyiannak/pyAudioAnalysis)) # # Audio signals in python # # - Reading and Playing audio files # - Visualizing audio # - Writing audio files # # Reading Audio Files # In[1]: # Example: Reading audio file with scipy.io AudioFileName = "Data/Whistle.wav" from scipy.io import wavfile # Output fs: Frequency sample and data: Audio signal -> int16 fs, Audiodata = wavfile.read(AudioFileName) print('AudioFile = {}, Sample Rate = {} [=] Samples/Sec, Wav format = {}'.format(AudioFileName,fs,Audiodata.dtype)) # For play audio in Jupyter Notebook import IPython.display as ipd #Ipython functions for jupyter ipd.Audio(AudioFileName) # play audio directly in a Jupyter notebook. # # Visualizing Audio Signal # In[3]: import matplotlib.pyplot as plt #Ploting library from __future__ import print_function, division get_ipython().run_line_magic('matplotlib', 'inline') plt.style.use('ggplot') #plot style plt.rcParams['figure.figsize'] = (15, 5) # set plot size plt.plot(Audiodata) plt.title('Audio Waveform with no proper axis values',size=16); # In[4]: # Set proper values for plot axis import numpy as np # set data amplitude values between [-1 : 1] Audiodata.dtype is int16 AudiodataScaled = Audiodata / (2.**15) #Set x axis values to milliseconds timeValues = np.arange(0, len(AudiodataScaled), 1)/ fs # converting samples/Sec to Seconds timeValues = timeValues * 1000 #scale to milliseconds plt.plot(timeValues, AudiodataScaled);plt.title('Audio Waveform',size=16) plt.ylabel('Amplitude'); plt.xlabel('Time (ms)'); # # Writing Audio Files # In[5]: # Making some modifications to audio file LessGaindata = AudiodataScaled/2.0 # Dividing by two the amplitude to the signal # Converting to int16 to save the audio file with 16 Bits @ 441000 Hz freq sampling LessGaindata = LessGaindata*(2.**15) LessGaindata = LessGaindata.astype(np.int16) print(' Directory files before writing = '); get_ipython().system('ls ./Data # Jupyter Magic :)') # Writing de audio signal to a file wavfile.write('Data/LessGain.wav',fs,LessGaindata) # In[ ]: ## Note: You can write the audio file as float point in this way # Making some modifications to audio file LessGaindata = AudiodataScaled/2.0 # Dividing by two the amplitude to the signal # Converting to int16 to save the audio file with 16 Bits @ 441000 Hz freq sampling #LessGaindata = LessGaindata*(2.**15) #Commenting to save in float point #LessGaindata = LessGaindata.astype(np.int16)# Comenting to save in float point print(' Directory files before writing = '); get_ipython().system('ls ./Data # Jupyter Magic :)') # Writing de audio signal to a file wavfile.write('Data/LessGain.wav',fs,LessGaindata) # The advantage is -> the audio file data amplitude values will be between [-1 : 1] # so, if read the audio file the values will be between [-1:1] directly. # In[6]: #Reading the new audiofile fs, LessGain = wavfile.read('Data/LessGain.wav') # set data amplitude values between [-1 : 1] LessGain.dtype is int16 LessGainScaled = LessGain / (2.**15) plt.plot(timeValues,LessGainScaled);plt.title('Audio Waveform Transformed',size=16) plt.ylabel('Amplitude'); plt.xlabel('Time (ms)'); # # Frequency representation # # * Fast Fourier Transform (FFT) # * Magnitude Spectrum plot(2D) # * Spectrogram (3D) # In[7]: fs, Audiodata = wavfile.read(AudioFileName) # Reading AudioFile Audiodata = Audiodata / (2.**15) # set data amplitude values between [-1 : 1] from scipy.fftpack import fft n = len(Audiodata) AudioFreq = fft(Audiodata) # Computing the fourier transform AudioFreq = AudioFreq[0:int(np.ceil((n+1)/2.0))] # FFT output is a array of complex numbers MagFreq = np.abs(AudioFreq) # absolute value to obtain the magnitude # scaling by the number of points to avoid that the magnitude values # depends of signal length the signal or its sampling frequency MagFreq = MagFreq / float(n) # Compute the power of the magnitude MagFreq = MagFreq**2 # ## Magnitude Spectrum Plot (2D) # In[8]: from plotly import offline as py #library for interactive plots import plotly.tools as tls py.init_notebook_mode() # In[9]: # check if the nfft is odd to find the Nyquist point in the spectrum if n % 2 > 0: # we've got odd number of points fft MagFreq[1:len(MagFreq)] = MagFreq[1:len(MagFreq)] * 2 else:# we've got even number of points fft MagFreq[1:len(MagFreq) -1] = MagFreq[1:len(MagFreq) - 1] * 2 freqAxis = np.arange(0,int(np.ceil((n+1)/2.0)), 1.0) * (fs / n); plt.plot(freqAxis/1000.0, 10*np.log10(MagFreq)) #Power spectrum plt.xlabel('Frequency (kHz)'); plt.ylabel('Power (dB)'); py.iplot(tls.mpl_to_plotly(plt.gcf())) #use plotly for interactive plot # In[ ]: # Just for verification the rms value of the audio signal in time # has to be the same as the value of the root square sum of the magnitude in frequency rms_val = np.sqrt(np.mean(Audiodata**2)) SumMagnitude = np.sqrt(np.sum(MagFreq)) print('rms (time) = {} and the Sum of Magnitude peaks (Freq) = {}'.format(rms_val,SumMagnitude)) print(u'The values are the same \U0001f603'); # ## Spectrogram # In[10]: Fs, data = wavfile.read(AudioFileName) data = data/(2.**15) # Scale the signal in [-1, 1] for 16 bits input N = 512 #Number of points of fft from scipy import signal Pxx, freqs, bins, im = plt.specgram(data, NFFT=N, Fs=Fs,window = signal.blackman(N),noverlap = 128) plt.title('Spectrogram using Matplotlib',size=16); plt.ylabel('Freq [Hz]'); plt.xlabel('Time [Sec]'); # In[ ]: #spectrum computed using scipy Fs, data = wavfile.read(AudioFileName) data = data/(2.**15) # Scale the signal in [-1, 1] for 16 bits input N = 512 #Number of points of fft from scipy import signal f, t, Sxx = signal.spectrogram(data, Fs,window = signal.blackman(N),nfft=N) #plt.pcolormesh(t, f,10*np.log10(Sxx)) # dB Magnitude spectrum plt.pcolormesh(t, f,Sxx) #Linear Magnitude spectrum plt.ylabel('Frequency [Hz]') plt.xlabel('Time [sec]') plt.title('Spectrogram using scipy.signal',size=16); # # Application -> Music Information Retrieval # What Google does with text, MIR does it with music. # - Recommend, Organize and discover music in multimedia data bases # - New ways of searching # - Query by humming # - Acoustic Fingerprint # # Application -> Music Information Retrieval # - Automatic musical information extraction in the Audio signals # - Beats and Tempo # - Melody # - Harmony # **Note: ** A **Music Information Retrieval** Course using python can be found at # - https://github.com/stevetjoa/stanford-mir # - http://musicinformationretrieval.com/ # # Applications Examples in Python # # - Audio Beat tracking and Tempo Estimation # - Onset Detection # - Melody Detection # - Source Separation # - Music / Voice separation # - Harmonic / Percussive Separation # ## Audio Beat Tracking # The task of Audio Beat Tracking refers to recovering a sequence of time instants from a musical input that are consistent with the times when a human might tap their foot. # ## Tempo Estimation # Normally corresponds to the rate of the beats per minute, is associate with the interpretation speed of a piece of music. The computation of tempo is based on the periodicities extracted of the music signal and # the time differences between the beats. # ![alt text](./Img/Rhythm.png "https://github.com/JoseRZapata/MultiFeatureBeatTracking") # In[11]: # Some examples will be using Essentia Library, # Import essentia library http://essentia.upf.edu/ import essentia from essentia.standard import * import warnings # some libraries display a future warning for deprecated functions used warnings.filterwarnings('ignore') # ** "Essentia: An Audio Analysis Library for Music Information Retrieval" ** ,Dmitry Bogdanov, Nicolas Wack, Emilia Gómez, Sankalp Gulati, Perfecto Herrera, Oscar Mayor, Gerard Roma, Justin Salamon, Jose R. Zapata, Xavier Serra. . 14th International Society for Music Information Retrieval Conference (ISMIR 2013), P. 493-498, Curitiba, Brazil, 2013. [Essentia](http://essentia.upf.edu) # In[12]: #Loading audio File with essentia audiobeat = MonoLoader(filename='Data/Journey.wav')() #15 sec extract from -> Journey - Dont stop beliving #Plot audio waveform plt.plot((np.arange(0, len(audiobeat), 1)/ 44100),audiobeat)#x axis in seconds plt.title("Don't stop beliving - Journey [Audio excerpt]") plt.xlabel("Time [=] Sec") ipd.Audio('Data/Journey.wav') # In[13]: # Compute beat positions and BPM # with the multifeature Beat tracker implemented in essentia rhythm_extractor = RhythmExtractor2013(method="multifeature") bpm, beats, b_conf, _, _ = rhythm_extractor(audiobeat) print("BPM:", bpm) print("Beat positions (sec.):", beats) print("Beat estimation confidence:", b_conf) # ** "Multi-Feature Beat Tracking" **, Jose R. Zapata, Matthew E.P. Davies, Emilia Gómez. IEEE/ACM Transactions on Audio, Speech, and Language Processing, Vol. 22, No. 4, P. 816-825, 2014. [https://doi.org/10.1109/TASLP.2014.2305252](https://doi.org/10.1109/TASLP.2014.2305252) # In[14]: # Mark beat positions on the audio and write it to a file # Let's use beeps instead of white noise to mark them, as it's more distinctive marker = AudioOnsetsMarker(onsets=beats, type='beep') marked_audio = marker(audiobeat) MonoWriter(filename='Data/Journey_beats.wav')(marked_audio) # In[15]: plt.plot(audiobeat,alpha=0.6) for beat in beats: # add beat times to the audio waveform plot plt.axvline(x=beat*44100, color='blue') plt.title("Audio waveform and the estimated beat positions, BPM = {}".format(bpm),size=16); plt.xlabel('Time'); ipd.Audio('Data/Journey_beats.wav') # ## Note Onset Detection # The time position in the musical audio signal where a new note begins. # In[16]: # Loading audio file audioOnset = MonoLoader(filename='Data/Beastie.wav')() # Phase 1: compute the onset detection function # The OnsetDetection algorithm provides various onset detection functions. Let's use two of them. od2 = OnsetDetection(method='complex') # In[17]: # Let's also get the other algorithms we will need, and a pool to store the results w = Windowing(type = 'hann') fft = FFT() # this gives us a complex FFT c2p = CartesianToPolar() # and this turns it into a pair (magnitude, phase) pool = essentia.Pool() # Computing onset detection functions. for frame in FrameGenerator(audioOnset, frameSize = 1024, hopSize = 512): mag, phase, = c2p(fft(w(frame))) pool.add('features.complex', od2(mag, phase)) # Phase 2: compute the actual onsets locations onsets = Onsets() onsets_complex = onsets(essentia.array([ pool['features.complex'] ]),[ 1 ]) # In[18]: plt.plot(audioOnset,alpha=0.6) for onset in onsets_complex: plt.axvline(x=onset*44100, color='blue') plt.title("Audio waveform and the estimated onset positions",size=16); ipd.Audio('Data/Beastie.wav') # ## Melody Detection # Estimates the fundamental frequency of the predominant melody from polyphonic music signals. # ![alt text](./Img/MKF3V.png "http://www.justinsalamon.com/melody-extraction.html") # # In[19]: # Load audio file; it is recommended to apply equal-loudness filter for PredominantPitchMelodia loader = EqloudLoader(filename='Data/SMC_9.wav', sampleRate=44100) audio = loader() print("Duration of the audio sample [sec]: {}".format(len(audio)/44100.0)) # Extract the pitch curve # PitchMelodia takes the entire audio signal as input (no frame-wise processing is required) pitch_extractor = PredominantPitchMelodia(frameSize=2048, hopSize=128) pitch_values, pitch_confidence = pitch_extractor(audio) # Pitch is estimated on frames. Compute frame time positions pitch_times = np.linspace(0.0,len(audio)/44100.0,len(pitch_values) ) # In[20]: # Plot the estimated pitch contour and confidence over time plt.subplot(211) plt.plot(pitch_times, pitch_values) plt.title('Estimated pitch [Hz]') plt.xticks([]) #removing x axis numbers plt.subplot(212) plt.plot(pitch_times, pitch_confidence) plt.title('Pitch confidence');plt.xlabel('Time'); # ** "Melody Extraction from Polyphonic Music Signals using Pitch Contour Characteristics" **, J. Salamon and E. Gómez, IEEE Transactions on Audio, Speech and Language Processing, 20(6):1759-1770, Aug. 2012.[https://doi.org/10.1109/TASL.2012.2188515](https://doi.org/10.1109/TASL.2012.2188515) # # Music / Voice Separation # The basic idea is to find the repeating elements in the audio mixture and extract the repeating background for music/voice separation # ![alt text](./Img/repet.png "REPET http://zafarrafii.com/") # # In[22]: # Examples using The Northwestern University Source Separation Library (nussl) Library import nussl #https://github.com/interactiveaudiolab/nussl history = nussl.AudioSignal('./Data/History.wav') repet = nussl.Repet(history) repet.run(); beat_spectrum = repet.get_beat_spectrum() repet.update_periods() bkgd, fgnd = repet.make_audio_signals() # In[23]: bkgd.write_audio_to_file('Data/History_background.wav'); fgnd.write_audio_to_file('Data/History_foreground.wav'); print('Original Audio') ipd.display(ipd.Audio('Data/History.wav')) print('Background Audio') ipd.display(ipd.Audio('Data/History_background.wav')) print('Foreground Audio') ipd.display(ipd.Audio('Data/History_foreground.wav')) # # Harmonic / Percussive Separation # The aim is to decompose the original music signal to the harmonic and the percussive parts of the signal. # ![alt text](./Img/harmonic_percussive.png "http://mir.ilsp.gr/harmonic_percussive_separation.html") # In[24]: # Examples will be using Librosa library -> https://librosa.github.io/ import librosa import librosa.display y, sr = librosa.load('./Data/disco.wav') D = librosa.stft(y); D_harmonic, D_percussive = librosa.decompose.hpss(D,margin=3.0); # Pre-compute a global reference power from the input spectrum rp = np.max(np.abs(D)); # In[25]: fig = plt.figure(figsize=(15, 8)) plt.subplot(3, 1, 1) librosa.display.specshow(librosa.amplitude_to_db(D, ref=rp), y_axis='log') plt.colorbar() plt.title('Full spectrogram') plt.subplot(3, 1, 2) librosa.display.specshow(librosa.amplitude_to_db(D_harmonic, ref=rp), y_axis='log') plt.colorbar() plt.title('Harmonic spectrogram') plt.subplot(3, 1, 3) librosa.display.specshow(librosa.amplitude_to_db(D_percussive, ref=rp), y_axis='log', x_axis='time') plt.colorbar() plt.title('Percussive spectrogram') plt.tight_layout(); plt.close(fig)# close figure to plot in the other cell to use # In[26]: d = plt.figure()# plot figure from past cell new_m = d.canvas.manager; new_m.canvas.figure = fig;fig.set_canvas(new_m.canvas) # In[27]: y_harm = librosa.core.istft(D_harmonic) # Harmonic spectrum to audio signal in time y_perc = librosa.core.istft(D_percussive) # Percussive spectrum to audio signal in time librosa.output.write_wav('./Data/harmonic.wav',y_harm, sr) librosa.output.write_wav('./Data/percussive.wav', y_perc, sr) print('Original Audio'); ipd.display(ipd.Audio('Data/disco.wav')) print('Harmonic Audio'); ipd.display(ipd.Audio('Data/harmonic.wav')) print('Percussive Audio'); ipd.display(ipd.Audio('Data/percussive.wav')) # # ¡THANKS! # # | | | # |---|---| # |

joserzapata.github.io

| # |

@joserzapata

| # |

@joserzapata

| # |

joserzapata [@] upb.edu.co

| # | | |