#!/usr/bin/env python
# coding: utf-8

# ## Violin Plot with Python Plotly

# A violin plot is a trace that visually encodes the distribution of a data set, along with its summary statistics.
# It displays the graph of the estimated probability density function (pdf) mirrored about the y-axis, and, inside
# the violin-shaped region, the elements of a box plot (median, lower and upper quartile, whisker position).
#
# In this Jupyter Notebook we define functions to get the Plotly plot of a violin plot. In order to get more insight
# into distributional properties we add the option to overlay the rug plot of the data set onto the same axis.

# In[2]:

from IPython.display import HTML
HTML('')


# In[3]:

import numpy as np
import pandas as pd
from scipy import stats


# Compute the summary statistics of data:

# In[4]:

def _percentile(x, q, method):
    """Return the q-th percentile of x using the given estimation method.

    NumPy 1.22 renamed the ``interpolation`` keyword of ``np.percentile`` to
    ``method``; try the new spelling first and fall back for older releases.
    """
    try:
        return np.percentile(x, q, method=method)
    except TypeError:  # NumPy < 1.22 does not know `method`
        return np.percentile(x, q, interpolation=method)


def calc_stats(data):
    """Compute the box-plot summary statistics of a one-dimensional data set.

    Parameters
    ----------
    data : array-like of numbers (list, numpy array, pandas series, ...)

    Returns
    -------
    tuple
        ``(vals_min, vals_max, q1, q2, q3, d1, d2)`` where ``q1``/``q2``/``q3``
        are the lower quartile, median and upper quartile, and ``d1``/``d2``
        are the whisker end points.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    # `np.float` was only an alias of the builtin and was removed in NumPy 1.24.
    x = np.asarray(data, float)
    if x.size == 0:
        raise ValueError('calc_stats requires at least one data value')

    vals_min = np.min(x)
    vals_max = np.max(x)
    q2 = _percentile(x, 50, 'linear')
    q1 = _percentile(x, 25, 'lower')
    q3 = _percentile(x, 75, 'higher')
    IQR = q3 - q1
    whisker_dist = 1.5 * IQR

    # In order to prevent drawing whiskers outside the interval of data,
    # the whisker positions are snapped to the most extreme data values
    # that still lie within 1.5 * IQR of the quartiles.
    d1 = np.min(x[x >= (q1 - whisker_dist)])
    d2 = np.max(x[x <= (q3 + whisker_dist)])
    return vals_min, vals_max, q1, q2, q3, d1, d2


# In[5]:

# NOTE: `plotly.plotly` is the online-plotting module of plotly < 4; from
# plotly 4 on, the same API is distributed as `chart_studio.plotly`.
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls


# Functions that define violin components:

# In[6]:

def make_half_violin(x, y, fillcolor='#1f77b4', linecolor='rgb(50,50,50)'):
    """Return the filled line trace for one half of the violin.

    ``x`` holds the (possibly negated, for the mirrored half) pdf values and
    ``y`` the grid points at which the pdf was evaluated.
    """
    # The mirrored half is passed in with negated pdf values; show the
    # magnitude so the hover label really is pdf(y) on both sides.
    text = ['(pdf(y), y)=({:0.2f}, {:0.2f})'.format(abs(xi), yi)
            for xi, yi in zip(x, y)]
    return Scatter(x=x,
                   y=y,
                   mode='lines',
                   name='',
                   text=text,
                   fill='tonextx',
                   fillcolor=fillcolor,
                   line=dict(width=0.5, color=linecolor, shape='spline'),
                   hoverinfo='text',
                   opacity=0.5
                   )


def make_rugplot(vals, pdf_max, distance, color='#1f77b4'):
    """Return the rug-plot trace: one tick per data value, drawn on a
    vertical line placed ``distance`` to the left of the violin."""
    return Scatter(y=vals,
                   x=[-pdf_max - distance] * len(vals),
                   marker=dict(color=color, symbol='line-ew-open'),
                   mode='markers',
                   name='',
                   showlegend=False,
                   hoverinfo='y'
                   )


def make_quartiles(q1, q3):
    """Return the thick vertical segment joining the lower and upper quartile."""
    return Scatter(x=[0, 0],
                   y=[q1, q3],
                   text=['lower-quartile: ' + '{:0.2f}'.format(q1),
                         'upper-quartile: ' + '{:0.2f}'.format(q3)],
                   mode='lines',
                   line=dict(width=4, color='rgb(0,0,0)'),
                   hoverinfo='text'
                   )


def make_median(q2):
    """Return the white square marker drawn at the median."""
    return Scatter(x=[0],
                   y=[q2],
                   text=['median: ' + '{:0.2f}'.format(q2)],
                   mode='markers',
                   marker=dict(symbol='square', color='rgb(255,255,255)'),
                   hoverinfo='text'
                   )


def make_non_outlier_interval(d1, d2):
    """Return the thin vertical segment between the two whisker end points."""
    return Scatter(x=[0, 0],
                   y=[d1, d2],
                   name='',
                   mode='lines',
                   line=dict(width=1.5, color='rgb(0,0,0)')
                   )


# Set axes:

# In[7]:

def make_XAxis(xaxis_title, xaxis_range):
    """Return an x-axis with the given title and range and no grid, zero
    line, axis line, ticks or tick labels."""
    xaxis = XAxis(title=xaxis_title,
                  range=xaxis_range,
                  showgrid=False,
                  zeroline=False,
                  showline=False,
                  mirror=False,
                  ticks='',
                  showticklabels=False,
                  )
    return xaxis


def make_YAxis(yaxis_title):
    """Return an auto-ranged y-axis with the given title, a visible axis
    line and tick labels, and no grid or zero line."""
    yaxis = YAxis(title=yaxis_title,
                  showticklabels=True,
                  autorange=True,
                  ticklen=4,
                  showline=True,
                  zeroline=False,
                  showgrid=False,
                  mirror=False)
    return yaxis


# Data values, `vals`, can be given in a numeric list, numpy array of shape (n, ) or a pandas series.
# Because a violin plot is symmetric with respect to a vertical axis, we define the range of x values
# in the plot either of the form `range=[-a,a]` or of the form `[-b,a]`, when a rug plot is overlaid.
# In[8]:

def create_violinplot(vals, fillcolor='#1f77b4', rugplot=True):
    """Build the traces of a violin plot for one data set.

    Parameters
    ----------
    vals : numeric list, numpy array of shape (n, ) or pandas series
    fillcolor : color used for the violin body and for the rug plot
    rugplot : when true, a rug plot of ``vals`` is added left of the violin

    Returns
    -------
    (plot_data, plot_xrange)
        ``plot_data`` is the list of traces (two half violins, whisker
        segment, quartile segment, median marker and, optionally, the rug
        plot); ``plot_xrange`` is the x-axis range that fits all of them.
    """
    # `np.float` was only an alias of the builtin and was removed in NumPy 1.24.
    vals = np.asarray(vals, float)
    vals_min, vals_max, q1, q2, q3, d1, d2 = calc_stats(vals)  # summary statistics

    pdf = stats.gaussian_kde(vals)  # kernel density estimation of pdf
    xx = np.linspace(vals_min, vals_max, 100)  # grid over the data interval
    yy = pdf(xx)  # evaluate the pdf at the grid xx
    max_pdf = np.max(yy)

    # distance from the violin plot to rugplot
    distance = 2.0 * max_pdf / 10 if rugplot else 0
    # range for x values in the plot
    plot_xrange = [-max_pdf - distance - 0.1, max_pdf + 0.1]

    plot_data = [make_half_violin(-yy, xx, fillcolor=fillcolor),
                 make_half_violin(yy, xx, fillcolor=fillcolor),
                 make_non_outlier_interval(d1, d2),
                 make_quartiles(q1, q3),
                 make_median(q2)]
    if rugplot:
        plot_data.append(make_rugplot(vals, max_pdf,
                                      distance=distance,
                                      color=fillcolor))
    return plot_data, plot_xrange


# Let us define first a single violin plot:

# In[37]:

df = pd.read_excel('Violin-plot-data.xlsx')
df.head()


# In[38]:

x = list(df['Score'])
plot_data, plot_xrange = create_violinplot(x, fillcolor='rgb(102,194,163)')


# In[39]:

layout = Layout(title='Violin and Rug Plot',
                autosize=False,
                font=Font(size=11),
                height=450,
                showlegend=False,
                width=350,
                xaxis=make_XAxis('', plot_xrange),
                yaxis=make_YAxis(''),
                hovermode='closest'
                )


# In[40]:

# For the single violin the y-axis line, ticks and tick labels are hidden.
layout['yaxis'].update(dict(showline=False,
                            showticklabels=False,
                            ticks=''))


# In[41]:

fig = Figure(data=Data(plot_data), layout=layout)


# In[42]:

py.sign_in('empet', 'my_api_key')
py.iplot(fig, filename='Violin-Plot-Example')


# Data summary encoded in a violin plot facilitates comparison of multiple data sets.
# In the following we generate a few data sets and their violin plots:

# In[9]:

np.random.seed(619517)
Nr = 250
y = np.random.randn(Nr)
gr = np.random.choice(list("ABCDE"), Nr)
norm_params = [(0, 1.2), (0.7, 1), (-0.5, 1.4), (0.3, 1), (0.8, 0.9)]  # mean and standard deviations

for letter, (mean, sd) in zip("ABCDE", norm_params):
    # Turn the standard-normal draws of this group into N(mean, sd**2)
    # samples: scale by the standard deviation, then shift by the mean.
    # (`y *= sd + mean` would multiply by the sum and never shift.)
    mask = gr == letter
    y[mask] = y[mask] * sd + mean
df = pd.DataFrame(dict(Score=y, Group=gr))
df.head()


# Group data:

# In[10]:

# Group by the plain column name so that `get_group` can be called with a
# scalar label (a length-1 list key makes the group keys 1-tuples in recent pandas).
gb = df.groupby('Group')
group_name = ['A', 'B', 'C', 'D', 'E']
L = len(group_name)


# Each violin plot will be displayed in a subplot:

# In[11]:

fig = tls.make_subplots(rows=1,
                        cols=L,
                        shared_yaxes=True,
                        horizontal_spacing=0.025,
                        print_grid=True)


# Set colors for violins:

# In[12]:

violet_colors = ['#604d9e', '#6c4774', '#9e70a2', '#caaac2', '#d6c7dd']


# Get plot data for each group, and assign them to the corresponding subplot:

# In[13]:

# The loop variable is deliberately not called `gr`, so the array of group
# labels defined above is not overwritten.
for k, (name, color) in enumerate(zip(group_name, violet_colors)):
    # `np.float` was only an alias of the builtin and was removed in NumPy 1.24.
    vals = np.asarray(gb.get_group(name)['Score'], float)
    plot_data, plot_xrange = create_violinplot(vals, fillcolor=color)
    for item in plot_data:
        fig.append_trace(item, 1, k + 1)
    fig['layout'].update({'xaxis{}'.format(k + 1):
                          make_XAxis('Group ' + '{:d}'.format(k + 1), plot_xrange)})

fig['layout'].update({'yaxis{}'.format(1): make_YAxis('')})  # set the sharey axis style


# In[14]:

pl_width = 900
pl_height = 500
title = 'Violin Plots'

fig['layout'].update(title=title,
                     font=Font(family='Georgia, serif'),
                     showlegend=False,
                     hovermode='closest',
                     autosize=False,
                     width=pl_width,
                     height=pl_height,
                     margin=Margin(
                         l=65,
                         r=65,
                         b=85,
                         t=150
                     )
                     )


# In[15]:

py.sign_in('empet', 'my_api_key')
py.iplot(fig, filename='Multiple-Violins')


# In[16]:

from IPython.core.display import HTML


def css_styling():
    """Read ./custom.css and return its contents wrapped in an IPython HTML object."""
    # The context manager closes the file handle even if reading fails.
    with open("./custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()