#!/usr/bin/env python # coding: utf-8 # In[1]: import numpy as np import pandas as pd import matplotlib.pyplot as plt import numpy.random as random from IPython.display import Image # seaborn is under active development and throws some scary looking warnings import warnings # this will allow us to use the code in peace :) warnings.filterwarnings("ignore") # # #### Lecture 17: # # - Learn how to use the **seaborn** package to produce beautiful plots # # - Learn about kernel density estimates # # - Learn appropriate ways of representing different types of data # ### seaborn # # In this lecture, we will learn about **seaborn**. **seaborn** is a package with many tools for data visualization. It allows you to make pretty plots. Almost anything in **seaborn** can be done using **matplotlib**, but with **seaborn**'s built-in functions you can reduce a lot of **matplotlib** code down to a single line. **seaborn** isn't just a pretty face. Its real power is in statistical data analysis. It has a lot of functions built in for visualizing the distribution of your data, for example. # # Let's take a look at some of the plots we can make with this package. We can import it using: # In[2]: import seaborn as sns # ### Unusual distributions, kernel density estimates and jointplots # # In some cases, we have distributions of data that don't look like a simple (e.g., normal) distribution, for example, the data could be bimodal or have skewed shaped distributions (remember the histogram of the elevation data from around the world with two humps). # # Let's create some synthetic bimodal data by drawing from two separate normal/lognormal distributions and combine the two into two bimodal data sets. We do this by drawing from **random.normal( )** twice for two normal distributions ($x_1,x_2$) and twice from **random.lognormal( )** for two lognormal distributions ($y_1,y_2$). # # In[3]: xdata1=random.normal(20,25,5000) # first x draw xdata2=random.normal(100,25,5000) # second x draw ydata1=random.lognormal(2,0.1,8000) # first y draw ydata2=random.lognormal(3,0.1,2000) # second y draw xdata=np.append(xdata1,xdata2) # combine the two x data sets ydata=np.append(ydata1,ydata2) # combine the two y data sets # When we plot our xdata as a histogram, we can see that we have a broadly bimodal distribution. For fun, let's also plot the mean of the distribution as a red line. # In[4]: # make the histogram plt.hist(xdata,bins=50) # put on a heavy (linewidth=3) vertical red line at the mean of xdata plt.axvline(np.mean(xdata),color='red',linewidth=3); # We can see that our mean lies right between the twin peaks. Describing this distribution with statistics meant for normal distributions (mean or standard deviation) is just plain wrong. # # Another way to represent the distribution of a set of datapoints is known as a _kernel density estimate_ (kde). This places a 'kernel' (an assumed distribution at the data point level - usually a normal distribution) at each data point and then sums up the contributions from all the data points. Kernal density estimates avoid the awkwardness of choice of bin size associated with histograms, for example. (We just picked 50 in the plot above - why 50?). # # Here are some data represented on a bar plot in the lefthand plot. And on the right, we illustrate the idea behind kernal density estimates. The black lines are the locations of individual datapoints and the red dashed lines are the kernels at each point. The heavy blue line is the kernel density estimate (the sum of all the red dashed lines). # In[5]: Image('Figures/KDE.png') # \[_Source:_ https://commons.wikimedia.org/wiki/File:Comparison_of_1D_histogram_and_KDE.png _Wikimedia Creative Commons_\] # Happily, we can plot kernel density estimates using the **sns.kdeplot( )** function. The **shade** argument allows us to shade the area underneath the curve. By the way, in **matplotlib**, the same thing can be achieved using the function **plt.fill_between**. # In[6]: sns.kdeplot(xdata,shade=True); # Well that was fairly painless! We can also plot the kernal density estimate and the histogram on top of one another using the **sns.distplot( )** function. # In[6]: sns.distplot(xdata); # As you can see, this is a lot quicker than how we were plotting our distribution in the lecture on statistics! # # With our $ydata$ we can see that we also have a bimodal distribution, but there are far fewer data points in the wider mode (we only used 2000 of our 10000 points for this mode). # In[7]: sns.distplot(ydata); # What if our data had both $x$ and $y$ components. For example, measurements of length and width from a set of shark teeth with two species in it. How would we visualize it? Let's just try plotting the $x$ data on the x axis and $y$ data on the y axis as dots on a regular **matplotlib** plot. # In[8]: plt.plot(xdata,ydata,'.'); # It's much harder to see the bimodal distribution in the $x$ data in this case, and we can't really see that there are so much fewer data points in the $y$ data mode than the more tightly clustered one. Also, this is not a plot you would expect from length and width dimensions of teeth from a single species - wouldn't that be a linear plot? # # When we have a lot of datapoints, this type of plot gives us no easy way to estimate data density. # Fortunately, **seaborn** includes a cool plot called '**sns.jointplot( )**. This combines the histograms and the scatter plot into a single graph. # In[9]: sns.jointplot(xdata,ydata); # Jointplot can also do kernel density estimates! The **kind** argument on this plot gives us many other options plotting data as well: # In[11]: help(sns.jointplot) # Let's try the **kind=kde** option. (Warning, doing a 2d kde plot like this is a lot of work so this cell might take a while to run). # In[12]: sns.jointplot(xdata,ydata,kind='kde'); # Another type of plot that looks nice is the hexbin plot (kind of like a hexagonal 2d histogram) # In[13]: sns.jointplot(xdata,ydata,kind='hexbin'); # ### Multi dimensional data and pairplots # # What if you have data with more than two dimensions? A good example of this is with isotopic data from Ocean Island Basalts. Isotopic systems are used to "finger-print" different sources of melt in the mantle. It is used to characterize what is deep in the Earth using what gets brought up to form the ocean islands. By now there are data available for many different isotopic systems. Here we take a look at a small sample of what is in the [GeoRoc database (http://georoc.mpch-mainz.gwdg.de/georoc/)](http://georoc.mpch-mainz.gwdg.de/georoc/) for ocean island basalts. # In[12]: MantleArray=pd.read_csv('Datasets/GeoRoc/MantleArray_OIB.csv') MantleArray.head() # There are lots of different isotope ratios available. In this lecture, we focus on four different isotopic ratios: $^{87}$Sr/$^{86}$Sr , $^{206}$Pb/$^{204}$Pb , $ \varepsilon$ Nd and $ \varepsilon$ Hf. (The ratios with an $ \varepsilon$ are the isotope ratios relative to a standard value in parts per 10000, this allows you to see the variation better as it is normally very small). # # So, how do you plot multi-dimensional data? Our new plotting buddy **seaborn** has a function for multi-dimensional data known as the **sns.pairplot( )**. This makes a plot that takes data with many dimensions and plots each possible two dimensional combination against one another in a grid where each of the rows and columns represents a dimension. When a particular dimension is plotted against itself, however, it gives the kernal density estimates for the different data types (set by the keyword argument, **hue**). In this case we will be using $LOCATION$ to group the data). # # In our isotopic example, the top row plots $ \varepsilon$ Nd against all of the other isotope ratios in turn with the first plot on the left being the kernal density plot for each location with each in a different color. # # Let's see it in action (warning, cell may take a while to run because of the large number of locations): # In[15]: sns.pairplot(MantleArray,\ vars=['EPSILON_ND','SR87_SR86','PB206_PB204','EPSILON_HF'],\ hue='LOCATION'); # **sns.pairplot( )** gives us an interesting way of seeing trends in multidimensional space. Some ocean islands have quite different isotopic compositions from others! It's important to note that some projections of these data (e.g. $^{206}$Pb/$^{204}$Pb vs $ \varepsilon$ Nd) show this effect more prominently whereas others, like $ \varepsilon$ Nd vs $ \varepsilon$ Hf, are more subtle. In this plot (second from the bottom on the left), the Austral-Cook Islands are all the way on the top to the right, and Pitcairn is way at the bottom to the left. # # It might be surprising to some of you who know a little geophysics that there is heterogeneity between different ocean islands. You might have thought that the mantle is very well mixed by convection. Many geochemists believe that the source regions for the plumes that form these basalts have different compositions because subduction brings down crustal material with different compositions to the base of the mantle, and this material then upwells in plumes that form the ocean islands. According to that hypothesis, the variation in isotopic ratios we see here comes from mixing between these recycled crustal end members and a mantle composition. # # We will look further at this dataset when we learn how to plot things in three dimensions and also when we group the data into 'end-members' using a bit of machine learning. # ### Box and whisker, violin plots and swarm plots # # The kernal density estimate plots in the element versus itself plots in the **sns.pairplot( )** plot example above plot all of the isotopic data on one plot. This doesn't really allow us to appreciate fully the variability at a given location. There are two additional plots that might give us a better idea of that: the 'box and whisker' plot (very popular, but I hate them) and the 'violin' plots (love these!). # # **seaborn** has functions (actually classes) for both of these, **sns.boxplot( )** and **sns.violinplot( )**. In these plots, we can also place the data (and not just the distributions) and can use the handy **sns.swarmplot( )** to spread out the data along the x-axis so that we can see them better. We'll go through each of these tricks in the following. # # First, the boxplot: # # In[16]: help(sns.boxplot) # Let's look at the $\epsilon$ Hf data as a function of location (similar to the lower right hand corner plot in the pairplot example above: # In[17]: box=sns.boxplot(data=MantleArray,x='LOCATION',y='EPSILON_HF') # Oh my. Pretty plot, but the x axis tick labels overlap so badly we can't even see them. To rotate them, we use the figure object as we have done before. Instead of **fig**, we will call the figure object **box** in the following. This will let us use **plt.setp( )** to set properties (such as the x tick labels) on our plot object, **box**. # In[10]: help(plt.setp) # In[13]: box=sns.boxplot(data=MantleArray,x='LOCATION',y='EPSILON_HF') plt.setp(box.get_xticklabels(), rotation=90); # The x axis labels are better, but the plot still doesn't look great. The black diamonds are known as 'fliers' and could represent "outliers". # # As we have some pretty large "outliers" in this dataset, we need to either limit the range on the y axis or by making the plot larger. These fliers are annoying (although they represent something that maybe we shouldn't be ignoring...). It is possible to exclude them from the box and whisker plot, using the argument **fliersize=0**. # # You need to be aware that this can be a problematic practice as you are effectively hiding data from the viewer. Personally, I wouldn't do it... But lot's of folks do, so here is how: # In[14]: plt.figure(figsize=(10,8)) # make the plot bigger box=sns.boxplot(data=MantleArray,x='LOCATION',y='EPSILON_HF',fliersize=0) plt.setp(box.get_xticklabels(), rotation=90); # See! This looks much better. But looks can be deceiving! # # # Alternatively, we can put the data back on, but use a white dot with a black edge, and spread them out so we can see them. # In order to see the data points that overlap one another, **seaborn** has an option to plot the data points in a cloud using the function **sns.swarmplot( )**. Here we do both of these (and also make the points white with a black edge). The argument **alpha=0.5** is used here to make the points semi transparent. (**alpha** sets 'opacity' so **alpha=1** is fully opaque and **alpha=0**, points are invisible). # In[21]: plt.figure(figsize=(20,10)) box=sns.boxplot(data=MantleArray,x='LOCATION',y='EPSILON_HF',fliersize=0) plt.setp(box.get_xticklabels(), rotation=90); sns.swarmplot(data=MantleArray,x='LOCATION',y='EPSILON_HF',color='white',edgecolor='black',linewidth=1,alpha=0.5); # Very nice! Now you can see what the boxes are based on. And you can see right away that the dots are not drawn from normal distributions! # # Another way to visualize the same data is with a violin plot. It is similar to a box and whisker plot, but shows the kernel density estimates instead of just a 'box'. # In[22]: help(sns.violinplot) # In[23]: plt.figure(figsize=(20,10)) violin=sns.violinplot(data=MantleArray,x='LOCATION',y='EPSILON_HF',fliersize=0) plt.setp(violin.get_xticklabels(), rotation=90); # And if you want to put all the data back on (not just the ones inside the violins: # In[24]: plt.figure(figsize=(20,10)) violin=sns.violinplot(data=MantleArray,x='LOCATION',y='EPSILON_HF') plt.setp(violin.get_xticklabels(), rotation=90); sns.swarmplot(data=MantleArray,x='LOCATION',y='EPSILON_HF',color='white',edgecolor='black',linewidth=1,alpha=0.5);