#!/usr/bin/env python
# coding: utf-8

# # Solution: Pandas and Seaborn
# ## Data Science with Python
# ## Yoav Ram

# In[1]:

# NOTE: this file is a Jupyter notebook export; `get_ipython` is supplied by
# IPython, so the script is meant to be run with IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

import os.path
import urllib.request
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
# Import the submodule explicitly: on older SciPy releases a bare
# `import scipy` does not make `scipy.stats` available.
import scipy.stats
import seaborn as sns

sns.set_context('notebook')
sns.set_palette('muted')


# # Exercise 1
#
# In this question we will look for a relation between litter or clutch size (number of offspring per birth) and the birth weight (the weight of the offspring) in the animal kingdom.
#
# For this analysis we will load the [AnAge](http://genomics.senescence.info/download.html#anage) dataset that we used in the [Pandas and Seaborn](../sessions/pandas-seaborn.ipynb) session.

# Open the data file and read the data to a `DataFrame`. We are interested in the `Litter/Clutch size` and `Birth weight (g)` columns.

# In[2]:

# Path and column names are used by several cells below, so name them once.
DATA_PATH = '../data/anage_data.txt'
LITTER_COL = 'Litter/Clutch size'
BIRTH_WEIGHT_COL = 'Birth weight (g)'
LOG_LITTER_COL = 'Log Litter/Clutch size'
LOG_BIRTH_WEIGHT_COL = 'Log Birth weight (g)'

data = pd.read_csv(DATA_PATH, sep='\t')


# If you examined the data you might have noticed that some rows have a `NaN` value in our columns of interest.
# We need to remove these rows from the data.
# You can use `np.isnan`, `np.isfinite` or any other method you'd like.

# In[3]:

# Keep only rows where both columns hold finite numbers. The explicit copy
# makes `data` an independent frame, so adding columns to it below does not
# trigger pandas' SettingWithCopyWarning.
has_both_values = np.isfinite(data[LITTER_COL]) & np.isfinite(data[BIRTH_WEIGHT_COL])
data = data[has_both_values].copy()

assert not np.isnan(data[LITTER_COL]).any()  ###
assert not np.isnan(data[BIRTH_WEIGHT_COL]).any()  ###


# We are looking for a possible linear relationship between the variables.
#
# Plot a scatter plot of the data, and consider if you should transform the data using a transformation such as log, square-root, reciprocal ($1/x$) or some other transformation that makes sense to you.
# Both columns might require transformations.
#
# Plot a new scatter plot of the transformed data.
#
# Don't forget the axes labels should change to reflect the transformation!
# But in the plot I left below I didn't put the name of the transformation I chose so that it is a _secret_...

# In[4]:

# Natural-log transform of both variables; the regression below is fitted
# on these transformed columns.
data[LOG_LITTER_COL] = np.log(data[LITTER_COL])
data[LOG_BIRTH_WEIGHT_COL] = np.log(data[BIRTH_WEIGHT_COL])

# Markers only (no connecting line) turns the line plot into a scatter plot.
data.plot(x=LOG_LITTER_COL, y=LOG_BIRTH_WEIGHT_COL, marker='.', ls='')
plt.xlabel(LOG_LITTER_COL)
plt.ylabel(LOG_BIRTH_WEIGHT_COL)
plt.legend().set_visible(False)


# After applying a transformation for which there is a good correlation between the litter size and the birth weight, create a linear model plot using Seaborn.

# In[5]:

sns.lmplot(
    x=LOG_LITTER_COL,
    y=LOG_BIRTH_WEIGHT_COL,
    data=data,
    scatter_kws=dict(alpha=0.2),
)
plt.xlabel(LOG_LITTER_COL)
plt.ylabel(LOG_BIRTH_WEIGHT_COL)


# ### Bonus questions
#
# Now we should be satisfied that there is a linear relationship between the variables.
#
# Please perform linear regression on the transformed data using `scipy.stats.linregress`.
# Print the intercept and slope of the regression.
#
# Note that if you used a different transformation you might get different values; that's OK as long as your results make sense.

# In[6]:

slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    x=data[LOG_LITTER_COL],
    y=data[LOG_BIRTH_WEIGHT_COL],
)
print("intercept: {:.3f}, slope: {:.3f}".format(intercept, slope))  ###


# Note that `linregress` provides a p-value for the null hypothesis that the slope is 0.
#
# Think: What does it mean if the null hypothesis is rejected?
#
# Decide if the null hypothesis can be rejected and print a statement that summarizes the result regarding litter size and birth weight.
#
# Don't forget to print the p-value to support your claim.

# In[7]:

print("p-value={:.2g}".format(p_value))
if p_value < 0.05:
    # Slope is significantly different from zero; its sign gives the direction.
    if slope > 0:
        print("Litter size has significant and positive effect on birth weight")
    else:
        print("Litter size has significant and negative effect on birth weight")
else:
    print("Litter size does not have a significant effect on birth weight")


# Finally, use the slope and intercept to predict the birth weight of offspring in a litter with 10 offspring (don't forget the transformation!).

# In[8]:

# The model is linear in log-log space, so transform the litter size with
# log before applying it and undo the log on the prediction with exp.
print(
    "In a litter with 10 offspring, the birth weight will be",
    np.exp(intercept + slope * np.log(10.)),
    "grams",
)


# # Exercise 2
#
# In this analysis we will compare the body temperature of animals to check if indeed there is such a thing as [warm-blooded](http://en.wikipedia.org/wiki/Warm-blooded) and cold-blooded animals.
#
# You'll need to reload the data since in the previous exercise you removed some of the rows.

# In[9]:

data = pd.read_csv(DATA_PATH, sep='\t')


# The temperatures are in Kelvin in the `Temperature (K)` column, and we like degrees Celsius, so transform the temperature to Celsius and save the result in a new column.
#
# Note: SciPy has a special function for Kelvin to Celsius conversion: `scipy.constants.convert_temperature`.

# In[12]:

from scipy.constants import convert_temperature

TEMPERATURE_COL = 'Temperature (C)'

data[TEMPERATURE_COL] = convert_temperature(data['Temperature (K)'], 'K', 'C')


# Plot a histogram of the temperatures (in Celsius). Don't forget to use meaningful `bins`.

# In[13]:

# 46 edges from 0 to 45 give 45 bins that are exactly one degree wide
# (45 edges would give 44 bins of ~1.02 degrees each).
bins = np.linspace(0, 45, 46)
plt.hist(data[TEMPERATURE_COL], bins=bins, lw=0)
plt.xlabel("Temperature (C)")
plt.ylabel("# Species")


# Count how many species we have in the data frame in each `Class`.
# Remove from the data classes with fewer than 10 species (you can do this manually by specifying the class names or automatically using the count you calculated).
#
# Don't forget to clean the data frame from rows with missing data in the temperature column.

# In[14]:

data = data[np.isfinite(data[TEMPERATURE_COL])]
# Printed explicitly: a bare expression shows nothing when run as a script.
class_counts = data['Class'].value_counts()
print(class_counts)


# In[15]:

# Manual removal, as the exercise allows. Presumably Aves is the only class
# with fewer than 10 species in the counts printed above -- verify against
# that output if the dataset changes.
data = data[data["Class"] != 'Aves']


# Plot a separate histogram of the temperature for each animal Class.
#
# Use a [faceted figure](https://seaborn.pydata.org/examples/faceted_histogram.html).
# Don't forget axes labels, proper bins, and a reasonable figure size.

# In[16]:

# One panel per class; the x axis is shared so the temperature ranges are
# comparable, while each panel keeps its own y scale.
g = sns.FacetGrid(
    data=data,
    col="Class",
    margin_titles=True,
    sharex=True,
    sharey=False,
    height=4.5,
)
g.map(plt.hist, TEMPERATURE_COL, bins=bins, lw=0)
g.set_ylabels("# Species")


# ### Bonus questions
#
# Perform a [t-test](http://iaingallagher.tumblr.com/post/50980987285/t-tests-in-python) to verify that the temperature of mammals is, on average, higher than the temperature of amphibians.
# Print the result of the t-test.
#
# Note: see the end of the [Pandas and Seaborn](../sessions/pandas-seaborn.ipynb) session, in which there was an example of using a t-test.

# In[17]:

mammalia = data.loc[data.Class == 'Mammalia', TEMPERATURE_COL]
amphibia = data.loc[data.Class == 'Amphibia', TEMPERATURE_COL]

# equal_var=False selects Welch's t-test, which does not assume that the two
# groups have equal variances. The test is two-sided, which matches the
# "different?" wording of the printed summary.
t, p_value = scipy.stats.ttest_ind(mammalia, amphibia, equal_var=False)
H0_rejected = p_value < 0.05
print("The temperatures of mammals and amphibians are different? {}".format(H0_rejected))  ###
print("P-value: {:.2g}".format(p_value))  ###


# **End**