import numpy as np import pandas as pd import patsy import seaborn as sns import statsmodels.formula.api as smf import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from matplotlib import cm import itertools %matplotlib inline sns.set_style("white") betas = np.array([1, 1, -2.]) x = np.linspace(0, 1) # vector for x y = np.linspace(0, 1) # vector for y X, Y = np.meshgrid(x, y) # grid of X and Y # the height Z is the sum of the weighted X and Ys (weighted by our betas): Z = X*betas[0] + \ Y*betas[1] + \ X*Y*betas[2] # create a matplotlib figure window, add 3D axes # http://matplotlib.org/examples/mplot3d/subplot3d_demo.html fig = plt.figure(figsize=(12, 4)) # fig = plt.figure(figsize=plt.figaspect(0.3)) # first view: ax = fig.add_subplot(1, 3, 1, projection='3d') ax.plot_surface(X, Y, Z, alpha=1, cmap=cm.coolwarm) ax.view_init(elev=15, azim=20) ax.set_xlabel('Fire') ax.set_xticks([0, .5, 1]) ax.set_ylabel('Hydrogen') ax.set_yticks([0, .5, 1]) ax.set_zlabel('Lift') ax.set_zticks([]) # second view: ax = fig.add_subplot(1, 3, 2, projection='3d') ax.plot_surface(X, Y, Z, alpha=1, cmap=cm.coolwarm) ax.view_init(elev=20, azim=55) ax.set_xlabel('Fire') ax.set_xticks([0, .5, 1]) ax.set_ylabel('Hydrogen') ax.set_yticks([0, .5, 1]) ax.set_zlabel('Lift') ax.set_zticks([]) # third view: ax = fig.add_subplot(1, 3, 3, projection='3d') ax.plot_surface(X, Y, Z, alpha=1, cmap=cm.coolwarm) ax.view_init(elev=10, azim=80) ax.set_xlabel('Fire') ax.set_xticks([0, .5, 1]) ax.set_ylabel('Hydrogen') ax.set_yticks([0, .5, 1]) ax.set_zlabel('Lift') ax.set_zticks([]) plt.show() # make a z vector z = x*betas[0] + \ y*betas[1] + \ x*y*betas[2] # put all the vectors into a new dataframe: df = pd.DataFrame({'fire': x, 'hydrogen': y, 'lift': z}) # fit regression with interaction, plot: sns.interactplot('fire', 'hydrogen', 'lift', df, cmap="coolwarm", filled=True, levels=25); # import the dataset csv file into a Pandas dataframe object: sleep = pd.read_csv('../Datasets/sleep.txt', sep='\t') sleep = sleep.dropna(axis=0) # drop on axis 0 (i.e. rows) # variable creation: sleep['log_BodyWt'] = np.log(sleep['BodyWt']) sleep['danger_cat'] = 'low' # set as a string, "low" for all. sleep.loc[sleep['Danger'] > 3, 'danger_cat'] = 'high' # set values of 4 and 5 to "high" sleep['life_cat'] = pd.qcut(sleep['LifeSpan'], [0, .33, .66, 1]) sleep['life_cat'] = sleep['life_cat'].cat.rename_categories(['short', 'med', 'long']) # rename categories sleep['life_cat'] = sleep['life_cat'].astype('object') # make it an object, rather than a category sleep.info() X = patsy.dmatrix('log_BodyWt * Gestation', sleep) X # convert to pandas df so seaborn is happy: tmp = pd.DataFrame(X, columns = ['intercept', 'log_bodyweight', 'gestation', 'interaction'] ) # plot pairplot: sns.pairplot(tmp, vars=['log_bodyweight', 'gestation', 'interaction']); fit_2 = smf.glm('TotalSleep ~ log_BodyWt * Gestation', sleep).fit() print(fit_2.summary()) x = np.linspace(sleep['log_BodyWt'].min(), sleep['log_BodyWt'].max()) # vector for x (log bodyweight) y = np.linspace(sleep['Gestation'].min(), sleep['Gestation'].max()) # vector for y (gestation) X, Y = np.meshgrid(x, y) # grid of X and Y # the height Z is the sum of the weighted X and Ys (weighted by our betas): Z = fit_2.params[0] + \ X*fit_2.params[1] + \ Y*fit_2.params[2] + \ X*Y*fit_2.params[3] # note the addition of the interaction term here! # create a matplotlib figure window, add 3D axes # http://matplotlib.org/examples/mplot3d/subplot3d_demo.html fig = plt.figure(figsize=(12, 4)) # first view: ax = fig.add_subplot(1, 3, 1, projection='3d') ax.plot_surface(X, Y, Z, cmap=cm.coolwarm) ax.view_init(elev=10, azim=20) ax.set_xlabel('log bodyweight (kg)') ax.set_ylabel('gestation (days)') ax.set_zlabel('Total sleep (hours)') # second view: ax = fig.add_subplot(1, 3, 2, projection='3d') ax.plot_surface(X, Y, Z, cmap=cm.coolwarm) ax.view_init(elev=10, azim=45) ax.set_xlabel('log bodyweight (kg)') ax.set_ylabel('gestation (days)') ax.set_zlabel('Total sleep (hours)') # third view: ax = fig.add_subplot(1, 3, 3, projection='3d') ax.plot_surface(X, Y, Z, cmap=cm.coolwarm) ax.view_init(elev=10, azim=70) ax.set_xlabel('log bodyweight (kg)') ax.set_ylabel('gestation (days)') ax.set_zlabel('Total sleep (hours)') plt.show() sns.interactplot("log_BodyWt", "Gestation", "TotalSleep", sleep, cmap="coolwarm", filled=True, levels=25); # read in the data, rename some values: planes = pd.read_csv('../Datasets/planes.txt', sep='\t') # column names to lowercase: planes.columns = [c.lower() for c in planes.columns] # turn "paper" variable into an object, not int64: planes['paper'] = planes['paper'].astype(object) planes.replace(to_replace={'paper': {1: '80', 2: '50'}, 'angle': {1: 'horz', 2: '45deg '}, 'plane': {1: 'sleek', 2: 'simple'}}, inplace=True) # rename "plane" to "design": planes = planes.rename(columns={'plane': 'design'}) print(planes) fit = smf.glm('distance ~ design * paper', planes).fit() print(fit.summary()) X = patsy.dmatrix('~ design * paper', planes) X # make a data frame containing the predictions, using the expand grid function. # define expand grid function: def expand_grid(data_dict): """ A port of R's expand.grid function for use with Pandas dataframes. from http://pandas.pydata.org/pandas-docs/stable/cookbook.html?highlight=expand%20grid """ rows = itertools.product(*data_dict.values()) return pd.DataFrame.from_records(rows, columns=data_dict.keys()) # build a new matrix with expand grid: preds = expand_grid({'paper': ['80', '50'], 'design': ['sleek', 'simple']}) preds_2 = preds.copy() preds['distance'] = fit.predict(preds) # also fit a model with no interaction for comparison: fit_2 = smf.glm('distance ~ paper + design', planes).fit() preds_2['distance'] = fit_2.predict(preds_2) # to plot model predictions against data, we should merge them together: # see answer (https://stackoverflow.com/questions/28239359/showing-data-and-model-predictions-in-one-plot-using-seaborn-and-statsmodels) merged = pd.concat(dict(data=planes, additive=preds_2, interaction=preds), names=['type']).reset_index() print(merged) # plot with seaborn: g = sns.factorplot('design', 'distance', 'paper', data=merged, col='type', kind='point', size=3.5) g.fig.subplots_adjust(wspace=0.3) sns.despine(offset=10); print(fit_2.summary()) X = patsy.dmatrix('~ log_BodyWt * danger_cat', sleep) X fit_2 = smf.glm('TotalSleep ~ log_BodyWt * danger_cat', sleep).fit() # interaction model print(fit_2.summary()) # Plot the two models against each other. # we do this slightly differently to the categorical examples above, # due to the need for some data re-arranging to plot the data as points # and the model predictions as lines. preds = expand_grid( {'log_BodyWt': np.linspace(-6, 9, 2), 'life_cat': ['short', 'med', 'long'], 'danger_cat': ['low', 'high']}) fit = smf.glm('TotalSleep ~ log_BodyWt + danger_cat', sleep).fit() # additive model fit_2 = smf.glm('TotalSleep ~ log_BodyWt * danger_cat', sleep).fit() # interaction model # add model predictions to data frames: preds['yhat'] = fit.predict(preds) preds['yhat_2'] = fit_2.predict(preds) merged = sleep.append(preds) # Plot additive model: g = sns.FacetGrid(merged, hue='danger_cat', size=4) # use the `map` method to add stuff to the facetgrid axes: g.map(plt.scatter, "log_BodyWt", "TotalSleep") g.map(plt.plot, "log_BodyWt", "yhat") sns.despine(offset=10) g.set_axis_labels('Bodyweight [log kg]', 'Sleep [hours]') g.add_legend(title='Danger') # plot interaction model: g = sns.FacetGrid(merged, hue='danger_cat', size=4) # use the `map` method to add stuff to the facetgrid axes: g.map(plt.scatter, "log_BodyWt", "TotalSleep") g.map(plt.plot, "log_BodyWt", "yhat_2") sns.despine(offset=10) g.set_axis_labels('Bodyweight [log kg]', 'Sleep [hours]') g.add_legend(title='Danger') from IPython.core.display import HTML def css_styling(): styles = open("../custom_style.css", "r").read() return HTML(styles) css_styling()