%matplotlib inline import numpy as np import matplotlib.pyplot as plt import pandas as pd !~/anaconda/bin/pip install brewer2mpl import brewer2mpl from matplotlib import rcParams #colorbrewer2 Dark2 qualitative color table dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7) dark2_colors = dark2_cmap.mpl_colors rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.facecolor'] = 'white' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'white' rcParams['patch.facecolor'] = dark2_colors[0] rcParams['font.family'] = 'StixGeneral' def remove_border(axes=None, top=False, right=False, left=True, bottom=True): """ Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn """ ax = axes or plt.gca() ax.spines['top'].set_visible(top) ax.spines['right'].set_visible(right) ax.spines['left'].set_visible(left) ax.spines['bottom'].set_visible(bottom) #turn off all ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') #now re-enable visibles if top: ax.xaxis.tick_top() if bottom: ax.xaxis.tick_bottom() if left: ax.yaxis.tick_left() if right: ax.yaxis.tick_right() pd.set_option('display.width', 500) pd.set_option('display.max_columns', 100) from IPython.display import Image Image(filename='Italy.png') df=pd.read_csv("data/olive.csv") df.head(5) print df.columns df.rename(columns={df.columns[0]:'areastring'}, inplace=True) df.columns print 'regions\t', df.region.unique() print 'areas\t', df.area.unique() pd.crosstab(df.area, df.region) df.head() df.areastring=df.areastring.map(lambda x: x.split('.')[-1]) df.head() df[['palmitic','oleic']].head() print df['palmitic'] print "type of df[['palmitic']]:\t", type(df[['palmitic']]) print "type of df['palmitic']:\t\t", type(df['palmitic']) df.palmitic #your code here df.areastring.unique() acidlist=['palmitic', 'palmitoleic', 'stearic', 'oleic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic'] #your code here dfsub=df[acidlist].apply(lambda x: x/100.0) dfsub.head() df[acidlist]=dfsub df.head() fig=plt.figure() plt.scatter(df.palmitic, df.linolenic) axis = fig.gca() #get current axis axis.set_title('linolenic vs palmitic') axis.set_xlabel('palmitic') axis.set_ylabel('linolenic') #fig can be got with fig.gcf() plt.hist(df.palmitic) fig, axes=plt.subplots(figsize=(10,10), nrows=2, ncols=2) axes[0][0].plot(df.palmitic, df.linolenic) axes[0][1].plot(df.palmitic, df.linolenic, '.') axes[1][0].scatter(df.palmitic, df.linolenic) axes[1][1].hist(df.palmitic) fig.tight_layout() xacids=['oleic','linolenic','eicosenoic'] yacids=['stearic','arachidic'] #your code here fig, axes=plt.subplots(figsize=(10,10), nrows=len(xacids), ncols=len(yacids)) for i, xacid in enumerate(xacids): for j, yacid in enumerate(yacids): axes[i][j].scatter(df[xacid],df[yacid]) axes[i][j].set_xlabel(xacid) axes[i][j].set_ylabel(yacid) fig.tight_layout() region_groupby = df.groupby('region') print type(region_groupby) region_groupby.head() for key, value in region_groupby: print "( key, type(value) ) = (", key, ",", type(value), ")" v=value v.head() dfrd=region_groupby.describe() print type(dfrd) dfrd.head(20) vecs=[] keys=[] for key, value in region_groupby: k=key v=value.std() print k, type(v), v dfbystd=df.groupby('region').std() dfbystd.head() dfbymean=region_groupby.aggregate(np.mean) dfbymean.head() region_groupby.aggregate(lambda x: x.palmitic.sum()) #probably not what u had in mind :-) region_groupby.apply(lambda f: f.mean()) region_groupby.apply(lambda f: f.palmitic.mean()) renamedict_std={k:k+"_std" for k in acidlist} renamedict_mean={k:k+"_mean" for k in acidlist} dfbystd.rename(inplace=True, columns=renamedict_std) dfbymean.rename(inplace=True, columns=renamedict_mean) dfbystd.head() dfpalmiticmean = dfbymean[['palmitic_mean']] dfpalmiticstd = dfbystd[['palmitic_std']] newdfbyregion=dfpalmiticmean.join(dfpalmiticstd) newdfbyregion.head() df.shape weights=np.random.uniform(size=df.shape[0]) smallerdf=df[['palmitic']] otherdf=df[['region']] otherdf['weight'] = weights otherdf.head() #your code here smallerdf=smallerdf.join(otherdf) smallerdf.head() #your code here def wfunc(f): return (f.palmitic*f.weight).sum()/f.weight.sum() smallerdf.groupby('region').apply(wfunc) #your code here def myfunc(column): return np.sum(column) region_groupby.aggregate(myfunc) rkeys=[1,2,3] rvals=['South','Sardinia','North'] rmap={e[0]:e[1] for e in zip(rkeys,rvals)} rmap mdf2=df.groupby('region').aggregate(np.mean) mdf2=mdf2[acidlist] mdf2.head() ax=mdf2.plot(kind='barh', stacked=True) ax.set_yticklabels(rvals) ax.set_xlim([0,100]) fig, axes=plt.subplots(figsize=(10,20), nrows=len(acidlist), ncols=1) i=0 colors=[dark2_cmap.mpl_colormap(col) for col in [1.0,0.5,0.0]] for ax in axes.flatten(): acid=acidlist[i] seriesacid=df[acid]#get the Pandas series minmax=[seriesacid.min(), seriesacid.max()] counts=[] nbins=30 histbinslist = np.linspace(minmax[0], minmax[1], nbins) counts=-np.diff([seriesacid[seriesacid>x].count() for x in histbinslist]).min() for k,g in df.groupby('region'): style = {'histtype':'step', 'color':colors[k-1], 'alpha':1.0, 'bins':histbinslist, 'label':rmap[k]} ax.hist(g[acid],**style) ax.set_xlim(minmax) ax.set_title(acid) ax.grid(False) #construct legend ax.set_ylim([0, counts]) ax.legend() i=i+1 fig.tight_layout() mask=(df.eicosenoic < 0.05) mask np.sum(mask), np.mean(mask) loweico=df[df.eicosenoic < 0.02] pd.crosstab(loweico.area, loweico.region) acidlistminusoleic=['palmitic', 'palmitoleic', 'stearic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic'] #your code here ax=region_groupby.aggregate(np.mean)[acidlistminusoleic].plot(kind="barh", stacked=True) ax.set_yticklabels(rvals); # just do the boxplot without the marginals to split the north out def make2d(df, scatterx, scattery, by="region", labeler={}): figure=plt.figure(figsize=(8,8)) ax=plt.gca() cs=list(np.linspace(0,1,len(df.groupby(by)))) xlimsd={} ylimsd={} xs={} ys={} cold={} for k,g in df.groupby(by): col=cs.pop() x=g[scatterx] y=g[scattery] xs[k]=x ys[k]=y c=dark2_cmap.mpl_colormap(col) cold[k]=c ax.scatter(x, y, c=c, label=labeler.get(k,k), s=40, alpha=0.4); xlimsd[k]=ax.get_xlim() ylimsd[k]=ax.get_ylim() xlims=[min([xlimsd[k][0] for k in xlimsd.keys()]), max([xlimsd[k][1] for k in xlimsd.keys()])] ylims=[min([ylimsd[k][0] for k in ylimsd.keys()]), max([ylimsd[k][1] for k in ylimsd.keys()])] ax.set_xlim(xlims) ax.set_ylim(ylims) ax.set_xlabel(scatterx) ax.set_ylabel(scattery) ax.grid(False) return ax a=make2d(df, "linoleic","arachidic", labeler=rmap) a.legend(loc='upper right'); import pandas.tools.rplot as rplot dfcopy=df.copy() dfcopy['region']=dfcopy['region'].map(rmap) imap={e[0]:e[1] for e in zip (df.area.unique(), df.areastring.unique())} #dfcopy['area']=dfcopy['area'].map(imap) plot = rplot.RPlot(dfcopy, x='linoleic', y='oleic'); plot.add(rplot.TrellisGrid(['region', '.'])) plot.add(rplot.GeomPoint(size=40.0, alpha=0.3, colour=rplot.ScaleRandomColour('area'))); fig=plot.render() print df.areastring.unique() #your code here plot = rplot.RPlot(dfcopy, x='palmitic', y='palmitoleic'); plot.add(rplot.TrellisGrid(['region', '.'])) plot.add(rplot.GeomPoint(size=40.0, alpha=0.3, colour=rplot.ScaleRandomColour('area'))); fig=plot.render() print dfcopy.areastring.unique() #adapted from https://github.com/roban/quarum/blob/master/margplot.py from mpl_toolkits.axes_grid1 import make_axes_locatable def setup_mhist(axes, figure): ax1=axes divider = make_axes_locatable(ax1) ax2 = divider.append_axes("top", 1.5, pad=0.0, sharex=ax1) ax3 = divider.append_axes("right", 1.5, pad=0.0, sharey=ax1) #xscale=yscale='log' #ax2.set_yscale(yscale) #ax3.set_xscale(xscale) #ax2.set_ylim([0,1]) #ax3.set_xlim([0,5]) ax2.grid(False) ax3.grid(False) ax2.grid(axis="y", color="white", linestyle='-', lw=1) ax3.grid(axis="x", color="white", linestyle='-', lw=1) remove_border(ax2, right=True, left=False) remove_border(ax3, right=False, left=True, bottom=False, top=True) figure.subplots_adjust(left=0.15, right=0.95) return [ax1,ax2,ax3] #BUG: need to get appropriate min and max amongst the multiple marginal hists #BUG: need to get highest frequency marked as label when we do this. def make_mhist(axeslist, x, y, color='b', mms=8): ax1 = axeslist[0] ax2 = axeslist[1] ax3 = axeslist[2] #print list(ax2.get_yticklabels()) for tl in (ax2.get_xticklabels() + ax2.get_yticklabels() + ax3.get_xticklabels() + ax3.get_yticklabels()): tl.set_visible(False) #for tl in ( ax2.get_xticklabels() + ax3.get_yticklabels()): # tl.set_visible(False) histbinslist = [np.ceil(len(x)/20.), np.ceil(len(y)/20.)] histbinslist = copy.copy(histbinslist) #style = {'histtype':'stepfilled', 'color':color, 'alpha':0.6, 'normed':True, 'stacked':True} style = {'histtype':'stepfilled', 'color':color, 'alpha':0.4} nbins = histbinslist[0] x_range = [np.min(x), np.max(x)] histbinslist[0] = np.linspace(x_range[0], x_range[1], nbins) ax2.hist(x, histbinslist[0], **style) nbins = histbinslist[1] y_range = [np.min(y), np.max(y)] histbinslist[1] = np.linspace(y_range[0], y_range[1], nbins) ax3.hist(y, histbinslist[1], orientation='horizontal', **style) import random import copy def scatter_by(df, scatterx, scattery, by=None, figure=None, axes=None, colorscale=dark2_cmap, labeler={}, mfunc=None, setupfunc=None, mms=8): cs=copy.deepcopy(colorscale.mpl_colors) if not figure: figure=plt.figure(figsize=(8,8)) if not axes: axes=figure.gca() x=df[scatterx] y=df[scattery] if not by: col=random.choice(cs) axes.scatter(x, y, cmap=colorscale, c=col) if setupfunc: axeslist=setupfunc(axes, figure) else: axeslist=[axes] if mfunc: mfunc(axeslist,x,y,color=col, mms=mms) else: cs=list(np.linspace(0,1,len(df.groupby(by)))) xlimsd={} ylimsd={} xs={} ys={} cold={} for k,g in df.groupby(by): col=cs.pop() x=g[scatterx] y=g[scattery] xs[k]=x ys[k]=y c=colorscale.mpl_colormap(col) cold[k]=c axes.scatter(x, y, c=c, label=labeler.get(k,k), s=40, alpha=0.3); xlimsd[k]=axes.get_xlim() ylimsd[k]=axes.get_ylim() xlims=[min([xlimsd[k][0] for k in xlimsd.keys()]), max([xlimsd[k][1] for k in xlimsd.keys()])] ylims=[min([ylimsd[k][0] for k in ylimsd.keys()]), max([ylimsd[k][1] for k in ylimsd.keys()])] axes.set_xlim(xlims) axes.set_ylim(ylims) if setupfunc: axeslist=setupfunc(axes, figure) else: axeslist=[axes] if mfunc: for k in xs.keys(): mfunc(axeslist,xs[k],ys[k],color=cold[k], mms=mms); axes.set_xlabel(scatterx); axes.set_ylabel(scattery); return axes def make_rug(axeslist, x, y, color='b', mms=8): axes=axeslist[0] zerosx1=np.zeros(len(x)) zerosx2=np.zeros(len(x)) xlims=axes.get_xlim() ylims=axes.get_ylim() zerosx1.fill(ylims[1]) zerosx2.fill(xlims[1]) axes.plot(x, zerosx1, marker='|', color=color, ms=mms) axes.plot(zerosx2, y, marker='_', color=color, ms=mms) axes.set_xlim(xlims) axes.set_ylim(ylims) return axes #BUG: remove ticks and maybe even border on top and right ax=scatter_by(df, 'linoleic', 'eicosenoic', by='region', labeler=rmap, mfunc=make_rug, mms=20) ax.grid(False) ax.legend(loc='upper right'); ax=scatter_by(df, 'linoleic', 'arachidic', by='region', labeler=rmap, setupfunc=setup_mhist, mfunc=make_mhist, mms=20) ax.grid(False) ax.legend(loc='upper right'); ax=scatter_by(df, 'linoleic', 'eicosenoic', by='region', labeler=rmap, setupfunc=setup_mhist, mfunc=make_mhist, mms=20) ax.grid(False) ax.legend(loc='upper right'); import scipy.stats as stats mu=0. sigma=1. samples=np.random.normal(mu, sigma, 10000) plt.hist(samples,bins=25, normed=True) nd=stats.norm() plt.hist(nd.rvs(size=10000), bins=25, alpha=0.5,normed=True) x=np.linspace(-4.0,4.0,100) plt.plot(x,nd.pdf(x)) plt.plot(x,nd.cdf(x)) mean = [0,0] cov = [[1,0],[0,5]] # diagonal covariance, points lie on x or y-axis m=300 nrvs = np.random.multivariate_normal(mean,cov,(m,m)) duets=nrvs.reshape(m*m,2) print duets[:,1] normaldf=pd.DataFrame(dict(x=duets[:,0], y=duets[:,1])) normaldf.head() ax=scatter_by(normaldf, 'x', 'y', figure=plt.figure(figsize=(8,10)),setupfunc=setup_mhist, mfunc=make_mhist, mms=20) #ax.grid(False) H, xedges, yedges = np.histogram2d(normaldf.x, normaldf.y, bins=(50, 50), normed=True) extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]] plt.imshow(H, extent=extent, interpolation='nearest') plt.colorbar() from pandas.tools.plotting import scatter_matrix scatter_matrix(df[['linoleic','arachidic','eicosenoic']], alpha=0.3, figsize=(10, 10), diagonal='kde'); plt.figure(figsize=(24,5)) for key, group in df.groupby('region'): plt.subplot(int('13'+str(key))) group[acidlistminusoleic].boxplot(grid=False) ax=plt.gca() ax.set_title(rvals[key-1]) remove_border(ax, left=False, bottom=False) ax.grid(axis="y", color="gray", linestyle=':', lw=1) from pandas.tools.plotting import parallel_coordinates dfna=df[['region', 'palmitic', 'palmitoleic', 'stearic', 'oleic', 'linolenic', 'linoleic', 'arachidic', 'eicosenoic']] dfna_norm = (dfna - dfna.mean()) / (dfna.max() - dfna.min()) dfna_norm['region']=df['region'].map(lambda x: rmap[x]) parallel_coordinates(dfna_norm, 'region', colors=[dark2_cmap.mpl_colormap(col) for col in [1.0,0.5,0.0]], alpha=0.05) ax2=mdf2.plot(kind='barh', stacked=True, color=dark2_colors, grid=False, legend=False) remove_border(ax2, left=False, bottom=False) ax2.grid(axis="x", color="white", linestyle='-', lw=1) ax2.legend(loc='right', bbox_to_anchor=(1.3,0.5)) labels2=['South','Sardinia','North'] ax2.set_yticklabels(labels2) ax2.set_ylabel(''); ax2.set_xlim(right=100.0); #your code here def makenice_barplot(frame, by, bymap, columnlist, aggregatefunc=np.mean, largest=100.0): df2=df.groupby(by).aggregate(aggregatefunc) df2=df2[columnlist] ax2=df2.plot(kind='barh', stacked=True, color=dark2_colors, grid=False, legend=False) remove_border(ax2, left=False, bottom=False) ax2.grid(axis="x", color="white", linestyle='-', lw=1) ax2.legend(loc='right', bbox_to_anchor=(1.3,0.5)) labels=[bymap[i] for i in df2.index] print labels ax2.set_yticklabels(labels) ax2.set_ylabel(''); ax2.set_xlim(right=largest); makenice_barplot(df,'region', rmap, acidlistminusoleic, largest=30) fig=plt.figure(figsize=(10,10)) ax=scatter_by(df, 'linoleic', 'arachidic', by='area', figure=fig, labeler=imap, setupfunc=setup_mhist, mfunc=make_mhist, mms=20) ax.grid(False) ax.legend(loc='right', bbox_to_anchor=(1.7,0.5)); indices=np.tril_indices(8) plts=[] for i,j in zip(indices[0], indices[1]): if i!=j: plts.append((i,j)) print plts fig, axes = plt.subplots(nrows=14, ncols=2, figsize=(14,40)); k=0 af=axes.flatten() for a in af: i,j=plts[k] a=scatter_by(df, acidlist[i], acidlist[j], by='region', axes=a, labeler=rmap, mfunc=make_rug, mms=20); a.grid(False); k=k+1 af[0].legend(loc='best'); fig.tight_layout();