%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

!~/anaconda/bin/pip install brewer2mpl

import brewer2mpl
from matplotlib import rcParams

# ColorBrewer "Dark2" qualitative palette with 7 classes; reused by the plots below.
dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7)
dark2_colors = dark2_cmap.mpl_colors

# Notebook-wide matplotlib defaults. Each entry is assigned through
# rcParams[...] (not dict.update) and in the listed order.
_plot_defaults = [
    ('figure.figsize', (10, 6)),
    ('figure.dpi', 150),
    ('axes.color_cycle', dark2_colors),
    ('lines.linewidth', 2),
    ('axes.facecolor', 'white'),
    ('font.size', 14),
    ('patch.edgecolor', 'white'),
    ('patch.facecolor', dark2_colors[0]),
    ('font.family', 'StixGeneral'),
]
for _style_key, _style_value in _plot_defaults:
    rcParams[_style_key] = _style_value
# Do not leave the temporaries behind in the notebook namespace.
del _plot_defaults, _style_key, _style_value


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border (spine) is drawn. All ticks are switched off first and then
    re-enabled only on the sides whose border is visible.

    axes : the matplotlib axes to modify; the current axes (plt.gca()) is
           used when None.
    """
    ax = plt.gca() if axes is None else axes
    for side, visible in (('top', top), ('right', right),
                          ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(visible)

    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    #now re-enable visibles
    # tick_top()/tick_bottom() (and tick_left()/tick_right()) each move the
    # ticks to a single side, so calling both in sequence would keep only the
    # last one; ask for 'both' explicitly when both sides are visible.
    if top and bottom:
        ax.xaxis.set_ticks_position('both')
    elif top:
        ax.xaxis.tick_top()
    elif bottom:
        ax.xaxis.tick_bottom()

    if left and right:
        ax.yaxis.set_ticks_position('both')
    elif left:
        ax.yaxis.tick_left()
    elif right:
        ax.yaxis.tick_right()

# Let pandas print wide frames: up to 500 characters per line and 100 columns.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

from IPython.display import Image
# Display the image file Italy.png (expected next to the notebook).
Image(filename='Italy.png')

# Load the olive data set from data/olive.csv and peek at the first rows.
df=pd.read_csv("data/olive.csv")
df.head(5)

# Give the first column, whatever its name in the CSV, the name 'areastring'.
print df.columns
df.rename(columns={df.columns[0]:'areastring'}, inplace=True)
df.columns

# Distinct values of the 'region' and 'area' columns.
print 'regions\t', df.region.unique()
print 'areas\t', df.area.unique()

# Contingency table: how many rows fall in each (area, region) combination.
pd.crosstab(df.area, df.region)

df.head()

# Keep only the text after the last '.' in each areastring value.
df.areastring=df.areastring.map(lambda x: x.split('.')[-1])
df.head()

# Selecting with a list of column names returns a DataFrame ...
df[['palmitic','oleic']].head()

# ... while selecting with a single name returns a Series.
print df['palmitic']

# The two prints below show the resulting types side by side.
print "type of df[['palmitic']]:\t", type(df[['palmitic']]) 
print "type of df['palmitic']:\t\t", type(df['palmitic'])

# Attribute access is an alternative spelling of df['palmitic'].
df.palmitic

#your code here

df.areastring.unique()

# Names of the eight fatty-acid measurement columns used throughout.
acidlist=['palmitic', 'palmitoleic', 'stearic', 'oleic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic']

#your code here

# Divide every acid column by 100 (apply passes each column to the lambda).
dfsub=df[acidlist].apply(lambda x: x/100.0)
dfsub.head()

# Write the rescaled columns back into df; all later cells see these values.
df[acidlist]=dfsub
df.head()

# Single scatter plot of linolenic against palmitic with title and axis labels.
fig=plt.figure()
plt.scatter(df.palmitic, df.linolenic)
axis = fig.gca() #get current axis
axis.set_title('linolenic vs palmitic')
axis.set_xlabel('palmitic')
axis.set_ylabel('linolenic')
#the current figure itself can be fetched with plt.gcf()

# Histogram of the palmitic column (default binning).
plt.hist(df.palmitic)

# 2x2 grid showing the same data four ways: line plot, point markers,
# scatter, and a histogram of palmitic.
fig, axes=plt.subplots(figsize=(10,10), nrows=2, ncols=2)
axes[0][0].plot(df.palmitic, df.linolenic)
axes[0][1].plot(df.palmitic, df.linolenic, '.')
axes[1][0].scatter(df.palmitic, df.linolenic)
axes[1][1].hist(df.palmitic)
fig.tight_layout()

xacids=['oleic','linolenic','eicosenoic']
yacids=['stearic','arachidic']

#your code here
# Grid of scatter plots: one row per entry of xacids, one column per entry of
# yacids; panel (i, j) plots yacids[j] against xacids[i].
fig, axes=plt.subplots(figsize=(10,10), nrows=len(xacids), ncols=len(yacids))
for i, xacid in enumerate(xacids):
    for j, yacid in enumerate(yacids):
        axes[i][j].scatter(df[xacid],df[yacid])
        axes[i][j].set_xlabel(xacid)
        axes[i][j].set_ylabel(yacid)
fig.tight_layout()

# Group the rows of df by the value of the 'region' column; region_groupby is
# reused by several later cells.
region_groupby = df.groupby('region')
print type(region_groupby)
region_groupby.head()

# Iterating a groupby yields (key, sub-frame) pairs. After the loop, v is
# left bound to the sub-frame of the last group.
for key, value in region_groupby:
    print "( key, type(value) ) = (", key, ",", type(value), ")"
    v=value

v.head()

# Summary statistics per region.
dfrd=region_groupby.describe()
print type(dfrd)
dfrd.head(20)

# NOTE: vecs and keys are never filled. The loop only rebinds k and v, so the
# print after it shows the key and standard deviations of the last group only.
vecs=[]
keys=[]
for key, value in region_groupby:
    k=key
    v=value.std()
print k, type(v), v

# Per-region standard deviation of every column, computed in one call.
dfbystd=df.groupby('region').std()
dfbystd.head()

# Per-region mean of every column via aggregate.
dfbymean=region_groupby.aggregate(np.mean)
dfbymean.head()

region_groupby.aggregate(lambda x: x.palmitic.sum()) #probably not what u had in mind :-)

# apply() hands each group's sub-frame to the function: first the mean of
# every column, then the mean of the palmitic column alone.
region_groupby.apply(lambda f: f.mean())

region_groupby.apply(lambda f: f.palmitic.mean())

# Suffix the acid column names with _std / _mean so the two summary frames
# can be combined without column-name clashes.
renamedict_std={k:k+"_std" for k in acidlist}
renamedict_mean={k:k+"_mean" for k in acidlist}
dfbystd.rename(inplace=True, columns=renamedict_std)
dfbymean.rename(inplace=True, columns=renamedict_mean) 
dfbystd.head()

# Double brackets keep these as one-column DataFrames (not Series).
dfpalmiticmean = dfbymean[['palmitic_mean']] 
dfpalmiticstd = dfbystd[['palmitic_std']] 

# Join on the shared region index: one row per region with mean and std.
newdfbyregion=dfpalmiticmean.join(dfpalmiticstd)
newdfbyregion.head()

df.shape

# One random weight per row of df, drawn uniformly from [0, 1).
weights=np.random.uniform(size=df.shape[0])
smallerdf=df[['palmitic']]
# Take an explicit copy before adding a column: assigning into a frame that
# was sliced out of df makes pandas emit SettingWithCopyWarning, because it
# cannot tell whether the write is meant to reach df.
otherdf=df[['region']].copy()
otherdf['weight'] = weights
otherdf.head()

#your code here
# Both frames carry df's index, so join() lines the rows up one-to-one and
# yields the columns palmitic, region and weight.
smallerdf=smallerdf.join(otherdf)
smallerdf.head()

#your code here
def wfunc(f):
    """Return the mean of f's 'palmitic' column weighted by its 'weight' column."""
    weight_col = f['weight']
    weighted_total = (f['palmitic'] * weight_col).sum()
    return weighted_total / weight_col.sum()
# Run wfunc on each region's sub-frame: one weighted palmitic mean per region.
smallerdf.groupby('region').apply(wfunc)

#your code here

def myfunc(column):
    """Return the sum of the values in column, as computed by np.sum."""
    total = np.sum(column)
    return total
    
# Column-wise sum within each region, using the helper defined above.
region_groupby.aggregate(myfunc)

# Map the numeric region codes to display names: 1 -> South, 2 -> Sardinia,
# 3 -> North.
rkeys=[1,2,3]
rvals=['South','Sardinia','North']
rmap={e[0]:e[1] for e in zip(rkeys,rvals)}
rmap

# Mean of each acid column per region (one row per region).
mdf2=df.groupby('region').aggregate(np.mean)
mdf2=mdf2[acidlist]
mdf2.head()

# Horizontal stacked bars, one per region, labelled with the region names.
# The labels rely on the bars appearing in the same order as rvals.
ax=mdf2.plot(kind='barh', stacked=True)
ax.set_yticklabels(rvals)
ax.set_xlim([0,100])

# One panel per fatty acid, stacked vertically; each panel overlays a step
# histogram for every region.
fig, axes = plt.subplots(figsize=(10, 20), nrows=len(acidlist), ncols=1)
colors = [dark2_cmap.mpl_colormap(col) for col in [1.0, 0.5, 0.0]]
nbins = 30
for ax, acid in zip(axes.flatten(), acidlist):
    seriesacid = df[acid]  # the pandas Series for this acid
    minmax = [seriesacid.min(), seriesacid.max()]
    # Bin edges shared by all regions so the histograms are directly comparable.
    histbinslist = np.linspace(minmax[0], minmax[1], nbins)
    # Number of samples strictly above each edge; consecutive differences are
    # minus the pooled count between two edges, so negating the most negative
    # difference gives the largest pooled bin count, used as the y-axis ceiling.
    above = [seriesacid[seriesacid > edge].count() for edge in histbinslist]
    counts = -np.diff(above).min()
    for k, g in df.groupby('region'):
        ax.hist(g[acid], histtype='step', color=colors[k-1], alpha=1.0,
                bins=histbinslist, label=rmap[k])
        ax.set_xlim(minmax)
        ax.set_title(acid)
        ax.grid(False)
    # fix the y range and build the legend from the per-region labels
    ax.set_ylim([0, counts])
    ax.legend()
fig.tight_layout()


# Boolean Series: True for rows whose eicosenoic value is below 0.05.
mask=(df.eicosenoic < 0.05)
mask

# Summing a boolean mask counts the True rows; its mean is their fraction.
np.sum(mask), np.mean(mask)

# Rows below a stricter threshold (0.02, not the 0.05 used for mask above),
# cross-tabulated by area and region.
loweico=df[df.eicosenoic < 0.02]
pd.crosstab(loweico.area, loweico.region)

# Same columns as acidlist with 'oleic' left out.
acidlistminusoleic=['palmitic', 'palmitoleic', 'stearic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic']
#your code here

# Stacked horizontal bars of the per-region means, without oleic.
ax=region_groupby.aggregate(np.mean)[acidlistminusoleic].plot(kind="barh", stacked=True)
ax.set_yticklabels(rvals);
# just do the scatter plot without the marginals to split the north out
def make2d(df, scatterx, scattery, by="region", labeler=None):
    """
    Scatter-plot df[scattery] against df[scatterx] on a new 8x8 figure,
    drawing every group of df.groupby(by) in its own colour.

    df       : DataFrame holding the data
    scatterx : name of the column plotted on the x axis
    scattery : name of the column plotted on the y axis
    by       : grouping key handed to df.groupby
    labeler  : optional dict mapping a group key to its legend label; keys
               missing from it are labelled with the key itself

    Returns the matplotlib axes that was drawn on.
    """
    # None instead of a shared mutable {} default.
    if labeler is None:
        labeler = {}
    plt.figure(figsize=(8,8))
    ax = plt.gca()
    groups = df.groupby(by)
    # One evenly spaced position in [0, 1] on the Dark2 colormap per group;
    # positions are popped from the end, so the first group gets 1.0.
    cs = list(np.linspace(0, 1, len(groups)))
    xlimits = []
    ylimits = []
    for k, g in groups:
        c = dark2_cmap.mpl_colormap(cs.pop())
        ax.scatter(g[scatterx], g[scattery], c=c, label=labeler.get(k, k), s=40, alpha=0.4)
        # record the axis limits as they stand after adding this group
        xlimits.append(ax.get_xlim())
        ylimits.append(ax.get_ylim())
    # Pin the limits to the widest range seen while plotting.
    ax.set_xlim([min(lo for lo, hi in xlimits), max(hi for lo, hi in xlimits)])
    ax.set_ylim([min(lo for lo, hi in ylimits), max(hi for lo, hi in ylimits)])
    ax.set_xlabel(scatterx)
    ax.set_ylabel(scattery)
    ax.grid(False)
    return ax
# linoleic vs arachidic, coloured by region and labelled with region names.
a=make2d(df, "linoleic","arachidic", labeler=rmap)
a.legend(loc='upper right');

import pandas.tools.rplot as rplot
# Work on a copy so that replacing the region codes with their names does not
# alter df.
dfcopy=df.copy()
dfcopy['region']=dfcopy['region'].map(rmap)
# Pair each area code with an area name by position in the two unique()
# results; assumes both list their values in matching order - TODO confirm.
imap={e[0]:e[1] for e in zip (df.area.unique(), df.areastring.unique())}
#dfcopy['area']=dfcopy['area'].map(imap)
# Trellis plot: one panel per region, points coloured by area.
plot = rplot.RPlot(dfcopy, x='linoleic', y='oleic');
plot.add(rplot.TrellisGrid(['region', '.']))
plot.add(rplot.GeomPoint(size=40.0, alpha=0.3, colour=rplot.ScaleRandomColour('area')));

fig=plot.render()
print df.areastring.unique()


#your code here

# Same trellis layout for palmitic vs palmitoleic.
plot = rplot.RPlot(dfcopy, x='palmitic', y='palmitoleic');
plot.add(rplot.TrellisGrid(['region', '.']))
plot.add(rplot.GeomPoint(size=40.0, alpha=0.3, colour=rplot.ScaleRandomColour('area')));
fig=plot.render()
print dfcopy.areastring.unique()

#adapted from https://github.com/roban/quarum/blob/master/margplot.py
from mpl_toolkits.axes_grid1 import make_axes_locatable
def setup_mhist(axes, figure):
    """
    Attach two marginal axes to `axes`: one above it sharing its x axis and
    one to its right sharing its y axis, each 1.5 wide/high with no padding.

    Both new axes get a white grid along their count direction only and have
    their borders trimmed with remove_border. The figure's left/right subplot
    margins are set to 0.15 and 0.95.

    Returns [axes, top axes, right axes].
    """
    divider = make_axes_locatable(axes)
    top_hist = divider.append_axes("top", 1.5, pad=0.0, sharex=axes)
    right_hist = divider.append_axes("right", 1.5, pad=0.0, sharey=axes)

    # top marginal: grid lines across the y direction, border on the right
    top_hist.grid(False)
    top_hist.grid(axis="y", color="white", linestyle='-', lw=1)
    remove_border(top_hist, right=True, left=False)

    # right marginal: grid lines across the x direction, border on top and left
    right_hist.grid(False)
    right_hist.grid(axis="x", color="white", linestyle='-', lw=1)
    remove_border(right_hist, right=False, left=True, bottom=False, top=True)

    figure.subplots_adjust(left=0.15, right=0.95)
    return [axes, top_hist, right_hist]

#BUG: need to get appropriate min and max amongst the multiple marginal hists
#BUG: need to get highest frequency marked as label when we do this.
def _mhist_edges(values):
    """Return evenly spaced bin edges spanning values: one edge per 20 samples (rounded up), never fewer than 2."""
    # np.ceil returns a float, but np.linspace needs an integer count; a
    # single edge would not describe any bin, hence the floor of 2.
    nedges = max(int(np.ceil(len(values) / 20.)), 2)
    return np.linspace(np.min(values), np.max(values), nedges)


def make_mhist(axeslist, x, y, color='b', mms=8):
    """
    Draw the marginal histograms of x and y.

    axeslist : [main axes, top axes, right axes]; only the last two are drawn
               on here. x is histogrammed on the top axes, y (horizontally)
               on the right axes.
    x, y     : sequences of values
    color    : fill colour of both histograms
    mms      : not used by this function; scatter_by passes it to every mfunc

    All tick labels of the two marginal axes are hidden.
    """
    ax2 = axeslist[1]
    ax3 = axeslist[2]
    for tl in (ax2.get_xticklabels() + ax2.get_yticklabels() +
               ax3.get_xticklabels() + ax3.get_yticklabels()):
        tl.set_visible(False)
    style = {'histtype':'stepfilled', 'color':color, 'alpha':0.4}

    ax2.hist(x, _mhist_edges(x), **style)
    ax3.hist(y, _mhist_edges(y), orientation='horizontal', **style)

import random
import copy
def scatter_by(df, scatterx, scattery, by=None, figure=None, axes=None, colorscale=dark2_cmap, labeler=None, mfunc=None, setupfunc=None, mms=8):
    """
    Scatter-plot df[scattery] against df[scatterx], optionally split into
    groups and decorated with marginal plots.

    df         : DataFrame holding the data
    scatterx   : name of the column plotted on the x axis
    scattery   : name of the column plotted on the y axis
    by         : grouping key for df.groupby; when falsy, all rows are drawn
                 as one series in a colour picked at random from
                 colorscale.mpl_colors
    figure     : figure to draw on; a new 8x8 figure is created when None
    axes       : axes to draw on; figure.gca() when None
    colorscale : object exposing mpl_colors and mpl_colormap (the Dark2
                 brewer2mpl map by default)
    labeler    : optional dict mapping a group key to its legend label; keys
                 missing from it are labelled with the key itself
    mfunc      : optional callable(axeslist, x, y, color=..., mms=...) invoked
                 once per plotted series to add marginals (make_rug,
                 make_mhist)
    setupfunc  : optional callable(axes, figure) returning the axes list that
                 is handed to mfunc (setup_mhist); [axes] is used when absent
    mms        : forwarded unchanged to mfunc

    Returns the main axes.
    """
    # None instead of a shared mutable {} default.
    if labeler is None:
        labeler = {}
    if figure is None:
        figure = plt.figure(figsize=(8,8))
    if axes is None:
        axes = figure.gca()

    # (x, y, colour) of every series drawn, in drawing order, for mfunc.
    series = []
    if not by:
        x = df[scatterx]
        y = df[scattery]
        col = random.choice(colorscale.mpl_colors)
        # colorscale is not passed as cmap=: it is a brewer2mpl map rather
        # than a matplotlib colormap, and c is already an explicit colour.
        axes.scatter(x, y, c=col)
        series.append((x, y, col))
    else:
        groups = df.groupby(by)
        # One evenly spaced colormap position per group, popped from the end
        # so the first group gets 1.0.
        cs = list(np.linspace(0, 1, len(groups)))
        xlimits = []
        ylimits = []
        for k, g in groups:
            c = colorscale.mpl_colormap(cs.pop())
            x = g[scatterx]
            y = g[scattery]
            axes.scatter(x, y, c=c, label=labeler.get(k, k), s=40, alpha=0.3)
            # record the axis limits as they stand after adding this group
            xlimits.append(axes.get_xlim())
            ylimits.append(axes.get_ylim())
            series.append((x, y, c))
        # Pin the limits to the widest range seen while plotting, before any
        # marginal axes are attached.
        axes.set_xlim([min(lo for lo, hi in xlimits), max(hi for lo, hi in xlimits)])
        axes.set_ylim([min(lo for lo, hi in ylimits), max(hi for lo, hi in ylimits)])

    axeslist = setupfunc(axes, figure) if setupfunc else [axes]
    if mfunc:
        for x, y, c in series:
            mfunc(axeslist, x, y, color=c, mms=mms)
    axes.set_xlabel(scatterx)
    axes.set_ylabel(scattery)

    return axes

def make_rug(axeslist, x, y, color='b', mms=8):
    """
    Add rug marks for x and y to the first axes in axeslist.

    The x values are drawn as '|' markers along the current upper y limit and
    the y values as '_' markers along the current upper x limit, both in
    `color` with marker size `mms`. The axis limits read before plotting are
    put back afterwards.

    Returns the axes that was drawn on.
    """
    axes = axeslist[0]
    xlims = axes.get_xlim()
    ylims = axes.get_ylim()
    npts = len(x)
    # constant coordinate arrays: top edge for the x rug, right edge for the y rug
    along_top = np.ones(npts) * ylims[1]
    along_right = np.ones(npts) * xlims[1]
    axes.plot(x, along_top, marker='|', color=color, ms=mms)
    axes.plot(along_right, y, marker='_', color=color, ms=mms)
    axes.set_xlim(xlims)
    axes.set_ylim(ylims)
    return axes
    
#BUG: remove ticks and maybe even border on top and right

# linoleic vs eicosenoic by region, with rug marks on the top and right edges.
ax=scatter_by(df, 'linoleic', 'eicosenoic', by='region', labeler=rmap, mfunc=make_rug, mms=20)
ax.grid(False)
ax.legend(loc='upper right');

# linoleic vs arachidic by region, with marginal histograms instead of rugs.
ax=scatter_by(df, 'linoleic', 'arachidic', by='region', labeler=rmap, setupfunc=setup_mhist, mfunc=make_mhist, mms=20)
ax.grid(False)
ax.legend(loc='upper right');

# linoleic vs eicosenoic by region, with marginal histograms.
ax=scatter_by(df, 'linoleic', 'eicosenoic', by='region', labeler=rmap, setupfunc=setup_mhist, mfunc=make_mhist, mms=20)
ax.grid(False)
ax.legend(loc='upper right');

import scipy.stats as stats
# Compare 10000 draws from numpy's normal generator with 10000 draws from a
# scipy.stats standard normal, then overlay that distribution's pdf and cdf.
mu=0.
sigma=1.
samples=np.random.normal(mu, sigma, 10000)
plt.hist(samples,bins=25, normed=True)
nd=stats.norm()
plt.hist(nd.rvs(size=10000), bins=25, alpha=0.5,normed=True)
x=np.linspace(-4.0,4.0,100)
plt.plot(x,nd.pdf(x))
plt.plot(x,nd.cdf(x))

# Bivariate normal sample: m*m points, flattened to an (m*m, 2) array whose
# columns become the x and y columns of normaldf.
mean = [0,0]
cov = [[1,0],[0,5]] # diagonal covariance: x and y are uncorrelated, with variances 1 and 5
m=300
nrvs = np.random.multivariate_normal(mean,cov,(m,m))
duets=nrvs.reshape(m*m,2)
print duets[:,1]
normaldf=pd.DataFrame(dict(x=duets[:,0], y=duets[:,1]))
normaldf.head()
# Ungrouped scatter (no `by`) with marginal histograms on a fresh 8x10 figure.
ax=scatter_by(normaldf, 'x', 'y',  figure=plt.figure(figsize=(8,10)),setupfunc=setup_mhist, mfunc=make_mhist, mms=20)
#ax.grid(False)

# 2D histogram of the sample on a 50x50 grid, shown as an image.
H, xedges, yedges = np.histogram2d(normaldf.x, normaldf.y, bins=(50, 50), normed=True)
# extent is [left, right, bottom, top]; with imshow's default upper origin
# the first image row is drawn at the top, so yedges[0] goes there.
extent = [xedges[0], xedges[-1], yedges[-1], yedges[0]]
# histogram2d puts x along the FIRST axis of H, whereas imshow draws the first
# axis as rows (vertical). Transpose so x runs horizontally and agrees with
# the extent labels.
plt.imshow(H.T, extent=extent, interpolation='nearest')
plt.colorbar()

from pandas.tools.plotting import scatter_matrix
# Pairwise scatter plots of three acids, with kernel density estimates on the
# diagonal.
scatter_matrix(df[['linoleic','arachidic','eicosenoic']], alpha=0.3, figsize=(10, 10), diagonal='kde');

# One boxplot panel per region, side by side in a 1x3 grid. The subplot code
# is built as the integer 131/132/133 from the region key, and the title is
# looked up in rvals, so this relies on the region keys being 1, 2 and 3.
plt.figure(figsize=(24,5))
for key, group in df.groupby('region'):
    plt.subplot(int('13'+str(key)))
    group[acidlistminusoleic].boxplot(grid=False)
    ax=plt.gca()
    ax.set_title(rvals[key-1])
    remove_border(ax, left=False, bottom=False)
    ax.grid(axis="y", color="gray", linestyle=':', lw=1)


from pandas.tools.plotting import parallel_coordinates
# Region plus the eight acid columns.
dfna=df[['region', 'palmitic', 'palmitoleic', 'stearic', 'oleic', 'linolenic', 'linoleic', 'arachidic', 'eicosenoic']]
# Centre every column on its mean and scale by its range (max - min). The
# region column is scaled too, but is overwritten on the next line.
dfna_norm = (dfna - dfna.mean()) / (dfna.max() - dfna.min())
# Replace the scaled region column with the region names used as class labels.
dfna_norm['region']=df['region'].map(lambda x: rmap[x])
parallel_coordinates(dfna_norm, 'region', colors=[dark2_cmap.mpl_colormap(col) for col in [1.0,0.5,0.0]], alpha=0.05)

# Restyled version of the stacked per-region bar chart: Dark2 colours, no
# default grid or legend, borders removed, white vertical grid lines, and the
# legend anchored to the right of the axes.
ax2=mdf2.plot(kind='barh', stacked=True, color=dark2_colors, grid=False, legend=False)
remove_border(ax2, left=False, bottom=False)
ax2.grid(axis="x", color="white", linestyle='-', lw=1)
ax2.legend(loc='right', bbox_to_anchor=(1.3,0.5))
# Region names in bar order; assumes the bars follow region keys 1, 2, 3.
labels2=['South','Sardinia','North']
ax2.set_yticklabels(labels2)
ax2.set_ylabel('');
ax2.set_xlim(right=100.0);

#your code here
def makenice_barplot(frame, by, bymap, columnlist, aggregatefunc=np.mean, largest=100.0):
    """
    Draw a styled, stacked horizontal bar chart of aggregated columns.

    frame         : DataFrame to aggregate
    by            : grouping key handed to frame.groupby
    bymap         : dict mapping each group key to the label shown on the y axis
    columnlist    : columns of the aggregated frame to stack in each bar
    aggregatefunc : function handed to aggregate() (np.mean by default)
    largest       : right-hand limit of the x axis

    The y tick labels are printed as well as drawn. Returns None.
    """
    # Aggregate the frame that was passed in, not the notebook-level df.
    df2 = frame.groupby(by).aggregate(aggregatefunc)
    df2 = df2[columnlist]
    ax2 = df2.plot(kind='barh', stacked=True, color=dark2_colors, grid=False, legend=False)
    remove_border(ax2, left=False, bottom=False)
    ax2.grid(axis="x", color="white", linestyle='-', lw=1)
    ax2.legend(loc='right', bbox_to_anchor=(1.3,0.5))
    # One label per bar, looked up from the group keys in the aggregated index.
    labels = [bymap[i] for i in df2.index]
    print(labels)
    ax2.set_yticklabels(labels)
    ax2.set_ylabel('')
    ax2.set_xlim(right=largest)
# Per-region means without oleic, with the x axis cut off at 30.
makenice_barplot(df,'region', rmap, acidlistminusoleic, largest=30)

# linoleic vs arachidic split by area, labelled through imap, with marginal
# histograms; the legend is placed outside the axes on the right.
fig=plt.figure(figsize=(10,10))
ax=scatter_by(df, 'linoleic', 'arachidic', by='area', figure=fig, labeler=imap, setupfunc=setup_mhist, mfunc=make_mhist, mms=20)
ax.grid(False)
ax.legend(loc='right', bbox_to_anchor=(1.7,0.5));

# Index pairs (i, j) of the lower triangle of an 8x8 grid, diagonal excluded:
# every unordered pair of the eight acids exactly once (28 pairs).
indices=np.tril_indices(8)
plts=[]
for i,j in zip(indices[0], indices[1]):
    if i!=j:
        plts.append((i,j))
print plts

# 14x2 grid = 28 panels, one per pair; each panel is a by-region scatter with
# rug marks. k walks through plts in step with the flattened axes.
fig, axes = plt.subplots(nrows=14, ncols=2, figsize=(14,40));
k=0
af=axes.flatten()
for a in af:
    i,j=plts[k]
    a=scatter_by(df, acidlist[i], acidlist[j], by='region', axes=a, labeler=rmap, mfunc=make_rug, mms=20);
    a.grid(False);
    k=k+1
# A single legend, on the first panel only.
af[0].legend(loc='best');
fig.tight_layout();