import pandas as pd
import numpy as np

# Load the survey extract from the local data directory.
# Alternative remote source (left disabled):
#   https://dl.dropbox.com/u/7710864/data/csv_hid/ss06pid.csv
pData = pd.read_csv('../data/ss06pid.csv')
pData

# Column shared by most of the single-variable plots below.
ageCol = pData['AGEP']

# pandas boxplot of a single column
pData.boxplot('AGEP');

# pandas boxplot, one box per value of the grouping column
# (setting width and axis names is rather tricky)
pData.boxplot('AGEP', by='DDRS');

# pandas barplot: one bar per distinct CIT value, bar height = its frequency
citCounts = pData['CIT'].value_counts()
citCounts.plot(kind='bar');

# pandas histogram plot
ageCol.hist(bins=18);

# pandas histogram plot with more bins, plus a title on the current axes
ageCol.hist(bins=100)
plt.title('Age');

# pandas density plot
kdeArgs = dict(kind='kde', linewidth=3)
ageCol.plot(**kdeArgs);

# pandas density plot, multiple distributions:
# the whole column first, then only the rows where SEX == 1
ageCol.plot(**kdeArgs);
pData.loc[pData['SEX'] == 1, 'AGEP'].plot(style='orange', **kdeArgs);

# pandas 'scatter' plot -- style='o' draws markers instead of a connected line
scatterArgs = dict(x='JWMNP', y='WAGP', style='o')
pData.plot(**scatterArgs);

# scatterplot -- size matters
pData.plot(markersize=3, **scatterArgs);

# scatterplot using colours
# here I switch to generic matplotlib plotting to be more flexible on styles
# (each point is coloured by its SEX value through the 'autumn' colormap)
scatter(pData['JWMNP'], pData['WAGP'], c=pData['SEX'], s=15, cmap='autumn');
xlim(0, 200)
ylim(0, 250000)
xlabel('JWMNP')  # axis label spells the plotted column name exactly
ylabel('WAGP');

# scatterplots using size -- hard to see
# Each AGEP value expressed as a fraction of the largest AGEP in the data.
percentMaxAge = pData['AGEP'].astype(float) / pData['AGEP'].astype(float).max()

# Marker area is at most 0.5, which is why the points are barely visible.
scatter(pData['JWMNP'], pData['WAGP'], s=percentMaxAge*0.5);
xlim(0, 200)
ylim(0, 250000)
xlabel('JWMNP')
ylabel('WAGP');

# scatterplots -- overlaying lines/points
scatter(pData['JWMNP'], pData['WAGP'], s=15)
xlim(0, 200)
ylim(0, 250000)
xlabel('JWMNP')
ylabel('WAGP')

# Thick grey vertical line at x = 100: a constant x paired with every WAGP value.
plot(np.repeat(100, pData.shape[0]), pData['WAGP'], 'grey', linewidth=5)

# 100 large red dots evenly spaced from (0, 0) to (200, 20e5); most of them lie
# above the y-limit set above and are therefore clipped from view.
plot(np.linspace(0, 200, num=100), np.linspace(0, 20e5, num=100), 'ro', markersize=10);

# scatterplots -- numeric variables as factors
# Bin AGEP into five quantile-based groups and store each row's integer bin
# code on the frame so it can serve as a groupby key.
ageGroups = pd.qcut(pData['AGEP'], 5)
# The `.cat` accessor replaces the removed Categorical `.labels` / `.levels`.
pData['ageGroups'] = ageGroups.cat.codes
ageCategories = ageGroups.cat.categories

cols = ['b', 'r', 'g', 'm', 'y']

# Code -1 marks rows whose AGEP is missing; they belong to no bin, so skip them.
binned = pData[pData['ageGroups'] >= 0]
for k, df in binned.groupby('ageGroups'):
    # k is the bin code, so it indexes both the colour list and the bin labels.
    scatter(df['JWMNP'], df['WAGP'], c=cols[k], label=str(ageCategories[k]), alpha=.6)

legend()
xlim(-2, 200)
ylim(0, 250000)
xlabel('JWMNP')
ylabel('WAGP');

# a lot of points -- plotting all of them produces an unreadable blob
# `size` must be an integer; a float such as 1e5 is rejected by current NumPy.
nPoints = 100000
x = np.random.normal(size=nPoints)
y = np.random.normal(size=nPoints)
plot(x, y, 'o');

# a lot of points -- sampling
import random

# Draw 1000 distinct integer positions. random.sample needs a real sequence
# (an ndarray is not one), and array indexing needs ints rather than floats.
sampledValues = random.sample(range(nPoints), 1000)
plot(x[sampledValues], y[sampledValues], 'o');

# a lot of points -- smoothed scatterplot drawn by R through IPython magics
# NOTE(review): `rmagic` is the R extension that used to ship with IPython;
# later releases moved it into rpy2 (`%load_ext rpy2.ipython`) -- confirm
# against the IPython / rpy2 versions this notebook is run with.
%load_ext rmagic

# copy the Python variables x and y into the R session
%Rpush x y

# run R's smoothScatter on the pushed variables
%R smoothScatter(x, y)

# a lot of points -- hexbin
# (reuses the large x / y samples already in scope)
hexbin(x, y);

# qq-plots: statsmodels provides one
from statsmodels.graphics.gofplots import qqplot

# two fresh, small normal samples
sampleSize = 20
x = np.random.normal(size=sampleSize)
y = np.random.normal(size=sampleSize)

# note: it seems like it's only possible to plot against distributions in
# scipy.stats.distributions (by default: normal),
# i.e. not against the distribution of another variable
qqplot(x, fit=True, line='45');

# spaghetti plot
# 20 rows by 5 columns of normal draws; plot() draws each column as its own line
X = np.random.normal(size=(20, 5))

# there's no automatic cycle of markers, but it's possible to do in matplotlib
# see: http://stackoverflow.com/questions/7358118/matplotlib-black-white-colormap-with-dashes-dots-etc
plot(X);

# 'heatmaps'
# `.ix` has been removed from pandas; `.iloc` is the purely positional equivalent.
# Rows 0:11 keep the 11 rows that the old label-inclusive `.ix[0:10]` selected
# on the default integer index; columns 161:237 were already positional.
matshow(pData.iloc[0:11, 161:237], aspect='auto', cmap='hot');

# maps
from mpl_toolkits.basemap import Basemap

# taller figure for the map
figsize(9, 15)

# Basemap built with its default settings, with coastlines and country borders
m = Basemap()
m.drawcoastlines();
m.drawcountries();

# random positions spread over the full longitude / latitude ranges
nMarkers = 40
lon = np.random.uniform(low=-180, high=180, size=nMarkers)
lat = np.random.uniform(low=-90, high=90, size=nMarkers)

# draw the positions as dots on top of the map
m.plot(lon, lat, 'o');

# missing values and plots
# Use np.nan explicitly: the bare `NaN` name only exists through pylab's numpy
# star-import, and NumPy 2.0 no longer provides that alias.
x = np.array([np.nan, np.nan, np.nan, 4, 5, 6, 7, 8, 9, 10])
y = np.arange(1, 11)

figsize(7, 5)
# the three points whose x is NaN are silently left out of the plot
plot(x, y, 'o');
xlim(0, 11); ylim(0, 11);

# missing values and plots
x = np.random.normal(size=100)
y = np.random.normal(size=100)

# make y missing wherever x is negative, so missingness depends on x
y[x < 0] = np.nan

# Build the frame from named columns rather than from a zip iterator, which
# older pandas refuses on Python 3; `columns` pins the column order.
tt = pd.DataFrame({'x': x, 'isnan y': np.isnan(y)}, columns=['x', 'isnan y'])

# distribution of x split by whether y is missing
tt.boxplot(column='x', by='isnan y');