"""Gallery of exploratory plots built from the ``ss06pid.csv`` data set.

The script walks through pandas and matplotlib plotting idioms: box plots,
bar plots, histograms, density plots, scatter plots (plain, coloured, sized,
overlaid, grouped), hexbin, QQ plots, "spaghetti" plots, heatmaps, maps, and
the way missing values interact with plots.

It started life as a pylab-mode notebook.  Every plotting call is now
namespaced (``plt.``/``np.``) and each section that draws on the "current
axes" opens its own figure, so the sections do not draw on top of each other
when the file is executed top to bottom as a plain script.
"""
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Original download location of the data:
#   https://dl.dropbox.com/u/7710864/data/csv_hid/ss06pid.csv
pData = pd.read_csv('../data/ss06pid.csv')
pData  # displays the frame when run as a notebook cell; no-op in a script

# pandas boxplot
plt.figure()
pData.boxplot(column='AGEP')

# pandas boxplot grouped by certain column
# setting width and axis names is rather tricky
# (the grouped variant creates its own figure)
pData.boxplot(column='AGEP', by='DDRS')

# pandas barplot
plt.figure()
pData['CIT'].value_counts().plot(kind='bar')

# pandas histogram plot
plt.figure()
pData['AGEP'].hist(bins=18)

# pandas histogram plot with more bins
plt.figure()
pData['AGEP'].hist(bins=100)
plt.title('Age')

# pandas density plot
plt.figure()
pData['AGEP'].plot(kind='kde', linewidth=3)

# pandas density plot, multiple distributions
# (both Series are drawn on the same axes on purpose)
plt.figure()
pData['AGEP'].plot(kind='kde', linewidth=3)
pData['AGEP'][pData['SEX'] == 1].plot(kind='kde', linewidth=3, style='orange')

# pandas 'scatter' plot (DataFrame.plot opens a new figure by itself)
pData.plot(x='JWMNP', y='WAGP', style='o')

# scatterplot -- size matters
pData.plot(x='JWMNP', y='WAGP', style='o', markersize=3)

# scatterplot using colours
# here I switch to generic matplotlib plotting to be more flexible on styles
plt.figure()
plt.scatter(pData['JWMNP'], pData['WAGP'], c=pData['SEX'], s=15, cmap='autumn')
plt.xlim(0, 200)
plt.ylim(0, 250000)
plt.xlabel('JWMNP')  # was mislabelled 'JWNMP'; the plotted column is JWMNP
plt.ylabel('WAGP')

# scatterplots using size -- hard to see
age = pData['AGEP'].astype(float)
percentMaxAge = age / age.max()
plt.figure()
plt.scatter(pData['JWMNP'], pData['WAGP'], s=percentMaxAge * 0.5)
plt.xlim(0, 200)
plt.ylim(0, 250000)
plt.xlabel('JWMNP')
plt.ylabel('WAGP')

# scatterplots -- overlaying lines/points
plt.figure()
plt.scatter(pData['JWMNP'], pData['WAGP'], s=15)
plt.xlim(0, 200)
plt.ylim(0, 250000)
plt.xlabel('JWMNP')
plt.ylabel('WAGP')
# vertical grey band at x == 100, one point per row of the data
plt.plot(np.repeat(100, pData.shape[0]), pData['WAGP'], 'grey', linewidth=5)
plt.plot(np.linspace(0, 200, num=100), np.linspace(0, 20e5, num=100),
         'ro', markersize=10)

# scatterplots -- numeric variables as factors
# qcut on a Series yields a categorical Series: the integer bin codes and the
# bin intervals live under the ``.cat`` accessor (formerly ``.labels`` and
# ``.levels``, which no longer exist).
ageGroups = pd.qcut(pData['AGEP'], 5)
pData['ageGroups'] = ageGroups.cat.codes
cols = ['b', 'r', 'g', 'm', 'y']
plt.figure()
for colour, (k, df) in zip(cols, pData.groupby('ageGroups')):
    plt.scatter(df['JWMNP'], df['WAGP'], c=colour,
                label=str(ageGroups.cat.categories[k]), alpha=.6)
plt.legend()
plt.xlim(-2, 200)
plt.ylim(0, 250000)
plt.xlabel('JWMNP')
plt.ylabel('WAGP')

# a lot of points
# NumPy requires an integer size (the float literal 1e5 is rejected)
n_points = 100000
x = np.random.normal(size=n_points)
y = np.random.normal(size=n_points)
plt.figure()
plt.plot(x, y, 'o')

# a lot of points -- sampling
# sample integer positions so they can be used directly as array indices
sampledValues = random.sample(range(n_points), 1000)
plt.figure()
plt.plot(x[sampledValues], y[sampledValues], 'o')

# a lot of points -- R's smoothScatter through the IPython R bridge.
# Line magics are not Python syntax, so they are issued through the IPython
# API and skipped entirely when the script is not running under IPython.
# NOTE(review): newer rpy2 releases appear to ship this extension as
# `rpy2.ipython` rather than `rmagic` -- confirm for your environment.
try:
    from IPython import get_ipython
except ImportError:
    get_ipython = None
shell = get_ipython() if get_ipython is not None else None
if shell is not None:
    shell.run_line_magic('load_ext', 'rmagic')
    shell.run_line_magic('Rpush', 'x y')
    shell.run_line_magic('R', 'smoothScatter(x, y)')

# a lot of points -- hexbin
plt.figure()
plt.hexbin(x, y)

# qq-plots is available in statsmodels
# (imported here rather than at the top so the sections above still run
# when this optional package is not installed)
from statsmodels.graphics.gofplots import qqplot
x = np.random.normal(size=20)
y = np.random.normal(size=20)
# note: it seems like it's only possible to plot against distributions in
# scipy.stats.distributions (by default: normal)
# (i.e. not against a distribution of another variable)
qqplot(x, line='45', fit=True)

# spaghetti plot
X = np.random.normal(size=(20, 5))
# there's no automatic cycle of markers
# but it's possible to do in matplotlib
# see: http://stackoverflow.com/questions/7358118/matplotlib-black-white-colormap-with-dashes-dots-etc
plt.figure()
plt.plot(X)

# 'heatmaps'
# `.ix[0:10, 161:237]` sliced rows by label (end inclusive) and columns by
# position; with the default integer index that is positions 0..10, hence
# the 0:11 row slice for `.iloc`.
plt.matshow(pData.iloc[0:11, 161:237], aspect='auto', cmap='hot')

# maps
# (imported here for the same reason as statsmodels above)
from mpl_toolkits.basemap import Basemap
# equivalent of pylab's figsize(9, 15): changes the default figure size
plt.rcParams['figure.figsize'] = (9, 15)
plt.figure()
m = Basemap()
m.drawcoastlines()
m.drawcountries()
lon = np.random.uniform(-180, 180, 40)
lat = np.random.uniform(-90, 90, 40)
m.plot(lon, lat, 'o')

# missing values and plots
x = np.array([np.nan, np.nan, np.nan, 4, 5, 6, 7, 8, 9, 10])
y = np.arange(1, 11)
plt.rcParams['figure.figsize'] = (7, 5)
plt.figure()
plt.plot(x, y, 'o')
plt.xlim(0, 11)
plt.ylim(0, 11)

# missing values and plots
x = np.random.normal(size=100)
y = np.random.normal(size=100)
y[x < 0] = np.nan
tt = pd.DataFrame({'x': x, 'isnan y': np.isnan(y)})
tt.boxplot(column='x', by='isnan y')

# render every figure when executed as a plain script
plt.show()