# Tutorial script: date handling, loading the AIS ship data, and combining
# tables with merge/concat.
#
# NOTE: bare expressions such as `now` or `segments.head()` look like notebook
# cell outputs; run as a plain script their values are computed and discarded.
from datetime import date, datetime, time

import numpy as np
import pandas as pd
from dateutil.parser import parse

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

# --- Dates and times with the standard library -----------------------------
now = datetime.now()
now
now.day
now.weekday()

time(3, 24)
date(1970, 9, 3)

# Subtracting two datetimes yields a timedelta.
my_age = now - datetime(1970, 9, 3)
my_age
my_age.days / 365.

# --- Ship transit segments --------------------------------------------------
segments = pd.read_csv("data/AIS/transit_segments.csv")
segments.head()

segments.seg_length.hist(bins=500)
segments.seg_length.apply(np.log).hist(bins=500)

segments.st_time.dtype

# `.ix` has been removed from pandas; on an integer index it was label-based,
# so `.loc[0]` is the direct replacement.
datetime.strptime(segments.st_time.loc[0], '%m/%d/%y %H:%M')
parse(segments.st_time.loc[0])

# Three ways of converting the whole column.
segments.st_time.apply(lambda d: datetime.strptime(d, '%m/%d/%y %H:%M'))
pd.to_datetime(segments.st_time)
pd.to_datetime([None])

# --- Vessel information -----------------------------------------------------
vessels = pd.read_csv("data/AIS/vessel_information.csv", index_col='mmsi')
vessels.head()

# Vessel types whose label does not contain a '/'.
[v for v in vessels.type.unique() if '/' not in v]
vessels.type.value_counts()

# --- Merging ----------------------------------------------------------------
df1 = pd.DataFrame(dict(id=range(4), age=np.random.randint(18, 31, size=4)))
# `range` objects cannot be added on Python 3, so build lists first.
df2 = pd.DataFrame(dict(id=list(range(3)) + list(range(3)),
                        score=np.random.random(size=6)))
df1, df2

pd.merge(df1, df2)
pd.merge(df1, df2, how='outer')

segments.head(1)
vessels.head(1)

# Join on the index of `vessels` and the `mmsi` column of `segments`.
segments_merged = pd.merge(vessels, segments, left_index=True, right_on='mmsi')
segments_merged.head()
vessels.merge(segments, left_index=True, right_on='mmsi').head()

# Add a `type` column to `segments`, so both frames now carry that name.
segments['type'] = 'foo'
pd.merge(vessels, segments, left_index=True, right_on='mmsi').head()

# --- Concatenation with NumPy -----------------------------------------------
np.concatenate([np.random.random(5), np.random.random(5)])
np.r_[np.random.random(5), np.random.random(5)]
np.c_[np.random.random(5), np.random.random(5)]

# --- Microbiome spreadsheets ------------------------------------------------
mb1 = pd.read_excel('data/microbiome/MID1.xls', 'Sheet 1',
                    index_col=0, header=None)
mb2 = pd.read_excel('data/microbiome/MID2.xls', 'Sheet 1',
                    index_col=0, header=None)
mb1.shape, mb2.shape
mb1.head()

mb1.columns = mb2.columns = ['Count']
mb1.index.name = mb2.index.name = 'Taxon'
mb1.head()

mb1.index[:3]
mb1.index.is_unique
# --- Concatenating the two microbiome tables --------------------------------
pd.concat([mb1, mb2], axis=0).shape
pd.concat([mb1, mb2], axis=0).index.is_unique

pd.concat([mb1, mb2], axis=1).shape
pd.concat([mb1, mb2], axis=1).head()
pd.concat([mb1, mb2], axis=1).values[:5]

pd.concat([mb1, mb2], axis=1, join='inner').head()

mb1.combine_first(mb2).head()

# `keys` adds an outer index level identifying the source table.
pd.concat([mb1, mb2], keys=['patient1', 'patient2']).head()
pd.concat([mb1, mb2], keys=['patient1', 'patient2']).index.is_unique

pd.concat(dict(patient1=mb1, patient2=mb2), axis=1).head()

# --- Reshaping: stack / unstack ---------------------------------------------
cdystonia = pd.read_csv("data/cdystonia.csv", index_col=None)
cdystonia.head()

stacked = cdystonia.stack()
stacked
stacked.unstack().head()

cdystonia2 = cdystonia.set_index(['patient', 'obs'])
cdystonia2.head()
cdystonia2.index.is_unique

twstrs_wide = cdystonia2['twstrs'].unstack('obs')
twstrs_wide.head()

# One row per patient, with the per-observation twstrs values joined on.
# Note that only the first rows (`.head()`) are kept in `cdystonia_long`.
patient_columns = ['patient', 'site', 'id', 'treat', 'age', 'sex']
cdystonia_long = (
    cdystonia[patient_columns]
    .drop_duplicates()
    .merge(twstrs_wide, right_index=True, left_on='patient', how='inner')
    .head()
)
cdystonia_long

(cdystonia
 .set_index(['patient', 'site', 'id', 'treat', 'age', 'sex', 'week'])['twstrs']
 .unstack('week')
 .head())

pd.melt(cdystonia_long,
        id_vars=['patient', 'site', 'id', 'treat', 'age', 'sex'],
        var_name='obs', value_name='twsters').head()

# --- Pivoting ---------------------------------------------------------------
cdystonia.pivot(index='patient', columns='obs', values='twstrs').head()
# `pivot` arguments must be passed by keyword on current pandas.
cdystonia.pivot(index='patient', columns='obs')

# `rows=`/`cols=` were the old spellings of `index=`/`columns=`.
cdystonia.pivot_table(index=['site', 'treat'], columns='week',
                      values='twstrs', aggfunc='max').head(20)

pd.crosstab(cdystonia.sex, cdystonia.site)

# --- Duplicates -------------------------------------------------------------
# `cols=` was the old spelling of `subset=`.
vessels.duplicated(subset='names')
vessels.drop_duplicates(['names'])

# --- Value replacement ------------------------------------------------------
cdystonia.treat.value_counts()

treatment_map = {'Placebo': 0, '5000U': 1, '10000U': 2}
cdystonia['treatment'] = cdystonia.treat.map(treatment_map)
cdystonia.treatment

vals = pd.Series([float(i)**10 for i in range(10)])
vals
np.log(vals)

# Replace the zero with a small positive value before taking logs again.
vals = vals.replace(0, 1e-6)
np.log(vals)

cdystonia2.treat.replace({'Placebo': 0, '5000U': 1, '10000U': 2})

# --- Indicator variables ----------------------------------------------------
# Compute the five most frequent vessel types once, instead of re-running
# value_counts() for every row inside an apply().
top5_types = vessels.type.value_counts().index[:5]
top5 = vessels.type.isin(top5_types)
vessels5 = vessels[top5]

pd.get_dummies(vessels5.type).head(10)
# --- Discretization ---------------------------------------------------------
cdystonia.age.describe()

pd.cut(cdystonia.age, [20, 30, 40, 50, 60, 70, 80, 90])[:30]
pd.cut(cdystonia.age, [20, 30, 40, 50, 60, 70, 80, 90], right=False)[:30]
pd.cut(cdystonia.age, [20, 40, 60, 80, 90],
       labels=['young', 'middle-aged', 'old', 'ancient'])[:30]

pd.qcut(cdystonia.age, 4)[:30]

quantiles = pd.qcut(segments.seg_length, [0, 0.01, 0.05, 0.95, 0.99, 1])
quantiles[:30]
pd.get_dummies(quantiles).head(10)

# --- Permutation ------------------------------------------------------------
new_order = np.random.permutation(len(segments))
new_order[:30]

segments.take(new_order).head()
segments.head()

# --- Group-wise operations --------------------------------------------------
cdystonia_grouped = cdystonia.groupby(cdystonia.patient)
cdystonia_grouped

for patient, group in cdystonia_grouped:
    print(patient)
    print(group)
    print()

# The original passed a bare `mean`, which is not defined in the visible
# file; the string name resolves to the groupby's own mean.
# `numeric_only=True` keeps string columns (e.g. `treat`) out of the average.
cdystonia_grouped.agg('mean', numeric_only=True).head()

cdystonia_grouped.mean(numeric_only=True).head()

cdystonia_grouped.mean(numeric_only=True).add_suffix('_mean').head()

# The median of the `twstrs` variable
cdystonia_grouped['twstrs'].quantile(0.5)

cdystonia.groupby(['week', 'site']).mean(numeric_only=True).head()


def normalize(x):
    """Return `x` centred on its mean and scaled by its standard deviation."""
    return (x - x.mean()) / x.std()


# NOTE(review): this applies `normalize` to every non-grouping column; string
# columns may need to be dropped first on recent pandas -- confirm.
cdystonia_grouped.transform(normalize).head()

cdystonia_grouped['twstrs'].mean().head()

# This gives the same result as a DataFrame
cdystonia_grouped[['twstrs']].mean().head()

chunks = dict(list(cdystonia_grouped))
chunks[4]

dict(list(cdystonia.groupby(cdystonia.dtypes, axis=1)))

cdystonia2.head(10)
cdystonia2.groupby(level='obs', axis=0)['twstrs'].mean()


def top(df, column, n=5):
    """Return the `n` rows of `df` with the largest values in `column`.

    Rows are sorted by `column` in descending order and the first `n` are
    kept.
    """
    # `sort_index(by=...)` is no longer available; `sort_values` replaces it.
    return df.sort_values(by=column, ascending=False)[:n]


top3segments = (
    segments_merged
    .groupby('mmsi')
    .apply(top, column='seg_length', n=3)[['names', 'seg_length']]
)
top3segments
top3segments.head(20)

# --- Grouping by a relabelled index -----------------------------------------
mb1.index[:3]

# Keep only the first three space-separated words of each index label.
class_index = mb1.index.map(lambda x: ' '.join(x.split(' ')[:3]))
mb_class = mb1.copy()
mb_class.index = class_index
mb_class.head()

mb_class.groupby(level=0).sum().head(10)

from IPython.core.display import HTML
HTML(filename='data/titanic.html')