from IPython.core.display import HTML
import numpy as np
import pandas as pd

HTML("")
# Set some pandas display options.
# Fully qualified option names are used: the bare 'html' key is not a
# registered option in current pandas, and short patterns such as
# 'max_rows' may match more than one option and be rejected as ambiguous.
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 20)
# --- Series basics ---
# Bare expressions below are notebook-style "display this value" cells.

# A Series built from a plain list of counts (no explicit index given).
counts = pd.Series([632, 1638, 569, 115])
counts
counts.values
counts.index

# The same counts, this time labelled by phylum name.
bacteria = pd.Series(
    [632, 1638, 569, 115],
    index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'],
)
bacteria
bacteria['Actinobacteria']

# Boolean-mask selection: keep the entries whose label ends in 'bacteria'.
bacteria[[name.endswith('bacteria') for name in bacteria.index]]
[name.endswith('bacteria') for name in bacteria.index]

# Positional access goes through .iloc; plain [0] on a string-labelled
# Series relies on a deprecated integer-position fallback.
bacteria.iloc[0]

# Attach names to the values and to the index.
bacteria.name = 'counts'
bacteria.index.name = 'phylum'
bacteria

# NumPy functions and boolean filters operate on a Series directly.
np.log(bacteria)
bacteria[bacteria > 1000]

# Build a Series from a dict; keys become the index labels.
bacteria_dict = {
    'Firmicutes': 632,
    'Proteobacteria': 1638,
    'Actinobacteria': 569,
    'Bacteroidetes': 115,
}
pd.Series(bacteria_dict)

# Passing an explicit index selects/reorders dict entries; 'Cyanobacteria'
# is not a key of bacteria_dict, so that entry is missing.
bacteria2 = pd.Series(
    bacteria_dict,
    index=['Cyanobacteria', 'Firmicutes', 'Proteobacteria', 'Actinobacteria'],
)
bacteria2
bacteria2.isnull()

# Arithmetic between two Series aligns on index labels.
bacteria + bacteria2
# --- DataFrame basics ---
# Build a DataFrame from a dict of equal-length lists (one list per column).
data = pd.DataFrame({
    'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
    'patient': [1, 1, 1, 1, 2, 2, 2, 2],
    'phylum': ['Firmicutes', 'Proteobacteria', 'Actinobacteria',
               'Bacteroidetes', 'Firmicutes', 'Proteobacteria',
               'Actinobacteria', 'Bacteroidetes'],
})
data

# Selecting with a list of column names returns the columns in that order.
data[['phylum', 'value', 'patient']]
data.columns

# A single column can be read by key or by attribute.
data['value']
data.value

# Compare the type returned by a single name with that of a one-item list.
type(data.value)
type(data[['value']])

# Row access by index label (.ix has been removed from pandas; use .loc).
data.loc[3]
# --- DataFrame from a dict of dicts; views, copies and column assignment ---
# Outer keys become columns and inner keys become the index, so the frame
# is transposed afterwards to get one row per observation.
data = pd.DataFrame({
    0: {'patient': 1, 'phylum': 'Firmicutes', 'value': 632},
    1: {'patient': 1, 'phylum': 'Proteobacteria', 'value': 1638},
    2: {'patient': 1, 'phylum': 'Actinobacteria', 'value': 569},
    3: {'patient': 1, 'phylum': 'Bacteroidetes', 'value': 115},
    4: {'patient': 2, 'phylum': 'Firmicutes', 'value': 433},
    5: {'patient': 2, 'phylum': 'Proteobacteria', 'value': 1130},
    6: {'patient': 2, 'phylum': 'Actinobacteria', 'value': 754},
    7: {'patient': 2, 'phylum': 'Bacteroidetes', 'value': 555},
})
data
data = data.T
data

# Writing through a column pulled out of the frame.
# NOTE(review): whether this write is visible in `data` depends on the
# pandas version / copy-on-write mode — confirm what the demo should show.
vals = data.value
vals
vals[5] = 0
vals
data

# An explicit copy is independent of the frame.
vals = data.value.copy()
vals[5] = 1000
data

# Assign a single cell with .loc rather than the chained data.value[3] = 14,
# which writes to an intermediate object.
data.loc[3, 'value'] = 14
data

# Assigning a scalar to a new key creates a column filled with that value.
data['year'] = 2013
data

# Attribute assignment does NOT create a column; it only sets a Python
# attribute on the object (kept here as a demonstration).
data.treatment = 1
data
data.treatment

# Assigning a Series aligns on the index; this Series has 6 entries while
# `data` has 8 rows.
treatment = pd.Series([0] * 4 + [1] * 2)
treatment
data['treatment'] = treatment
data

# Assigning a plain list of the wrong length is an error. The failure is
# caught and reported so that the rest of the script still runs.
month = ['Jan', 'Feb', 'Mar', 'Apr']
try:
    data['month'] = month
except ValueError as err:
    print('Cannot assign month:', err)

# A list whose length matches the frame works.
data['month'] = ['Jan'] * len(data)
data

# Columns are removed with del.
del data['month']
data

# .values exposes the underlying data as a NumPy array.
data.values
df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [0.4, -1.0, 4.5]})
df.values
# --- Index objects ---
data.index

# Individual index entries cannot be assigned to. The TypeError is caught
# and reported so that the rest of the script still runs.
try:
    data.index[0] = 15
except TypeError as err:
    print('Cannot modify index in place:', err)

# Replacing the whole index at once is allowed.
bacteria2.index = bacteria.index
bacteria2
# --- Importing delimited text ---
# Show the raw file contents (pure-Python replacement for the IPython
# shell escape `!cat`, which is not valid Python syntax).
with open("data/microbiome.csv") as csv_file:
    print(csv_file.read())

mb = pd.read_csv("data/microbiome.csv")
mb

# header=None treats the first line as data rather than column names.
pd.read_csv("data/microbiome.csv", header=None).head()

# read_table with an explicit separator.
mb = pd.read_table("data/microbiome.csv", sep=',')

# Use two columns together as a hierarchical row index.
mb = pd.read_csv("data/microbiome.csv", index_col=['Taxon', 'Patient'])
mb.head()

# Skip selected rows, or read only the first few.
pd.read_csv("data/microbiome.csv", skiprows=[3, 4, 6]).head()
pd.read_csv("data/microbiome.csv", nrows=4)

# Read the file in 15-row chunks and map the first Taxon of each chunk to
# that chunk's mean Tissue value. .iloc[0] takes the first row of the chunk
# by position; a label lookup [0] only works while the chunk's index
# happens to contain 0.
data_chunks = pd.read_csv("data/microbiome.csv", chunksize=15)
mean_tissue = {
    chunk.Taxon.iloc[0]: chunk.Tissue.mean() for chunk in data_chunks
}
mean_tissue

# --- Missing-value markers on import ---
with open("data/microbiome_missing.csv") as csv_file:
    print(csv_file.read())

pd.read_csv("data/microbiome_missing.csv").head(20)
pd.isnull(pd.read_csv("data/microbiome_missing.csv")).head(20)

# Treat '?' and -99999 as missing values as well.
pd.read_csv("data/microbiome_missing.csv", na_values=['?', -99999]).head(20)
# --- Importing Excel spreadsheets ---
# Open the workbook once, then parse a sheet from it.
mb_file = pd.ExcelFile('data/microbiome/MID1.xls')
mb_file
mb1 = mb_file.parse("Sheet 1", header=None)
mb1.columns = ["Taxon", "Count"]
mb1.head()

# One-step alternative. The keyword is `sheet_name`; the old `sheetname`
# spelling has been removed from pandas.
mb2 = pd.read_excel('data/microbiome/MID2.xls', sheet_name='Sheet 1', header=None)
mb2.head()
# --- Building and manipulating indices ---
baseball = pd.read_csv("data/baseball.csv", index_col='id')
baseball.head()

# First attempt at a key: player + year.
player_id = baseball.player + baseball.year.astype(str)
baseball_newind = baseball.copy()
baseball_newind.index = player_id
baseball_newind.head()

# Check whether that key is unique, and look at how often each label occurs.
baseball_newind.index.is_unique
pd.Series(baseball_newind.index).value_counts()

# Label-based row lookup (.ix has been removed from pandas; use .loc).
baseball_newind.loc['wickmbo012007']

# Second attempt: player + team + year.
player_unique = baseball.player + baseball.team + baseball.year.astype(str)
baseball_newind = baseball.copy()
baseball_newind.index = player_unique
baseball_newind.head()
baseball_newind.index.is_unique

# Reindex in reverse order.
baseball.reindex(baseball.index[::-1]).head()

# Reindex onto a contiguous range of ids (range() excludes its upper bound).
id_range = range(baseball.index.values.min(), baseball.index.values.max())
baseball.reindex(id_range).head()

# Ways of filling the rows introduced by reindexing.
baseball.reindex(id_range, method='ffill', columns=['player', 'year']).head()
baseball.reindex(id_range, fill_value='mr.nobody', columns=['player']).head()

# drop returns a new object with the given rows (or columns, with axis=1)
# removed.
baseball.shape
baseball.drop([89525, 89526])
baseball.drop(['ibb', 'hbp'], axis=1)
# --- Indexing and selection ---
# Sample Series object
hits = baseball_newind.h
hits

# Numpy-style indexing
hits[:3]

# Indexing by label
hits[['womacto01CHN2006', 'schilcu01BOS2006']]
hits['womacto01CHN2006':'gonzalu01ARI2006']

# Assign to a label slice.
# NOTE(review): `hits` comes straight from baseball_newind; whether this
# write also changes baseball_newind depends on pandas' copy/view rules —
# confirm intent.
hits['womacto01CHN2006':'gonzalu01ARI2006'] = 5
hits

# Column subsets and boolean row filters on a DataFrame.
baseball_newind[['h', 'ab']]
baseball_newind[baseball_newind.ab > 500]

# Row/column selection. .ix has been removed from pandas: label-based
# access is .loc and position-based access is .iloc.
baseball_newind.loc['gonzalu01ARI2006', ['h', 'X2b', 'X3b', 'hr']]
# Rows by label, then columns 5..7 by position.
baseball_newind.loc[['gonzalu01ARI2006', 'finlest01SFN2006']].iloc[:, 5:8]
baseball_newind.loc[:'myersmi01NYA2006', 'hr']

# Cross-section of a single row.
baseball_newind.xs('myersmi01NYA2006')

# --- Operations between aligned objects ---
# Home runs per player for each year, indexed by player.
hr2006 = baseball[baseball.year == 2006].xs('hr', axis=1)
hr2006.index = baseball.player[baseball.year == 2006]
hr2007 = baseball[baseball.year == 2007].xs('hr', axis=1)
hr2007.index = baseball.player[baseball.year == 2007]

# The same two Series built directly from values and an index.
hr2006 = pd.Series(baseball.hr[baseball.year == 2006].values,
                   index=baseball.player[baseball.year == 2006])
hr2007 = pd.Series(baseball.hr[baseball.year == 2007].values,
                   index=baseball.player[baseball.year == 2007])

# Addition aligns on the index; filter the result to the non-null entries.
hr_total = hr2006 + hr2007
hr_total
hr_total[hr_total.notnull()]

# fill_value substitutes 0 for a label missing on one side.
hr2007.add(hr2006, fill_value=0)

# A scalar is broadcast across the Series.
baseball.hr - baseball.hr.max()

# Single cell by row label and column name.
baseball.loc[89521, "player"]
# --- Row-wise differences and function application ---
stats = baseball[['h', 'X2b', 'X3b', 'hr']]
# Subtract the row with id 89521 from every row.
diff = stats - stats.xs(89521)
diff[:10]

# apply calls the function once per column by default.
stats.apply(np.median)


def stat_range(x):
    """Return the spread (max minus min) of *x*."""
    return x.max() - x.min()


stats.apply(stat_range)


def slg(x):
    """Return total bases divided by at-bats for one row *x*.

    Singles are h - X2b - X3b - hr; doubles, triples and home runs are
    weighted 2, 3 and 4. A tiny constant is added to the denominator so a
    row with zero at-bats does not divide by zero.
    """
    singles = x['h'] - x['X2b'] - x['X3b'] - x['hr']
    total_bases = singles + 2 * x['X2b'] + 3 * x['X3b'] + 4 * x['hr']
    return total_bases / (x['ab'] + 1e-6)


# axis=1 applies the function to each row; the result is then formatted to
# three decimals.
baseball.apply(slg, axis=1).apply(lambda x: '%.3f' % x)

# --- Sorting and ranking ---
baseball_newind.sort_index().head()
baseball_newind.sort_index(ascending=False).head()
baseball_newind.sort_index(axis=1).head()

# Sort by values. Series.order and sort_index(by=...) have been removed
# from pandas; sort_values replaces both.
baseball.hr.sort_values(ascending=False)
baseball[['player', 'sb', 'cs']].sort_values(
    by=['sb', 'cs'], ascending=[False, True]
).head(10)

# Ranking, including how ties are handled.
baseball.hr.rank()
pd.Series([100, 100]).rank()
baseball.hr.rank(method='first')
baseball.rank(ascending=False).head()
baseball[['r', 'h', 'hr']].rank(ascending=False).head()
# --- Hierarchical indexing ---
# Use three columns together as a multi-level row index.
baseball_h = baseball.set_index(['year', 'team', 'player'])
baseball_h.head(10)
baseball_h.index[:10]
baseball_h.index.is_unique

# A full key is a tuple with one entry per level (.ix has been removed from
# pandas; use .loc).
baseball_h.loc[(2007, 'ATL', 'francju01')]

# Same idea when the hierarchical index is built at read time.
mb = pd.read_csv("data/microbiome.csv", index_col=['Taxon', 'Patient'])
mb.head(10)
mb.index

# Partial key: select on the outer level only.
mb.loc['Proteobacteria']
# A small frame with two-level indices on both rows and columns.
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
frame

# Name the levels of each axis.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

# Partial selection on the outer row level, then the outer column level
# (.ix has been removed from pandas; use .loc).
frame.loc['a']['Ohio']
# Full row key ('b', 2) combined with the outer column level 'Colorado'.
frame.loc[('b', 2), 'Colorado']
# Swap the order of the two index levels.
mb.swaplevel('Patient', 'Taxon').head()
# Sort by one level (sortlevel has been removed from pandas; use
# sort_index with level=).
mb.sort_index(level='Patient', ascending=False).head()
# --- Missing data ---
# Both np.nan and None are reported as null (the bare names NaN / nan are
# not defined in this file, so np.nan is used).
foo = pd.Series([np.nan, -3, None, 'foobar'])
foo
foo.isnull()

# Two ways of dropping missing entries from a Series.
bacteria2
bacteria2.dropna()
bacteria2[bacteria2.notnull()]

# On a DataFrame, dropna removes rows with any missing value by default;
# how='all' removes only rows that are entirely missing.
data
data.dropna()
data.dropna(how='all')

# Introduce a missing value by label (.ix has been removed from pandas;
# use .loc).
data.loc[7, 'year'] = np.nan
data

# thresh keeps rows with at least that many non-missing values; axis=1
# drops columns instead of rows.
data.dropna(thresh=4)
data.dropna(axis=1)

# Filling: a scalar, or a per-column dict. These return new objects.
bacteria2.fillna(0)
data.fillna({'year': 2013, 'treatment': 2})
data

# Fill a column and store it back. Assigning the result is used instead of
# an inplace fillna on data.year, which operates on an intermediate object.
data['year'] = data['year'].fillna(2013)
data

# Backward fill (fillna(method=...) is deprecated in favour of bfill()),
# and filling with the mean.
bacteria2.bfill()
bacteria2.fillna(bacteria2.mean())
# --- Data summarization ---
baseball.sum()
# Restrict to numeric columns: the frame also holds string columns such as
# player and team, for which a mean is not defined.
baseball.mean(numeric_only=True)

# skipna=False includes missing values in the calculation.
bacteria2
bacteria2.mean()
bacteria2.mean(skipna=False)

# Row-wise sum of doubles, triples and home runs, largest first
# (Series.order has been removed from pandas; use sort_values).
extra_bases = baseball[['X2b', 'X3b', 'hr']].sum(axis=1)
extra_bases.sort_values(ascending=False)

# describe() summarises a whole frame or a single column.
baseball.describe()
baseball.player.describe()

# Covariance and correlation between pairs of columns.
baseball.hr.cov(baseball.X2b)
baseball.hr.corr(baseball.X2b)
baseball.ab.corr(baseball.h)
# Correlation matrix of the numeric columns.
baseball.corr(numeric_only=True)

# Aggregate within one level of a hierarchical index (sum(level=...) has
# been removed from pandas; group by the level instead).
mb.head()
mb.groupby(level='Taxon').sum()

# --- Writing data to files ---
mb.to_csv("mb.csv")
baseball.to_pickle("baseball_pickle")
# NOTE: only unpickle files from a trusted source.
pd.read_pickle("baseball_pickle")