! pip install seaborn # Pandas or "Panel Data Analysis" toolkit for Data Frames/Data Tables import pandas as pd # Numpy or "Numerical Python" import numpy as np # Powerful R-style/statistical plotting import seaborn as sns # These styles are my personal preferences # For more options, see this page: http://web.stanford.edu/~mwaskom/software/seaborn/tutorial/aesthetics.html sns.set(style='whitegrid', context='notebook') # Show the figures directly in the IPython notebok %matplotlib inline cd ~/Downloads/biom262_2015_01_29_cleaning_data ls ! head GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt ! head -n 23 GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt ! tail GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt # Exercise: modify the `tail` command below to look at the last 17 lines of the file. ! tail GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt ! wc -l GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt # Exercise: do a web search for "unix count number of columns" and # check the command on the file, GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt ! # Column-counting code goes here subject_phenotypes = pd.read_table('GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt') subject_phenotypes.head() # Run this cell. Try entering 'subjid' and 'subject id' as well. subject_phenotypes['SUBJID'] subject_phenotypes.SUBJID # Exercise: Show the last 8 rows. # Code for looking at the last 8 rows goes here. # A python "dictionary", or mapping from one thing to the next. # In this case, we're mapping the numbers 1 and 2 to strings # indicating the gender. gender = {1: 'fillmein', 2: 'fillmein'} # Run this cell. How do you access the other gender? gender[2] subject_phenotypes["don't worry"] = "be yonce" # Code to look at the top of the dataframe goes here # hint: remember the command "head"? How did we use it to look at the dataframe when we first loaded it? # Code to add a column with programmer's choice of name and value goes here subject_phenotypes.GENDER.map(gender) for g in subject_phenotypes.GENDER: print gender[g] # Code to check if the dataframe subject_phenotypes has changed goes here # You can check if it has changed by looking at it using your favorite body endpoint # Code to `map` `gender` onto DTHHRDY goes here # Code to create a new column called "gender" in `subject_phenotypes` that is the result of using `map` # with the `gender` dictionary on the "GENDER" column of `subject_phenotypes`. # Code to check if the dataframe has changed goes here # Code to create a new column in `subject_phenotypes` that is a human-readable version of the column `DTHHRDY` goes here sns.factorplot('gender', data=subject_phenotypes) # Edit the argument `hue=...` sns.factorplot('gender', data=subject_phenotypes, hue=human_readable_DTHHRDY) sns.factorplot('gender', data=subject_phenotypes, hue='DTHHRDY', col='AGE') # Code goes below # Code to look at the top of GTEx_Data_2014-01-17_Annotations_SampleAttributesDS.txt goes here # Code to count the number of lines in GTEx_Data_2014-01-17_Annotations_SampleAttributesDS.txt goes here # Code to count the number of columns in GTEx_Data_2014-01-17_Annotations_SampleAttributesDS.txt goes here # Code to read the table GTEx_Data_2014-01-17_Annotations_SampleAttributesDS.txt goes here pd.read_table('GTEx_Data_2014-01-17_Annotations_SampleAttributesDS.txt') # Code to look at the top of the `DataFrame` you just created # Code to read the excel file GTEx_Data_2014-01-17_Annotations_SampleAttributesDD.xlsx, and set the first column as the "index" goes here sample_attributes_dd = pd.read_excel('GTEx_Data_2014-01-17_Annotations_SampleAttributesDD.xlsx', index_col=0) # Code to look at the top of the file goes here sample_attributes_dd.head() # Code for showing the descriptive column in the dataframe `sample_attributes_dd` sample_attributes.rename(columns=sample_attributes_dd['TYPE']) # Code to rename the columns of sample_attributes using a column from sample_attributes_dd sample_attributes = sample_attributes.rename(columns=sample_attributes_dd.VARDESC) # Code for looking at the top of the new sample_attributes goes here sample_attributes sample_attributes.ix[:5, :20] # Code for looking at the top of `subject_phenotypes` goes here # Exercise: Code to show a column of `subject_phenotypes` goes here # Exercise: Code to show a column of `sample_attributes` goes here s = 'you have as many hours in a day as beyonce' s.split() s.split('d') # Code to split `s` goes here list s.split()[4] # Code to get "day" from s.split() goes here s.split()[:5] # Code to split `s` on "a", and get the first 3 elements ' '.join(s.split()[:5]) '!'.join(s.split('e')[:4]) # Code for split 1, join 1 goes here # Code for split 2, join 2 goes here # Code for split 3, join 3 goes here subject_phenotypes['SUBJID'].map(lambda x: x.split('-')[0]) # Code to split the column 'Type of nucleic acid isolation batch' on whitespace and get the third element goes here sample_attributes['Tissue Type, more specific detail of tissue type'].map(lambda x: x.split('-')[0]) sample_attributes['Tissue Type, more specific detail of tissue type'].map(lambda x: x.split('-')[0] if isinstance(x, str) else np.nan) # Code goes here sample_attributes['Tissue Type, more specific detail of tissue type'].map(lambda x: '_'.join(x.split()[:3]) if isinstance(x, str) else np.nan) # Code goes here # Code to look at the top of `subject_phenotypes` # Code to look at the top of `sample_attributes` # Code goes here dataframe1 = pd.DataFrame([['cucumber', 'watery'], ['broccoli', 'crunchy'], ['kale', 'chewy'], ['mango', 'sweet'] ], columns=['vegetable', 'description']) dataframe1 dataframe2 = pd.DataFrame([['broccoli', 'harvested', 8], ['broccoli', 'planted', 5], ['kale', 'planted', 6], ['kale', 'harvested', 9], ['cucumber', 'harvested', 7], ['cucumber', 'planted', 4], ['strawberry', 'planted', 10], ['strawberry', 'harvested', 2]], columns=['crop', 'action', 'number']) dataframe2 dataframe1.merge(dataframe2, left_on='vegetable', right_on='crop') # Code goes here # Code goes here