import pandas as pd
# learn more about pandas http://pandas.pydata.org/pandas-docs/stable/indexing.html
# Import the Pandas library
import pandas as pd
# Load the train and test datasets to create two DataFrames
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
#Print the `head` of the train and test dataframes
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
test.head()
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
train.to_csv('/Users/chengjun/github/cjc/data/tatanic_train.csv')
test.to_csv('/Users/chengjun/github/cjc/data/tatanic_test.csv')
You can easily explore a DataFrame
train.shape
(891, 12)
train.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
import pandas as pd
train = pd.read_csv('/Users/chengjun/github/cjc/data/tatanic_train.csv',\
sep = ",", header=0)
train[:3]
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
# Passengers that survived vs passengers that passed away
train["Survived"][:5]
0 0 1 1 2 1 3 1 4 0 Name: Survived, dtype: int64
# Passengers that survived vs passengers that passed away
train["Survived"].value_counts()
0 549 1 342 Name: Survived, dtype: int64
# As proportions
train["Survived"].value_counts(normalize = True)
0 0.616162 1 0.383838 Name: Survived, dtype: float64
train['Sex'].value_counts()
male 577 female 314 Name: Sex, dtype: int64
# Males that survived vs males that passed away
train["Survived"][train["Sex"] == 'male'].value_counts()
0 468 1 109 Name: Survived, dtype: int64
# Females that survived vs Females that passed away
train["Survived"][train["Sex"] == 'female'].value_counts()
1 233 0 81 Name: Survived, dtype: int64
# Normalized male survival
train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)
0 0.811092 1 0.188908 Name: Survived, dtype: float64
# Normalized female survival
train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)
1 0.742038 0 0.257962 Name: Survived, dtype: float64
# Create the column Child, and indicate whether child or not a child. Print the new column.
train["Child"] = float('NaN')
train.Child[train.Age < 5] = 1
train.Child[train.Age >= 5] = 0
print train.Child[:3]
0 0.0 1 0.0 2 0.0 Name: Child, dtype: float64
/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance() /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Normalized Survival Rates for under 18
train.Survived[train.Child == 1].value_counts(normalize = True)
1 0.675 0 0.325 Name: Survived, dtype: float64
# Normalized Survival Rates for over 18
train.Survived[train.Child == 0].value_counts(normalize = True)
0 0.609792 1 0.390208 Name: Survived, dtype: float64
# Create a copy of test: test_one
test_one = test
# Initialize a Survived column to 0
test_one['Survived'] = 0
# Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one`
test_one.Survived[test_one.Sex =='female'] = 1
print test_one.Survived[:3]
0 0 1 1 2 0 Name: Survived, dtype: int64
/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
#Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
#Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')
#Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from ipykernel import kernelapp as app /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance() /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
train.sort_values(by = ['Age'])[:5]
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Child | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
803 | 803 | 804 | 1 | 3 | Thomas, Master. Assad Alexander | 0 | 0.42 | 0 | 1 | 2625 | 8.5167 | NaN | 1 | 1.0 |
755 | 755 | 756 | 1 | 2 | Hamalainen, Master. Viljo | 0 | 0.67 | 1 | 1 | 250649 | 14.5000 | NaN | 0 | 1.0 |
644 | 644 | 645 | 1 | 3 | Baclini, Miss. Eugenie | 1 | 0.75 | 2 | 1 | 2666 | 19.2583 | NaN | 1 | 1.0 |
469 | 469 | 470 | 1 | 3 | Baclini, Miss. Helene Barbara | 1 | 0.75 | 2 | 1 | 2666 | 19.2583 | NaN | 1 | 1.0 |
78 | 78 | 79 | 1 | 2 | Caldwell, Master. Alden Gates | 0 | 0.83 | 0 | 2 | 248738 | 29.0000 | NaN | 0 | 1.0 |
train.sort_values(by = ['Age'], ascending = False)[:5]
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Child | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
630 | 630 | 631 | 1 | 1 | Barkworth, Mr. Algernon Henry Wilson | 0 | 80.0 | 0 | 0 | 27042 | 30.0000 | A23 | 0 | 0.0 |
851 | 851 | 852 | 0 | 3 | Svensson, Mr. Johan | 0 | 74.0 | 0 | 0 | 347060 | 7.7750 | NaN | 0 | 0.0 |
493 | 493 | 494 | 0 | 1 | Artagaveytia, Mr. Ramon | 0 | 71.0 | 0 | 0 | PC 17609 | 49.5042 | NaN | 1 | 0.0 |
96 | 96 | 97 | 0 | 1 | Goldschmidt, Mr. George B | 0 | 71.0 | 0 | 0 | PC 17754 | 34.6542 | A5 | 1 | 0.0 |
116 | 116 | 117 | 0 | 3 | Connors, Mr. Patrick | 0 | 70.5 | 0 | 0 | 370369 | 7.7500 | NaN | 2 | 0.0 |
train.sort_values(by = ['Pclass', 'Age'], ascending = False)[:5]
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Child | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
851 | 851 | 852 | 0 | 3 | Svensson, Mr. Johan | 0 | 74.0 | 0 | 0 | 347060 | 7.7750 | NaN | 0 | 0.0 |
116 | 116 | 117 | 0 | 3 | Connors, Mr. Patrick | 0 | 70.5 | 0 | 0 | 370369 | 7.7500 | NaN | 2 | 0.0 |
280 | 280 | 281 | 0 | 3 | Duane, Mr. Frank | 0 | 65.0 | 0 | 0 | 336439 | 7.7500 | NaN | 2 | 0.0 |
483 | 483 | 484 | 1 | 3 | Turkula, Mrs. (Hedwig) | 1 | 63.0 | 0 | 0 | 4134 | 9.5875 | NaN | 0 | 0.0 |
326 | 326 | 327 | 0 | 3 | Nysveen, Mr. Johan Hansen | 0 | 61.0 | 0 | 0 | 345364 | 6.2375 | NaN | 0 | 0.0 |
train.groupby(['SibSp']).sum()
Unnamed: 0 | PassengerId | Survived | Pclass | Age | Parch | Fare | Child | |
---|---|---|---|---|---|---|---|---|
SibSp | ||||||||
0 | 276257 | 276865 | 210 | 1430 | 14788.25 | 113 | 15620.7530 | 9.0 |
1 | 91694 | 91903 | 112 | 430 | 5506.42 | 137 | 9226.8004 | 15.0 |
2 | 11520 | 11548 | 13 | 66 | 565.50 | 18 | 1449.1041 | 4.0 |
3 | 5129 | 5145 | 4 | 41 | 167.00 | 21 | 1102.5418 | 4.0 |
4 | 6851 | 6869 | 3 | 54 | 127.00 | 27 | 573.4000 | 7.0 |
5 | 1679 | 1684 | 0 | 15 | 51.00 | 10 | 234.5000 | 1.0 |
8 | 3365 | 3372 | 0 | 21 | NaN | 14 | 486.8500 | NaN |
train.groupby(['SibSp', 'Survived']).sum()
Unnamed: 0 | PassengerId | Pclass | Age | Parch | Fare | Child | ||
---|---|---|---|---|---|---|---|---|
SibSp | Survived | |||||||
0 | 0 | 183902 | 184300 | 1004 | 9672.50 | 43 | 6611.6071 | 1.0 |
1 | 92355 | 92565 | 426 | 5115.75 | 70 | 9009.1459 | 8.0 | |
1 | 0 | 40533 | 40630 | 232 | 2739.00 | 71 | 3256.1710 | 1.0 |
1 | 51161 | 51273 | 198 | 2767.42 | 66 | 5970.6294 | 14.0 | |
2 | 0 | 5206 | 5221 | 39 | 364.00 | 5 | 511.2000 | 0.0 |
1 | 6314 | 6327 | 27 | 201.50 | 13 | 937.9041 | 4.0 | |
3 | 0 | 3889 | 3901 | 34 | 57.00 | 17 | 539.6918 | 4.0 |
1 | 1240 | 1244 | 7 | 110.00 | 4 | 562.8500 | 0.0 | |
4 | 0 | 6289 | 6304 | 45 | 102.00 | 21 | 502.7000 | 6.0 |
1 | 562 | 565 | 9 | 25.00 | 6 | 70.7000 | 1.0 | |
5 | 0 | 1679 | 1684 | 15 | 51.00 | 10 | 234.5000 | 1.0 |
8 | 0 | 3365 | 3372 | 21 | NaN | 14 | 486.8500 | NaN |
learn more about pandas http://pandas.pydata.org/pandas-docs/stable/indexing.html