'''
EXERCISE: "Human Learning" with iris data
'''
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
%matplotlib inline
# load the famous iris data
iris = load_iris()
type(iris)
sklearn.datasets.base.Bunch
type(iris.data)
numpy.ndarray
# what do you think these attributes represent?
iris.data[:5,]
array([[ 5.1, 3.5, 1.4, 0.2], [ 4.9, 3. , 1.4, 0.2], [ 4.7, 3.2, 1.3, 0.2], [ 4.6, 3.1, 1.5, 0.2], [ 5. , 3.6, 1.4, 0.2]])
iris.data.shape
(150, 4)
iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
iris.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='|S10')
# intro to numpy
type(iris.data)
numpy.ndarray
## PART 1: Read data into pandas and explore
pd.DataFrame(iris.data).head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
iris.feature_names
# the feature_names are a bit messy, let's
# clean them up. remove the (cm)
# at the end and replace any spaces with an underscore
# create a list called "features" that
# holds the cleaned column names
# strip the trailing " (cm)" unit explicitly (rather than slicing off a fixed
# number of characters), then turn the remaining spaces into underscores
features = [name.replace(' (cm)', '').replace(' ', '_') for name in iris.feature_names] # <FILL IN>
features
['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# read the iris data into pandas, with our refined column names
df = pd.DataFrame(iris.data, columns=features)
df.head()
# .values gives the underlying NumPy array; as_matrix() is deprecated and
# no longer exists in newer pandas releases
df.values
array([[ 5.1, 3.5, 1.4, 0.2], [ 4.9, 3. , 1.4, 0.2], [ 4.7, 3.2, 1.3, 0.2], [ 4.6, 3.1, 1.5, 0.2], [ 5. , 3.6, 1.4, 0.2], [ 5.4, 3.9, 1.7, 0.4], [ 4.6, 3.4, 1.4, 0.3], [ 5. , 3.4, 1.5, 0.2], [ 4.4, 2.9, 1.4, 0.2], [ 4.9, 3.1, 1.5, 0.1], [ 5.4, 3.7, 1.5, 0.2], [ 4.8, 3.4, 1.6, 0.2], [ 4.8, 3. , 1.4, 0.1], [ 4.3, 3. , 1.1, 0.1], [ 5.8, 4. , 1.2, 0.2], [ 5.7, 4.4, 1.5, 0.4], [ 5.4, 3.9, 1.3, 0.4], [ 5.1, 3.5, 1.4, 0.3], [ 5.7, 3.8, 1.7, 0.3], [ 5.1, 3.8, 1.5, 0.3], [ 5.4, 3.4, 1.7, 0.2], [ 5.1, 3.7, 1.5, 0.4], [ 4.6, 3.6, 1. , 0.2], [ 5.1, 3.3, 1.7, 0.5], [ 4.8, 3.4, 1.9, 0.2], [ 5. , 3. , 1.6, 0.2], [ 5. , 3.4, 1.6, 0.4], [ 5.2, 3.5, 1.5, 0.2], [ 5.2, 3.4, 1.4, 0.2], [ 4.7, 3.2, 1.6, 0.2], [ 4.8, 3.1, 1.6, 0.2], [ 5.4, 3.4, 1.5, 0.4], [ 5.2, 4.1, 1.5, 0.1], [ 5.5, 4.2, 1.4, 0.2], [ 4.9, 3.1, 1.5, 0.1], [ 5. , 3.2, 1.2, 0.2], [ 5.5, 3.5, 1.3, 0.2], [ 4.9, 3.1, 1.5, 0.1], [ 4.4, 3. , 1.3, 0.2], [ 5.1, 3.4, 1.5, 0.2], [ 5. , 3.5, 1.3, 0.3], [ 4.5, 2.3, 1.3, 0.3], [ 4.4, 3.2, 1.3, 0.2], [ 5. , 3.5, 1.6, 0.6], [ 5.1, 3.8, 1.9, 0.4], [ 4.8, 3. , 1.4, 0.3], [ 5.1, 3.8, 1.6, 0.2], [ 4.6, 3.2, 1.4, 0.2], [ 5.3, 3.7, 1.5, 0.2], [ 5. , 3.3, 1.4, 0.2], [ 7. , 3.2, 4.7, 1.4], [ 6.4, 3.2, 4.5, 1.5], [ 6.9, 3.1, 4.9, 1.5], [ 5.5, 2.3, 4. , 1.3], [ 6.5, 2.8, 4.6, 1.5], [ 5.7, 2.8, 4.5, 1.3], [ 6.3, 3.3, 4.7, 1.6], [ 4.9, 2.4, 3.3, 1. ], [ 6.6, 2.9, 4.6, 1.3], [ 5.2, 2.7, 3.9, 1.4], [ 5. , 2. , 3.5, 1. ], [ 5.9, 3. , 4.2, 1.5], [ 6. , 2.2, 4. , 1. ], [ 6.1, 2.9, 4.7, 1.4], [ 5.6, 2.9, 3.6, 1.3], [ 6.7, 3.1, 4.4, 1.4], [ 5.6, 3. , 4.5, 1.5], [ 5.8, 2.7, 4.1, 1. ], [ 6.2, 2.2, 4.5, 1.5], [ 5.6, 2.5, 3.9, 1.1], [ 5.9, 3.2, 4.8, 1.8], [ 6.1, 2.8, 4. , 1.3], [ 6.3, 2.5, 4.9, 1.5], [ 6.1, 2.8, 4.7, 1.2], [ 6.4, 2.9, 4.3, 1.3], [ 6.6, 3. , 4.4, 1.4], [ 6.8, 2.8, 4.8, 1.4], [ 6.7, 3. , 5. , 1.7], [ 6. , 2.9, 4.5, 1.5], [ 5.7, 2.6, 3.5, 1. ], [ 5.5, 2.4, 3.8, 1.1], [ 5.5, 2.4, 3.7, 1. ], [ 5.8, 2.7, 3.9, 1.2], [ 6. , 2.7, 5.1, 1.6], [ 5.4, 3. , 4.5, 1.5], [ 6. 
, 3.4, 4.5, 1.6], [ 6.7, 3.1, 4.7, 1.5], [ 6.3, 2.3, 4.4, 1.3], [ 5.6, 3. , 4.1, 1.3], [ 5.5, 2.5, 4. , 1.3], [ 5.5, 2.6, 4.4, 1.2], [ 6.1, 3. , 4.6, 1.4], [ 5.8, 2.6, 4. , 1.2], [ 5. , 2.3, 3.3, 1. ], [ 5.6, 2.7, 4.2, 1.3], [ 5.7, 3. , 4.2, 1.2], [ 5.7, 2.9, 4.2, 1.3], [ 6.2, 2.9, 4.3, 1.3], [ 5.1, 2.5, 3. , 1.1], [ 5.7, 2.8, 4.1, 1.3], [ 6.3, 3.3, 6. , 2.5], [ 5.8, 2.7, 5.1, 1.9], [ 7.1, 3. , 5.9, 2.1], [ 6.3, 2.9, 5.6, 1.8], [ 6.5, 3. , 5.8, 2.2], [ 7.6, 3. , 6.6, 2.1], [ 4.9, 2.5, 4.5, 1.7], [ 7.3, 2.9, 6.3, 1.8], [ 6.7, 2.5, 5.8, 1.8], [ 7.2, 3.6, 6.1, 2.5], [ 6.5, 3.2, 5.1, 2. ], [ 6.4, 2.7, 5.3, 1.9], [ 6.8, 3. , 5.5, 2.1], [ 5.7, 2.5, 5. , 2. ], [ 5.8, 2.8, 5.1, 2.4], [ 6.4, 3.2, 5.3, 2.3], [ 6.5, 3. , 5.5, 1.8], [ 7.7, 3.8, 6.7, 2.2], [ 7.7, 2.6, 6.9, 2.3], [ 6. , 2.2, 5. , 1.5], [ 6.9, 3.2, 5.7, 2.3], [ 5.6, 2.8, 4.9, 2. ], [ 7.7, 2.8, 6.7, 2. ], [ 6.3, 2.7, 4.9, 1.8], [ 6.7, 3.3, 5.7, 2.1], [ 7.2, 3.2, 6. , 1.8], [ 6.2, 2.8, 4.8, 1.8], [ 6.1, 3. , 4.9, 1.8], [ 6.4, 2.8, 5.6, 2.1], [ 7.2, 3. , 5.8, 1.6], [ 7.4, 2.8, 6.1, 1.9], [ 7.9, 3.8, 6.4, 2. ], [ 6.4, 2.8, 5.6, 2.2], [ 6.3, 2.8, 5.1, 1.5], [ 6.1, 2.6, 5.6, 1.4], [ 7.7, 3. , 6.1, 2.3], [ 6.3, 3.4, 5.6, 2.4], [ 6.4, 3.1, 5.5, 1.8], [ 6. , 3. , 4.8, 1.8], [ 6.9, 3.1, 5.4, 2.1], [ 6.7, 3.1, 5.6, 2.4], [ 6.9, 3.1, 5.1, 2.3], [ 5.8, 2.7, 5.1, 1.9], [ 6.8, 3.2, 5.9, 2.3], [ 6.7, 3.3, 5.7, 2.5], [ 6.7, 3. , 5.2, 2.3], [ 6.3, 2.5, 5. , 1.9], [ 6.5, 3. , 5.2, 2. ], [ 6.2, 3.4, 5.4, 2.3], [ 5.9, 3. , 5.1, 1.8]])
# EXERCISE
# create a list of species (should be 150 elements)
# using iris.target and iris.target_names
# resulting list should only have the words "setosa", "versicolor", and "virginica"
'''
species ==
['setosa',
'setosa',
'setosa',
'setosa',
...
...
'virginica',
'virginica']
Hint: use the iris.target_names and iris.target arrays
'''
"\nspecies == \n['setosa',\n 'setosa',\n 'setosa',\n 'setosa',\n...\n...\n 'virginica',\n 'virginica']\n\nHint: use the iris.target_names and iris.target arrays\n"
# ANSWER
# look up each integer code in iris.target in iris.target_names to get
# one species name per observation
species = [iris.target_names[i] for i in iris.target]
# alternative: index target_names with the whole target array at once
# (this line only displays the result; it does not assign it)
list(iris.target_names[iris.target])
['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'versicolor', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica', 'virginica']
# add the species list as a new DataFrame column
df['species'] = species
df.head()
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# explore data numerically, looking for differences between species
# try grouping by species and checking out the different predictors
df.describe()
df.groupby('species').describe()
petal_length | petal_width | sepal_length | sepal_width | ||
---|---|---|---|---|---|
species | |||||
setosa | count | 50.000000 | 50.000000 | 50.000000 | 50.000000 |
mean | 1.464000 | 0.244000 | 5.006000 | 3.418000 | |
std | 0.173511 | 0.107210 | 0.352490 | 0.381024 | |
min | 1.000000 | 0.100000 | 4.300000 | 2.300000 | |
25% | 1.400000 | 0.200000 | 4.800000 | 3.125000 | |
50% | 1.500000 | 0.200000 | 5.000000 | 3.400000 | |
75% | 1.575000 | 0.300000 | 5.200000 | 3.675000 | |
max | 1.900000 | 0.600000 | 5.800000 | 4.400000 | |
versicolor | count | 50.000000 | 50.000000 | 50.000000 | 50.000000 |
mean | 4.260000 | 1.326000 | 5.936000 | 2.770000 | |
std | 0.469911 | 0.197753 | 0.516171 | 0.313798 | |
min | 3.000000 | 1.000000 | 4.900000 | 2.000000 | |
25% | 4.000000 | 1.200000 | 5.600000 | 2.525000 | |
50% | 4.350000 | 1.300000 | 5.900000 | 2.800000 | |
75% | 4.600000 | 1.500000 | 6.300000 | 3.000000 | |
max | 5.100000 | 1.800000 | 7.000000 | 3.400000 | |
virginica | count | 50.000000 | 50.000000 | 50.000000 | 50.000000 |
mean | 5.552000 | 2.026000 | 6.588000 | 2.974000 | |
std | 0.551895 | 0.274650 | 0.635880 | 0.322497 | |
min | 4.500000 | 1.400000 | 4.900000 | 2.200000 | |
25% | 5.100000 | 1.800000 | 6.225000 | 2.800000 | |
50% | 5.550000 | 2.000000 | 6.500000 | 3.000000 | |
75% | 5.875000 | 2.300000 | 6.900000 | 3.175000 | |
max | 6.900000 | 2.500000 | 7.900000 | 3.800000 |
df.groupby('species').sepal_length.mean()
species setosa 5.006 versicolor 5.936 virginica 6.588 Name: sepal_length, dtype: float64
# select several columns from a groupby with a list (double brackets);
# a bare tuple of column names is deprecated in newer pandas
df.groupby('species')[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].mean()
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
species | ||||
setosa | 5.006 | 3.418 | 1.464 | 0.244 |
versicolor | 5.936 | 2.770 | 4.260 | 1.326 |
virginica | 6.588 | 2.974 | 5.552 | 2.026 |
'''
agg is a new function we haven't seen yet. It will
aggregate each column using specified lists of functions.
We have been using some of its shortcuts but using
agg allows us to put in many functions at a time
df.groupby('species').agg(np.mean)
==
df.groupby('species').mean()
BUT
df.groupby('species').agg([np.min, np.max])
doesn't have a short form
'''
df.groupby('species').agg([np.min, np.max])
sepal_length | sepal_width | petal_length | petal_width | |||||
---|---|---|---|---|---|---|---|---|
amin | amax | amin | amax | amin | amax | amin | amax | |
species | ||||||||
setosa | 4.3 | 5.8 | 2.3 | 4.4 | 1.0 | 1.9 | 0.1 | 0.6 |
versicolor | 4.9 | 7.0 | 2.0 | 3.4 | 3.0 | 5.1 | 1.0 | 1.8 |
virginica | 4.9 | 7.9 | 2.2 | 3.8 | 4.5 | 6.9 | 1.4 | 2.5 |
# one matplotlib color code per species so the classes can be told apart
color_map = {
    'versicolor': 'r',
    'setosa': 'b',
    'virginica': 'g',
}
# per-row color, in the same order as the rows of df
colors = [color_map[s] for s in df['species']]
# pd.scatter_matrix is gone from the top-level namespace in newer pandas;
# prefer pandas.plotting (pandas >= 0.20) and fall back to the legacy
# location so older installs keep working
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(df, color=colors, figsize=(10, 10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11d6018d0>, <matplotlib.axes._subplots.AxesSubplot object at 0x11d3b0a10>, <matplotlib.axes._subplots.AxesSubplot object at 0x11d517110>, <matplotlib.axes._subplots.AxesSubplot object at 0x11d328390>], [<matplotlib.axes._subplots.AxesSubplot object at 0x126bbf810>, <matplotlib.axes._subplots.AxesSubplot object at 0x11d3a4c50>, <matplotlib.axes._subplots.AxesSubplot object at 0x126d6d2d0>, <matplotlib.axes._subplots.AxesSubplot object at 0x12700e150>], [<matplotlib.axes._subplots.AxesSubplot object at 0x12706cdd0>, <matplotlib.axes._subplots.AxesSubplot object at 0x1270fe0d0>, <matplotlib.axes._subplots.AxesSubplot object at 0x127153e50>, <matplotlib.axes._subplots.AxesSubplot object at 0x1271e1310>], [<matplotlib.axes._subplots.AxesSubplot object at 0x12718e790>, <matplotlib.axes._subplots.AxesSubplot object at 0x1272c5e10>, <matplotlib.axes._subplots.AxesSubplot object at 0x12745ffd0>, <matplotlib.axes._subplots.AxesSubplot object at 0x1274cf750>]], dtype=object)
# parallel coordinates help us visualize multivariate data
# (pandas.tools.plotting no longer exists in newer pandas; try the current
# location first and fall back to the legacy one on old installs)
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates
parallel_coordinates(df, 'species')
<matplotlib.axes._subplots.AxesSubplot at 0x11e1e1250>
# I used values in order to see all of the data at once
# without .values, a dataframe is returned
## PART 2: Write a function to predict the species for each observation
### EXERCISE!!!!! ###
# note data must be in order
# ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# define a function that takes in a row of data and returns a predicted species
def classify_iris(data):
    """Predict the species for a single iris observation (exercise template).

    Every condition below is a constant placeholder that evaluates to False,
    so the function currently always returns 'I have no clue'. Replace the
    conditions (and the returned labels) with rules based on the values in
    ``data``.

    Parameters
    ----------
    data : sequence
        One row of measurements, in the order
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'].

    Returns
    -------
    str
        The predicted label.
    """
    if False and False or False and False:
        return 'rose'
    elif False and (True or False):
        return 'lily'
    else:
        return 'I have no clue'
# example use
classify_iris([3.4, 5.4, 7.5, 2.3])
'I have no clue'
def test_my_function(func):
    """Return the accuracy of ``func`` on the module-level ``df``.

    Parameters
    ----------
    func : callable
        Takes one row of measurements, in the order
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
        and returns a predicted species.

    Returns
    -------
    float
        Fraction of rows whose prediction equals the ``species`` column.
    """
    # hand the classifier only the measurement columns: the full row would
    # also carry the true species label in its last position
    feature_rows = df.drop('species', axis=1).values
    # make predictions and store as numpy array
    preds = np.array([func(row) for row in feature_rows])
    # calculate the accuracy of the predictions
    return np.mean(preds == df.species.values)
test_my_function(classify_iris)
0.0
# EXAMPLE using only petal_width
def sinan_classify_iris(data):
    """Baseline classifier that looks only at petal_width (``data[3]``).

    Returns 'setosa' when petal_width is below 3 and None otherwise.
    The largest petal_width in the iris data is 2.5, so on that data this
    always predicts 'setosa' -- a deliberately easy baseline to beat.

    Parameters
    ----------
    data : sequence
        One row of measurements, in the order
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'].
    """
    return 'setosa' if data[3] < 3 else None
test_my_function(sinan_classify_iris) # TRY AND BEAT ME
0.33333333333333331