In [68]:

'''
EXERCISE: "Human Learning" with iris data
'''
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
%matplotlib inline

In [69]:

# load the famous iris data
iris = load_iris()
type(iris)

Out[69]:

sklearn.datasets.base.Bunch

In [70]:

type(iris.data)

Out[70]:

numpy.ndarray

In [71]:

# what do you think these attributes represent?
iris.data[:5,]

Out[71]:

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])

In [72]:

iris.data.shape

Out[72]:

(150, 4)

In [73]:

iris.feature_names

Out[73]:

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [74]:

iris.target

Out[74]:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [75]:

iris.target_names

Out[75]:

array(['setosa', 'versicolor', 'virginica'], 
      dtype='|S10')

In [76]:

# intro to numpy
type(iris.data)

Out[76]:

numpy.ndarray

In [77]:

## PART 1: Read data into pandas and explore
pd.DataFrame(iris.data).head()

Out[77]:

	0	1	2	3
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

In [78]:

# the feature_names are a bit messy, let's
# clean them up: remove the " (cm)" unit suffix
# and replace any spaces with an underscore.
# create a list called "features" that
# holds the cleaned column names
# (the suffix is stripped explicitly rather than by slicing off a fixed
# number of characters, so a name without the unit is not truncated)
features = [name.replace(' (cm)', '').replace(' ', '_') for name in iris.feature_names]
features

Out[78]:

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [79]:

# read the iris data into pandas, with our refined column names
df = pd.DataFrame(iris.data, columns=features)

In [80]:

df.head()
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# underlying NumPy array, so the whole data set is still shown at once
df.values

Out[80]:

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4.8,  3.1,  1.6,  0.2],
       [ 5.4,  3.4,  1.5,  0.4],
       [ 5.2,  4.1,  1.5,  0.1],
       [ 5.5,  4.2,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5. ,  3.2,  1.2,  0.2],
       [ 5.5,  3.5,  1.3,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 4.4,  3. ,  1.3,  0.2],
       [ 5.1,  3.4,  1.5,  0.2],
       [ 5. ,  3.5,  1.3,  0.3],
       [ 4.5,  2.3,  1.3,  0.3],
       [ 4.4,  3.2,  1.3,  0.2],
       [ 5. ,  3.5,  1.6,  0.6],
       [ 5.1,  3.8,  1.9,  0.4],
       [ 4.8,  3. ,  1.4,  0.3],
       [ 5.1,  3.8,  1.6,  0.2],
       [ 4.6,  3.2,  1.4,  0.2],
       [ 5.3,  3.7,  1.5,  0.2],
       [ 5. ,  3.3,  1.4,  0.2],
       [ 7. ,  3.2,  4.7,  1.4],
       [ 6.4,  3.2,  4.5,  1.5],
       [ 6.9,  3.1,  4.9,  1.5],
       [ 5.5,  2.3,  4. ,  1.3],
       [ 6.5,  2.8,  4.6,  1.5],
       [ 5.7,  2.8,  4.5,  1.3],
       [ 6.3,  3.3,  4.7,  1.6],
       [ 4.9,  2.4,  3.3,  1. ],
       [ 6.6,  2.9,  4.6,  1.3],
       [ 5.2,  2.7,  3.9,  1.4],
       [ 5. ,  2. ,  3.5,  1. ],
       [ 5.9,  3. ,  4.2,  1.5],
       [ 6. ,  2.2,  4. ,  1. ],
       [ 6.1,  2.9,  4.7,  1.4],
       [ 5.6,  2.9,  3.6,  1.3],
       [ 6.7,  3.1,  4.4,  1.4],
       [ 5.6,  3. ,  4.5,  1.5],
       [ 5.8,  2.7,  4.1,  1. ],
       [ 6.2,  2.2,  4.5,  1.5],
       [ 5.6,  2.5,  3.9,  1.1],
       [ 5.9,  3.2,  4.8,  1.8],
       [ 6.1,  2.8,  4. ,  1.3],
       [ 6.3,  2.5,  4.9,  1.5],
       [ 6.1,  2.8,  4.7,  1.2],
       [ 6.4,  2.9,  4.3,  1.3],
       [ 6.6,  3. ,  4.4,  1.4],
       [ 6.8,  2.8,  4.8,  1.4],
       [ 6.7,  3. ,  5. ,  1.7],
       [ 6. ,  2.9,  4.5,  1.5],
       [ 5.7,  2.6,  3.5,  1. ],
       [ 5.5,  2.4,  3.8,  1.1],
       [ 5.5,  2.4,  3.7,  1. ],
       [ 5.8,  2.7,  3.9,  1.2],
       [ 6. ,  2.7,  5.1,  1.6],
       [ 5.4,  3. ,  4.5,  1.5],
       [ 6. ,  3.4,  4.5,  1.6],
       [ 6.7,  3.1,  4.7,  1.5],
       [ 6.3,  2.3,  4.4,  1.3],
       [ 5.6,  3. ,  4.1,  1.3],
       [ 5.5,  2.5,  4. ,  1.3],
       [ 5.5,  2.6,  4.4,  1.2],
       [ 6.1,  3. ,  4.6,  1.4],
       [ 5.8,  2.6,  4. ,  1.2],
       [ 5. ,  2.3,  3.3,  1. ],
       [ 5.6,  2.7,  4.2,  1.3],
       [ 5.7,  3. ,  4.2,  1.2],
       [ 5.7,  2.9,  4.2,  1.3],
       [ 6.2,  2.9,  4.3,  1.3],
       [ 5.1,  2.5,  3. ,  1.1],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.3,  3.3,  6. ,  2.5],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 7.1,  3. ,  5.9,  2.1],
       [ 6.3,  2.9,  5.6,  1.8],
       [ 6.5,  3. ,  5.8,  2.2],
       [ 7.6,  3. ,  6.6,  2.1],
       [ 4.9,  2.5,  4.5,  1.7],
       [ 7.3,  2.9,  6.3,  1.8],
       [ 6.7,  2.5,  5.8,  1.8],
       [ 7.2,  3.6,  6.1,  2.5],
       [ 6.5,  3.2,  5.1,  2. ],
       [ 6.4,  2.7,  5.3,  1.9],
       [ 6.8,  3. ,  5.5,  2.1],
       [ 5.7,  2.5,  5. ,  2. ],
       [ 5.8,  2.8,  5.1,  2.4],
       [ 6.4,  3.2,  5.3,  2.3],
       [ 6.5,  3. ,  5.5,  1.8],
       [ 7.7,  3.8,  6.7,  2.2],
       [ 7.7,  2.6,  6.9,  2.3],
       [ 6. ,  2.2,  5. ,  1.5],
       [ 6.9,  3.2,  5.7,  2.3],
       [ 5.6,  2.8,  4.9,  2. ],
       [ 7.7,  2.8,  6.7,  2. ],
       [ 6.3,  2.7,  4.9,  1.8],
       [ 6.7,  3.3,  5.7,  2.1],
       [ 7.2,  3.2,  6. ,  1.8],
       [ 6.2,  2.8,  4.8,  1.8],
       [ 6.1,  3. ,  4.9,  1.8],
       [ 6.4,  2.8,  5.6,  2.1],
       [ 7.2,  3. ,  5.8,  1.6],
       [ 7.4,  2.8,  6.1,  1.9],
       [ 7.9,  3.8,  6.4,  2. ],
       [ 6.4,  2.8,  5.6,  2.2],
       [ 6.3,  2.8,  5.1,  1.5],
       [ 6.1,  2.6,  5.6,  1.4],
       [ 7.7,  3. ,  6.1,  2.3],
       [ 6.3,  3.4,  5.6,  2.4],
       [ 6.4,  3.1,  5.5,  1.8],
       [ 6. ,  3. ,  4.8,  1.8],
       [ 6.9,  3.1,  5.4,  2.1],
       [ 6.7,  3.1,  5.6,  2.4],
       [ 6.9,  3.1,  5.1,  2.3],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 6.8,  3.2,  5.9,  2.3],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 6.7,  3. ,  5.2,  2.3],
       [ 6.3,  2.5,  5. ,  1.9],
       [ 6.5,  3. ,  5.2,  2. ],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.9,  3. ,  5.1,  1.8]])

In [81]:

# EXERCISEEE

# create a list of species (should be 150 elements) 
# using iris.target and iris.target_names
# resulting list should only have the words "setosa", "versicolor", and "virginica"
'''
species ==  
['setosa',
 'setosa',
 'setosa',
 'setosa',
...
...
 'virginica',
 'virginica']

Hint: use the iris.target_names and iris.target arrays
'''

Out[81]:

"\nspecies ==  \n['setosa',\n 'setosa',\n 'setosa',\n 'setosa',\n...\n...\n 'virginica',\n 'virginica']\n\nHint: use the iris.target_names and iris.target arrays\n"

In [82]:

# ANSWERRRR


# Approach 1: look up each integer label from iris.target in iris.target_names,
# one element at a time, and keep the result as the "species" list
species = [iris.target_names[i] for i in iris.target]
# Approach 2: index target_names with the whole target array at once and
# convert the result to a list; as the last expression, this is what the cell displays
list(iris.target_names[iris.target])

Out[82]:

['setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'versicolor',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica',
 'virginica']

In [83]:

# add the species list as a new DataFrame column
df['species'] = species
df.head()

Out[83]:

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

In [84]:

# explore data numerically, looking for differences between species
# try grouping by species and check out the different predictors
df.describe()
df.groupby('species').describe()

Out[84]:

		petal_length	petal_width	sepal_length	sepal_width
species
setosa	count	50.000000	50.000000	50.000000	50.000000
	mean	1.464000	0.244000	5.006000	3.418000
	std	0.173511	0.107210	0.352490	0.381024
	min	1.000000	0.100000	4.300000	2.300000
	25%	1.400000	0.200000	4.800000	3.125000
	50%	1.500000	0.200000	5.000000	3.400000
	75%	1.575000	0.300000	5.200000	3.675000
	max	1.900000	0.600000	5.800000	4.400000
versicolor	count	50.000000	50.000000	50.000000	50.000000
	mean	4.260000	1.326000	5.936000	2.770000
	std	0.469911	0.197753	0.516171	0.313798
	min	3.000000	1.000000	4.900000	2.000000
	25%	4.000000	1.200000	5.600000	2.525000
	50%	4.350000	1.300000	5.900000	2.800000
	75%	4.600000	1.500000	6.300000	3.000000
	max	5.100000	1.800000	7.000000	3.400000
virginica	count	50.000000	50.000000	50.000000	50.000000
	mean	5.552000	2.026000	6.588000	2.974000
	std	0.551895	0.274650	0.635880	0.322497
	min	4.500000	1.400000	4.900000	2.200000
	25%	5.100000	1.800000	6.225000	2.800000
	50%	5.550000	2.000000	6.500000	3.000000
	75%	5.875000	2.300000	6.900000	3.175000
	max	6.900000	2.500000	7.900000	3.800000

In [85]:

df.groupby('species').sepal_length.mean()

Out[85]:

species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

In [86]:

# select several columns from a groupby with a list of names; a bare tuple
# of names is no longer accepted by pandas
df.groupby('species')[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].mean()

Out[86]:

	sepal_length	sepal_width	petal_length	petal_width
species
setosa	5.006	3.418	1.464	0.244
versicolor	5.936	2.770	4.260	1.326
virginica	6.588	2.974	5.552	2.026

In [87]:

'''
agg is a new function we haven't seen yet. It will
aggregate each column using specified lists of functions.
We have been using some of its shortcuts but using
agg allows us to put in many functions at a time

df.groupby('species').agg(np.mean)
==
df.groupby('species').mean()

BUT 
df.groupby('species').agg([np.min, np.max])

doesn't have a short form
'''

df.groupby('species').agg([np.min, np.max])

Out[87]:

	sepal_length		sepal_width		petal_length		petal_width
	amin	amax	amin	amax	amin	amax	amin	amax
species
setosa	4.3	5.8	2.3	4.4	1.0	1.9	0.1	0.6
versicolor	4.9	7.0	2.0	3.4	3.0	5.1	1.0	1.8
virginica	4.9	7.9	2.2	3.8	4.5	6.9	1.4	2.5

In [97]:

# one matplotlib color code per species, so every point in the scatter
# matrix is colored by its true class
color_map = {
    'versicolor': 'r',
    'setosa': 'b',
    'virginica': 'g'
}
colors = [color_map[s] for s in df['species']]
# scatter_matrix lives in pandas.plotting; the old top-level
# pd.scatter_matrix alias has been removed from pandas
pd.plotting.scatter_matrix(df, color=colors, figsize=(10,10))

Out[97]:

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11d6018d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d3b0a10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d517110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d328390>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x126bbf810>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11d3a4c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x126d6d2d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12700e150>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x12706cdd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1270fe0d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x127153e50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1271e1310>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x12718e790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1272c5e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12745ffd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1274cf750>]], dtype=object)

In [96]:

# parallel coordinates help us visualize multivariate data

# the pandas.tools.plotting module has been removed from pandas;
# the plotting helpers are now exposed from pandas.plotting
from pandas.plotting import parallel_coordinates
parallel_coordinates(df, 'species')

Out[96]:

<matplotlib.axes._subplots.AxesSubplot at 0x11e1e1250>

In [90]:

# I used values in order to see all of the data at once
# without .values, a dataframe is returned


## PART 2: Write a function to predict the species for each observation


### EXERCISE!!!!! ###

# note data must be in order
# ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# define a function that takes in a row of data and returns a predicted species
def classify_iris(data):
    """Exercise template: return a predicted label for one row of data.

    The two rules below are placeholders built only from boolean constants,
    so ``data`` is not inspected yet and the fallback answer is always
    returned. Replace the placeholders with real conditions on ``data``.
    """
    # placeholder rules -- both evaluate to False until they are filled in
    first_rule = (False and False) or (False and False)
    second_rule = False and (True or False)

    if first_rule:
        return 'rose'
    if second_rule:
        return 'lily'
    return 'I have no clue'

# example use
classify_iris([3.4, 5.4, 7.5, 2.3])

Out[90]:

'I have no clue'

In [91]:

def test_my_function(func):
    # make predictions and store as numpy array
    preds = np.array([func(row) for row in df.values])


    # calculate the accuracy of the predictions
    return np.mean(preds == df.species.values)

In [92]:

test_my_function(classify_iris)

Out[92]:

0.0

In [ ]:

In [93]:

# EXAMPLEE using only petal_width
def sinan_classify_iris(data):
    """Baseline classifier that looks only at petal_width (``data[3]``).

    Returns 'setosa' when the petal width is below 3, otherwise None.
    """
    petal_width = data[3]
    # the largest petal_width in the iris data is 2.5, so on this data set
    # the rule answers 'setosa' for every row
    return 'setosa' if petal_width < 3 else None

In [94]:

test_my_function(sinan_classify_iris) # TRY AND BEAT ME

Out[94]:

0.33333333333333331

In [ ]:

	0	1	2	3
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	0	1	2	3
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	0	1	2	3
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2