# Import required libraries
from tpot import TPOT
from sklearn.cross_validation import StratifiedShuffleSplit
import pandas as pd
import numpy as np
# Load the data
# Read the Titanic training set from an absolute local path into a DataFrame.
titanic = pd.read_csv('/Users/chengjun/github/cjc2016/data/tatanic_train.csv')
# Preview the first five rows.
titanic.head(5)
Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Tally the values of the Survived column within each Sex group.
titanic.groupby('Sex').Survived.value_counts()
Sex Survived female 1 233 0 81 male 0 468 1 109 dtype: int64
# Tally the values of the Survived column within each (Pclass, Sex) group.
titanic.groupby(['Pclass','Sex']).Survived.value_counts()
Pclass Sex Survived 1 female 1 91 0 3 male 0 77 1 45 2 female 1 70 0 6 male 0 91 1 17 3 female 0 72 1 72 male 0 300 1 47 dtype: int64
# Cross-tabulate (Pclass, Sex) against Survived, then divide each row by its
# own total so every (Pclass, Sex) row becomes a pair of proportions.
# The table is named `survival_counts` so the builtin id() is not shadowed.
survival_counts = pd.crosstab([titanic.Pclass, titanic.Sex], titanic.Survived.astype(float))
survival_counts.div(survival_counts.sum(axis=1).astype(float), axis=0)
Survived | 0.0 | 1.0 | |
---|---|---|---|
Pclass | Sex | ||
1 | female | 0.031915 | 0.968085 |
male | 0.631148 | 0.368852 | |
2 | female | 0.078947 | 0.921053 |
male | 0.842593 | 0.157407 | |
3 | female | 0.500000 | 0.500000 |
male | 0.864553 | 0.135447 |
# Rename the target column 'Survived' to 'class' in place; later cells read it
# as titanic['class'].
titanic.rename(columns={'Survived': 'class'}, inplace=True)
# Inspect the column dtypes to see which columns are non-numeric (object).
titanic.dtypes
Unnamed: 0 int64 PassengerId int64 class int64 Pclass int64 Name object Sex object Age float64 SibSp int64 Parch int64 Ticket object Fare float64 Cabin object Embarked object dtype: object
# Report how many distinct values each object-typed column holds.
for cat in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
    # .unique() keeps NaN as a value of its own, so missing entries add one level.
    print("Number of levels in category '{0}': \b {1:2.2f} ".format(cat, titanic[cat].unique().size))
Number of levels in category 'Name': 891.00 Number of levels in category 'Sex': 2.00 Number of levels in category 'Ticket': 681.00 Number of levels in category 'Cabin': 148.00 Number of levels in category 'Embarked': 4.00
# Print the actual distinct values of the two low-cardinality columns.
for cat in ['Sex', 'Embarked']:
    print("Levels for catgeory '{0}': {1}".format(cat, titanic[cat].unique()))
Levels for catgeory 'Sex': ['male' 'female'] Levels for catgeory 'Embarked': ['S' 'C' 'Q' nan]
# Integer-encode the two small categorical columns. A value that is absent from
# a mapping (such as NaN in Embarked) comes out as NaN and is caught by the
# fill below.
titanic['Sex'] = titanic['Sex'].map({
    'male': 0,
    'female': 1,
})
titanic['Embarked'] = titanic['Embarked'].map({
    'S': 0,
    'C': 1,
    'Q': 2,
})

# Replace every remaining missing value with the sentinel -999.
titanic = titanic.fillna(value=-999)

# Confirm, column by column, that no null is left.
titanic.isnull().any()
Unnamed: 0 False PassengerId False class False Pclass False Name False Sex False Age False SibSp False Parch False Ticket False Fare False Cabin False Embarked False dtype: bool
from sklearn.preprocessing import MultiLabelBinarizer
# Encode the Cabin column as 0/1 indicator columns. Each value is stringified
# and wrapped in a one-element set so the binarizer sees exactly one label per row.
mlb = MultiLabelBinarizer()
CabinTrans = mlb.fit_transform([{str(val)} for val in titanic['Cabin'].values])
# Display the resulting indicator matrix.
CabinTrans
array([[1, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [1, 0, 0, ..., 0, 0, 0], ..., [1, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [1, 0, 0, ..., 0, 0, 0]])
# Drop the target ('class') and the free-text columns; Cabin is re-attached
# below in its encoded form.
titanic_new = titanic.drop(
    ['Name', 'Ticket', 'Cabin', 'class'],
    axis=1,
)

# Sanity check: the encoder must have produced one column per distinct Cabin value.
assert len(mlb.classes_) == len(titanic['Cabin'].unique()), "Not Equal"

# Append the Cabin indicator columns to the right of the remaining features.
titanic_new = np.concatenate((titanic_new.values, CabinTrans), axis=1)

# Verify that the combined matrix contains no NaN.
np.isnan(titanic_new).any()
False
# Number of entries in the first row, i.e. the width of the feature matrix.
titanic_new[0].size
157
# Show the first row of the feature matrix (kept 2-D by slicing).
titanic_new[:1]
array([[ 0. , 1. , 3. , 0. , 22. , 1. , 0. , 7.25, 0. , 1. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])
# Target vector: the renamed 'class' column as a NumPy array.
titanic_class = titanic['class'].values
# Draw one stratified 75/25 split of row indices. With the
# sklearn.cross_validation API imported at the top of the file, the splitter is
# built from the labels and iterated directly, so next(iter(...)) takes the
# single (train, validation) pair requested by n_iter=1.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs — confirm whether reproducibility matters here.
training_indices, validation_indices = next(iter(StratifiedShuffleSplit(titanic_class, n_iter=1,
train_size=0.75, test_size=0.25)))
# Sizes of the two index arrays.
training_indices.size, validation_indices.size
(668, 223)
# Fit TPOT (generations=5, verbosity=2) on the training rows only.
# NOTE(review): in the recorded run this fit raised IndexError inside DEAP's
# gp.generate ("tried to add a primitive of type ... but there is none
# available"); possibly a TPOT/DEAP version mismatch — confirm installed versions.
tpot = TPOT(generations=5, verbosity=2)
tpot.fit(titanic_new[training_indices], titanic_class[training_indices])
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-25-32026d4a6747> in <module>() 1 tpot = TPOT(generations=5, verbosity=2) ----> 2 tpot.fit(titanic_new[training_indices], titanic_class[training_indices]) /Users/chengjun/anaconda/lib/python2.7/site-packages/tpot/tpot.pyc in fit(self, features, classes) 229 self._toolbox.register('evaluate', self._evaluate_individual, training_testing_data=training_testing_data) 230 --> 231 pop = self._toolbox.population(n=self.population_size) 232 233 def pareto_eq(ind1, ind2): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/tools/init.pyc in initRepeat(container, func, n) 21 See the :ref:`list-of-floats` and :ref:`population` tutorials for more examples. 22 """ ---> 23 return container(func() for _ in xrange(n)) 24 25 def initIterate(container, generator): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/tools/init.pyc in <genexpr>((_,)) 21 See the :ref:`list-of-floats` and :ref:`population` tutorials for more examples. 22 """ ---> 23 return container(func() for _ in xrange(n)) 24 25 def initIterate(container, generator): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/tools/init.pyc in initIterate(container, generator) 47 more examples. 
48 """ ---> 49 return container(generator()) 50 51 def initCycle(container, seq_func, n=1): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/gp.pyc in genHalfAndHalf(pset, min_, max_, type_) 536 """ 537 method = random.choice((genGrow, genFull)) --> 538 return method(pset, min_, max_, type_) 539 540 def genRamped(pset, min_, max_, type_=__type__): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/gp.pyc in genFull(pset, min_, max_, type_) 502 """Expression generation stops when the depth is equal to height.""" 503 return depth == height --> 504 return generate(pset, min_, max_, condition, type_) 505 506 def genGrow(pset, min_, max_, type_=__type__): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/gp.pyc in generate(pset, min_, max_, condition, type_) 581 else: 582 try: --> 583 prim = random.choice(pset.primitives[type_]) 584 except IndexError: 585 _, _, traceback = sys.exc_info() /Users/chengjun/anaconda/lib/python2.7/random.pyc in choice(self, seq) 273 def choice(self, seq): 274 """Choose a random element from a non-empty sequence.""" --> 275 return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty 276 277 def shuffle(self, x, random=None): IndexError: The gp.generate function tried to add a primitive of type '<type 'object'>', but there is none available.
from tpot import TPOT
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
# Second example: load the scikit-learn digits dataset and split it 75/25 into
# train and test sets.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
train_size=0.75, test_size=0.25)
# Fit TPOT for 5 generations, print its score on the held-out split, then
# export the result to a Python file.
# NOTE(review): the recorded run failed at fit() with the same IndexError as the
# Titanic example, so the score and export lines never executed.
# NOTE(review): the export filename says 'mnist' although the data comes from
# load_digits — confirm the intended name.
tpot = TPOT(generations=5)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_mnist_pipeline.py')
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-28-3c86222f998d> in <module>() 8 9 tpot = TPOT(generations=5) ---> 10 tpot.fit(X_train, y_train) 11 print(tpot.score(X_test, y_test)) 12 tpot.export('tpot_mnist_pipeline.py') /Users/chengjun/anaconda/lib/python2.7/site-packages/tpot/tpot.pyc in fit(self, features, classes) 229 self._toolbox.register('evaluate', self._evaluate_individual, training_testing_data=training_testing_data) 230 --> 231 pop = self._toolbox.population(n=self.population_size) 232 233 def pareto_eq(ind1, ind2): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/tools/init.pyc in initRepeat(container, func, n) 21 See the :ref:`list-of-floats` and :ref:`population` tutorials for more examples. 22 """ ---> 23 return container(func() for _ in xrange(n)) 24 25 def initIterate(container, generator): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/tools/init.pyc in <genexpr>((_,)) 21 See the :ref:`list-of-floats` and :ref:`population` tutorials for more examples. 22 """ ---> 23 return container(func() for _ in xrange(n)) 24 25 def initIterate(container, generator): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/tools/init.pyc in initIterate(container, generator) 47 more examples. 
48 """ ---> 49 return container(generator()) 50 51 def initCycle(container, seq_func, n=1): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/gp.pyc in genHalfAndHalf(pset, min_, max_, type_) 536 """ 537 method = random.choice((genGrow, genFull)) --> 538 return method(pset, min_, max_, type_) 539 540 def genRamped(pset, min_, max_, type_=__type__): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/gp.pyc in genFull(pset, min_, max_, type_) 502 """Expression generation stops when the depth is equal to height.""" 503 return depth == height --> 504 return generate(pset, min_, max_, condition, type_) 505 506 def genGrow(pset, min_, max_, type_=__type__): /Users/chengjun/anaconda/lib/python2.7/site-packages/deap/gp.pyc in generate(pset, min_, max_, condition, type_) 581 else: 582 try: --> 583 prim = random.choice(pset.primitives[type_]) 584 except IndexError: 585 _, _, traceback = sys.exc_info() /Users/chengjun/anaconda/lib/python2.7/random.pyc in choice(self, seq) 273 def choice(self, seq): 274 """Choose a random element from a non-empty sequence.""" --> 275 return seq[int(self.random() * len(seq))] # raises IndexError if seq is empty 276 277 def shuffle(self, x, random=None): IndexError: The gp.generate function tried to add a primitive of type '<type 'object'>', but there is none available.