In [1]:
import pandas as pd
import numpy as np
import skflow as sf
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
In [2]:
np.random.seed(42)
In [3]:
data = pd.read_csv('tf_examples/data/titanic_train.csv')
data.head()
Out[3]:
   PassengerId  Survived  Pclass  Name                                                Sex     Age  SibSp  Parch  Ticket            Fare     Cabin  Embarked
0            1         0       3  Braund, Mr. Owen Harris                             male     22      1      0  A/5 21171          7.2500  NaN    S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1      0  PC 17599          71.2833  C85    C
2            3         1       3  Heikkinen, Miss. Laina                              female   26      0      0  STON/O2. 3101282   7.9250  NaN    S
3            4         1       1  Futrelle, Mrs. Jacques Heath (Lily May Peel)       female   35      1      0  113803            53.1000  C123   S
4            5         0       3  Allen, Mr. William Henry                            male     35      0      0  373450             8.0500  NaN    S
In [4]:
predictors = ['Age', 'SibSp', 'Fare', 'Parch', 'Pclass', 'Sex']
In [5]:
# Encode Sex numerically and impute missing values with column means,
# then split into train and test sets.
features = data[predictors].replace({'Sex': {'male': 0, 'female': 1}}).fillna(data.mean())
X_train, X_test, y_train, y_test = train_test_split(features, data.Survived)
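
Age is the main incomplete predictor here (assuming this CSV matches the standard Kaggle Titanic training set), which is why the cell above fills NaNs with column means. A quick check shows what needs imputing:

data[predictors].isnull().sum()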
In [6]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
Out[6]:
0.79372197309417036
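
As a sanity check on the baseline, the fitted coefficients show which predictors drive the linear model; a quick sketch (the exact values depend on the random split):

# Pair each predictor with its learned logistic-regression weight.
list(zip(predictors, lr.coef_[0]))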
In [7]:
classifier = sf.TensorFlowLinearClassifier(n_classes=2, 
                                           batch_size=128, 
                                           steps=500, 
                                           learning_rate=0.05)
In [8]:
classifier.fit(X_train, y_train)
Step #1, avg. loss: 5.86508
Step #51, avg. loss: 2.95665
Step #101, avg. loss: 2.78301
Step #151, avg. loss: 2.82941
Step #201, avg. loss: 2.80895
Step #251, avg. loss: 2.85910
Step #301, avg. loss: 2.74752
Step #351, avg. loss: 2.73762
Step #401, avg. loss: 2.79703
Step #451, avg. loss: 2.81233
Out[8]:
TensorFlowLinearClassifier(batch_size=128, continue_training=False,
              early_stopping_rounds=None,
              keep_checkpoint_every_n_hours=10000, learning_rate=0.05,
              max_to_keep=5, n_classes=2, optimizer='SGD', steps=500,
              tf_master='', tf_random_seed=42, verbose=1)
In [9]:
classifier.score(X_test, y_test)
Out[9]:
0.60089686098654704
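
The TensorFlow linear classifier lands well below the scikit-learn baseline (0.60 vs. 0.79). One plausible culprit is that plain SGD is sensitive to unscaled features, and Fare spans a far wider range than the other columns. A sketch worth trying, standardizing the inputs first (any improvement is an assumption, not a measured result):

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

classifier = sf.TensorFlowLinearClassifier(n_classes=2, batch_size=128,
                                           steps=500, learning_rate=0.05)
classifier.fit(X_train_scaled, y_train)
classifier.score(X_test_scaled, y_test)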
In [10]:
classifier = sf.TensorFlowDNNClassifier(
     hidden_units=[10, 20, 20, 10], 
     n_classes=2, 
     batch_size=128, 
     steps=20000, 
     learning_rate=0.05)
In [11]:
classifier.fit(X_train, y_train)
Step #1, avg. loss: 5.08049
Step #2001, avg. loss: 0.58594
Step #4001, avg. loss: 0.51789
Step #6001, avg. loss: 0.47891
Step #8001, avg. loss: 0.45563
Step #10001, avg. loss: 0.43729
Step #12001, avg. loss: 0.41228
Step #14001, avg. loss: 0.39740
Step #16001, avg. loss: 0.39062
Step #18001, avg. loss: 0.38242
Out[11]:
TensorFlowDNNClassifier(batch_size=128, continue_training=False,
            early_stopping_rounds=None, hidden_units=[10, 20, 20, 10],
            keep_checkpoint_every_n_hours=10000, learning_rate=0.05,
            max_to_keep=5, n_classes=2, optimizer='SGD', steps=20000,
            tf_master='', tf_random_seed=42, verbose=1)
In [12]:
classifier.score(X_test, y_test)
Out[12]:
0.76681614349775784
In [13]:
from tensorflow import tanh

def dnn_tanh(X, y):
    # Three fully connected layers (10, 20, 10 units) with tanh activations,
    # topped with a logistic-regression output layer.
    layers = sf.ops.dnn(X, [10, 20, 10], tanh)
    return sf.models.logistic_regression(layers, y)

classifier = sf.TensorFlowEstimator(
    model_fn=dnn_tanh, 
    n_classes=2,
    batch_size=128,
    steps=500,
    learning_rate=0.05)
In [14]:
classifier.fit(X_train, y_train)
Step #1, avg. loss: 0.67051
Step #51, avg. loss: 0.62905
Step #101, avg. loss: 0.60878
Step #151, avg. loss: 0.60869
Step #201, avg. loss: 0.59843
Step #251, avg. loss: 0.61128
Step #301, avg. loss: 0.59994
Step #351, avg. loss: 0.59463
Step #401, avg. loss: 0.59706
Step #451, avg. loss: 0.59240
Out[14]:
TensorFlowEstimator(batch_size=128, continue_training=False,
          early_stopping_rounds=None, keep_checkpoint_every_n_hours=10000,
          learning_rate=0.05, max_to_keep=5,
          model_fn=<function dnn_tanh at 0x10bbc22f0>, n_classes=2,
          num_cores=4, optimizer='SGD', steps=500, tf_master='',
          tf_random_seed=42, verbose=1)
In [15]:
score = accuracy_score(y_test, classifier.predict(X_test))
score
Out[15]:
0.68161434977578472
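
Once a custom model trains acceptably it can be persisted; skflow estimators expose save() and a TensorFlowEstimator.restore() class method for this, if memory serves. A sketch (the directory below is purely illustrative):

# Assumption: skflow's save/restore API; the path is hypothetical.
classifier.save('models/titanic_dnn_tanh')
restored = sf.TensorFlowEstimator.restore('models/titanic_dnn_tanh')
accuracy_score(y_test, restored.predict(X_test))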

Digit recognition

In [16]:
from sklearn import datasets
digits = datasets.load_digits()
In [17]:
X = digits.images
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.2, random_state=42)
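
Unlike the Titanic features, the inputs here stay two-dimensional: digits.images is an array of 8x8 grayscale images, so each split keeps that shape. A quick check:

# 1,797 digits split 80/20: roughly (1437, 8, 8) and (360, 8, 8).
X_train.shape, X_test.shape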

This function creates a 2-dimensional convolutional layer followed by a global max pool over the spatial dimensions.

In [18]:
def conv_model(X, y):
    # Add a channel dimension: (batch, 8, 8) -> (batch, 8, 8, 1).
    X = tf.expand_dims(X, 3)
    # One 3x3 convolution with 12 filters, then a global max over the
    # spatial axes (1 and 2), leaving a single value per filter.
    features = tf.reduce_max(sf.ops.conv2d(X, 12, [3, 3]), [1, 2])
    # Flatten to (batch, 12) for the logistic-regression head.
    features = tf.reshape(features, [-1, 12])
    return sf.models.logistic_regression(features, y)
In [19]:
classifier = sf.TensorFlowEstimator(model_fn=conv_model, n_classes=10,
                                        steps=5000, learning_rate=0.05,
                                        batch_size=128)
classifier.fit(X_train, y_train)
Step #1, avg. loss: 13.47284
Step #501, avg. loss: 1.53449
Step #1001, avg. loss: 0.74431
Step #1501, avg. loss: 0.70765
Step #2001, avg. loss: 0.66011
Step #2501, avg. loss: 0.66769
Step #3001, avg. loss: 0.64826
Step #3501, avg. loss: 0.62919
Step #4001, avg. loss: 0.61455
Step #4501, avg. loss: 0.58433
Out[19]:
TensorFlowEstimator(batch_size=128, continue_training=False,
          early_stopping_rounds=None, keep_checkpoint_every_n_hours=10000,
          learning_rate=0.05, max_to_keep=5,
          model_fn=<function conv_model at 0x10c9720d0>, n_classes=10,
          num_cores=4, optimizer='SGD', steps=5000, tf_master='',
          tf_random_seed=42, verbose=1)
In [20]:
score = accuracy_score(y_test, classifier.predict(X_test))
score
Out[20]:
0.72499999999999998
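
A score of 0.725 is modest for this dataset, and the single global max per filter discards all spatial information. A hypothetical variant keeps the full feature map instead; this is a sketch only, assuming sf.ops.conv2d defaults to 'SAME' padding so the 8x8 spatial size is preserved, and it is not benchmarked here:

def conv_model_v2(X, y):
    X = tf.expand_dims(X, 3)
    # Two stacked 3x3 convolutions; shapes below assume 'SAME' padding.
    conv1 = sf.ops.conv2d(X, 12, [3, 3])
    conv2 = sf.ops.conv2d(conv1, 24, [3, 3])
    # Flatten the whole 8x8x24 feature map rather than global-max pooling.
    features = tf.reshape(conv2, [-1, 8 * 8 * 24])
    return sf.models.logistic_regression(features, y)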