import pandas as pd
import numpy as np
import skflow as sf
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
np.random.seed(42)
data = pd.read_csv('tf_examples/data/titanic_train.csv')
data.head()
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35 | 0 | 0 | 373450 | 8.0500 | NaN | S |
predictors = ['Age', 'SibSp', 'Fare', 'Parch', 'Pclass', 'Sex']
# Encode Sex as 0/1 and impute missing values (e.g. Age) with column means.
X = data[predictors].replace({'Sex': {'male': 0, 'female': 1}}).fillna(data.mean())
X_train, X_test, y_train, y_test = train_test_split(X, data.Survived)
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
0.79372197309417036
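As a quick sanity check on this baseline, the fitted coefficients show the direction each predictor pushes the survival log-odds (a minimal sketch; the exact values depend on the random split):

# Pair each predictor with its fitted log-odds coefficient;
# positive values increase the predicted probability of survival.
dict(zip(predictors, lr.coef_[0]))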
classifier = sf.TensorFlowLinearClassifier(n_classes=2,
batch_size=128,
steps=500,
learning_rate=0.05)
classifier.fit(X_train, y_train)
Step #1, avg. loss: 5.86508
Step #51, avg. loss: 2.95665
Step #101, avg. loss: 2.78301
Step #151, avg. loss: 2.82941
Step #201, avg. loss: 2.80895
Step #251, avg. loss: 2.85910
Step #301, avg. loss: 2.74752
Step #351, avg. loss: 2.73762
Step #401, avg. loss: 2.79703
Step #451, avg. loss: 2.81233
TensorFlowLinearClassifier(batch_size=128, continue_training=False, early_stopping_rounds=None, keep_checkpoint_every_n_hours=10000, learning_rate=0.05, max_to_keep=5, n_classes=2, optimizer='SGD', steps=500, tf_master='', tf_random_seed=42, verbose=1)
classifier.score(X_test, y_test)
0.60089686098654704
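The SGD-trained linear model trails the scikit-learn baseline by a wide margin. One plausible contributor is feature scale: Fare spans a much larger range than the other predictors, which can destabilize SGD updates. A minimal sketch of standardizing the inputs first with scikit-learn's StandardScaler (hyperparameters kept as above; the improvement is not guaranteed):

from sklearn.preprocessing import StandardScaler

# Standardize to zero mean and unit variance using training statistics only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

scaled_classifier = sf.TensorFlowLinearClassifier(n_classes=2, batch_size=128,
                                                  steps=500, learning_rate=0.05)
scaled_classifier.fit(X_train_scaled, y_train)
scaled_classifier.score(X_test_scaled, y_test)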
classifier = sf.TensorFlowDNNClassifier(
hidden_units=[10, 20, 20, 10],
n_classes=2,
batch_size=128,
steps=20000,
learning_rate=0.05)
classifier.fit(X_train, y_train)
Step #1, avg. loss: 5.08049
Step #2001, avg. loss: 0.58594
Step #4001, avg. loss: 0.51789
Step #6001, avg. loss: 0.47891
Step #8001, avg. loss: 0.45563
Step #10001, avg. loss: 0.43729
Step #12001, avg. loss: 0.41228
Step #14001, avg. loss: 0.39740
Step #16001, avg. loss: 0.39062
Step #18001, avg. loss: 0.38242
TensorFlowDNNClassifier(batch_size=128, continue_training=False, early_stopping_rounds=None, hidden_units=[10, 20, 20, 10], keep_checkpoint_every_n_hours=10000, learning_rate=0.05, max_to_keep=5, n_classes=2, optimizer='SGD', steps=20000, tf_master='', tf_random_seed=42, verbose=1)
classifier.score(X_test, y_test)
0.76681614349775784
from tensorflow import tanh

def dnn_tanh(X, y):
    # Three fully connected layers (10, 20, 10 units) with tanh activations.
    layers = sf.ops.dnn(X, [10, 20, 10], tanh)
    # Logistic regression output layer on top of the learned representation.
    return sf.models.logistic_regression(layers, y)
classifier = sf.TensorFlowEstimator(
model_fn=dnn_tanh,
n_classes=2,
batch_size=128,
steps=500,
learning_rate=0.05)
classifier.fit(X_train, y_train)
Step #1, avg. loss: 0.67051
Step #51, avg. loss: 0.62905
Step #101, avg. loss: 0.60878
Step #151, avg. loss: 0.60869
Step #201, avg. loss: 0.59843
Step #251, avg. loss: 0.61128
Step #301, avg. loss: 0.59994
Step #351, avg. loss: 0.59463
Step #401, avg. loss: 0.59706
Step #451, avg. loss: 0.59240
TensorFlowEstimator(batch_size=128, continue_training=False, early_stopping_rounds=None, keep_checkpoint_every_n_hours=10000, learning_rate=0.05, max_to_keep=5, model_fn=<function dnn_tanh at 0x10bbc22f0>, n_classes=2, num_cores=4, optimizer='SGD', steps=500, tf_master='', tf_random_seed=42, verbose=1)
score = accuracy_score(y_test, classifier.predict(X_test))
score
0.68161434977578472
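A trained custom estimator can also be written to disk and reloaded later. A sketch, assuming skflow's save/restore methods as documented in its README (the checkpoint directory here is hypothetical):

# Persist the graph definition and learned weights.
classifier.save('/tmp/skflow_titanic')  # hypothetical path

# Rebuild the estimator from the checkpoint and reuse it for prediction.
restored = sf.TensorFlowEstimator.restore('/tmp/skflow_titanic')
accuracy_score(y_test, restored.predict(X_test))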
from sklearn import datasets
digits = datasets.load_digits()
X = digits.images
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=42)
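Note that the inputs stay as 2-D images here rather than flat feature vectors: the convolutional model defined next relies on the spatial layout. A quick shape check (values assume the standard load_digits data and the 80/20 split above):

X_train.shape  # (1437, 8, 8): 8x8 grayscale images, no channel axis yet
y_train.shape  # (1437,)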
This function builds a model with a single 2-dimensional convolutional layer whose activations are max-pooled over the whole image (global max pooling), then fed into a logistic regression output layer.
def conv_model(X, y):
    # Add a channel axis: (batch, 8, 8) -> (batch, 8, 8, 1).
    X = tf.expand_dims(X, 3)
    # Apply 12 3x3 convolutional filters, then take the max over the
    # spatial dimensions (height and width), i.e. global max pooling.
    features = tf.reduce_max(sf.ops.conv2d(X, 12, [3, 3]), [1, 2])
    # Flatten to one 12-dimensional feature vector per image.
    features = tf.reshape(features, [-1, 12])
    return sf.models.logistic_regression(features, y)
classifier = sf.TensorFlowEstimator(model_fn=conv_model, n_classes=10,
steps=5000, learning_rate=0.05,
batch_size=128)
classifier.fit(X_train, y_train)
Step #1, avg. loss: 13.47284
Step #501, avg. loss: 1.53449
Step #1001, avg. loss: 0.74431
Step #1501, avg. loss: 0.70765
Step #2001, avg. loss: 0.66011
Step #2501, avg. loss: 0.66769
Step #3001, avg. loss: 0.64826
Step #3501, avg. loss: 0.62919
Step #4001, avg. loss: 0.61455
Step #4501, avg. loss: 0.58433
TensorFlowEstimator(batch_size=128, continue_training=False, early_stopping_rounds=None, keep_checkpoint_every_n_hours=10000, learning_rate=0.05, max_to_keep=5, model_fn=<function conv_model at 0x10c9720d0>, n_classes=10, num_cores=4, optimizer='SGD', steps=5000, tf_master='', tf_random_seed=42, verbose=1)
score = accuracy_score(y_test, classifier.predict(X_test))
score
0.72499999999999998
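A single accuracy number hides which digits the 12 pooled features confuse. A minimal sketch of a per-class breakdown with scikit-learn's confusion_matrix (rows are true labels, columns are predictions):

from sklearn.metrics import confusion_matrix

# Off-diagonal entries count misclassifications between digit classes.
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)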