import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn as sl
/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools
# Assemble a small example dataset: two predictor columns (x0, x1)
# and a response column (y), then wrap it in a DataFrame.
data = {
    "x0": [1, 2, 3, 4, 5],
    "x1": [0.01, -0.01, 0.25, -4.1, 0.0],
    "y": [-1.5, 0.0, 3.6, 1.3, -2.0],
}
df = pd.DataFrame.from_dict(data)
# Display the frame (REPL echo).
df
x0 | x1 | y | |
---|---|---|---|
0 | 1 | 0.01 | -1.5 |
1 | 2 | -0.01 | 0.0 |
2 | 3 | 0.25 | 3.6 |
3 | 4 | -4.10 | 1.3 |
4 | 5 | 0.00 | -2.0 |
# Extract the underlying data as a 2-D NumPy array (one row per record).
df.values
array([[ 1. , 0.01, -1.5 ], [ 2. , -0.01, 0. ], [ 3. , 0.25, 3.6 ], [ 4. , -4.1 , 1.3 ], [ 5. , 0. , -2. ]])
# Inspect the column labels (a pandas Index object).
df.columns
Index(['x0', 'x1', 'y'], dtype='object')
# Relabel the columns in place; the list must match the column count,
# and order aligns positionally with the existing x0, x1, y columns.
df.columns = ["one", "two", "three"]
# Display the relabeled frame (REPL echo).
df
one | two | three | |
---|---|---|---|
0 | 1 | 0.01 | -1.5 |
1 | 2 | -0.01 | 0.0 |
2 | 3 | 0.25 | 3.6 |
3 | 4 | -4.10 | 1.3 |
4 | 5 | 0.00 | -2.0 |
import patsy
The patsy.dmatrices function takes a formula string together with a dataset and produces the design matrices for a linear model:
# Build the response (y) and predictor (X) design matrices from the
# formula "y ~ x0 + x1".
# NOTE(review): df's columns were renamed to one/two/three above, so the
# formula no longer matches df and dmatrices(data=df) would raise a
# PatsyError here. The original `data` dict still has the x0/x1/y keys,
# and patsy accepts any dict-like dataset, so use it instead.
y, X = patsy.dmatrices("y ~ x0 + x1", data=data)
# The response design matrix (shape (5, 1) per the echoed output below).
y
DesignMatrix with shape (5, 1) y -1.5 0.0 3.6 1.3 -2.0 Terms: 'y' (column 0)
# The predictor design matrix; patsy prepends an Intercept column of 1s.
X
DesignMatrix with shape (5, 3) Intercept x0 x1 1 1 0.01 1 2 -0.01 1 3 0.25 1 4 -4.10 1 5 0.00 Terms: 'Intercept' (column 0) 'x0' (column 1) 'x1' (column 2)
# Fit y ~ x0 + x1 by ordinary least squares on the patsy design
# matrices and print the full regression table.
ols = sm.OLS(y, X)
results = ols.fit()
print(results.summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.042 Model: OLS Adj. R-squared: -0.915 Method: Least Squares F-statistic: 0.04431 Date: Mon, 30 Oct 2017 Prob (F-statistic): 0.958 Time: 03:13:21 Log-Likelihood: -10.515 No. Observations: 5 AIC: 27.03 Df Residuals: 2 BIC: 25.86 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 0.3129 3.313 0.094 0.933 -13.940 14.566 x0 -0.0791 1.057 -0.075 0.947 -4.628 4.470 x1 -0.2655 0.896 -0.296 0.795 -4.122 3.592 ============================================================================== Omnibus: nan Durbin-Watson: 1.653 Prob(Omnibus): nan Jarque-Bera (JB): 0.702 Skew: 0.875 Prob(JB): 0.704 Kurtosis: 2.447 Cond. No. 8.84 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/stats/stattools.py:72: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. "samples were given." % int(n), ValueWarning)
from sklearn.linear_model import LogisticRegression
# Load the Titanic training set from disk and preview the first rows.
train_path = "datasets/titanic/train.csv"
train = pd.read_csv(train_path)
# REPL echo of the first five rows.
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Prepare the Titanic features and fit a logistic-regression model of
# survival on passenger class, sex, and age.
impute_value = train['Age'].median()
# Fill missing ages with the median age.
train['Age'] = train['Age'].fillna(value=impute_value)
# Encode sex as a 0/1 indicator column (1 = female).
train['IsFemale'] = train['Sex'].eq('female').astype(int)

predictors = ['Pclass', 'IsFemale', 'Age']
X_train = train.loc[:, predictors].values
y_train = train['Survived'].values

model = LogisticRegression()
# Estimator.fit returns the estimator itself, so `fit` is the model.
fit = model.fit(X_train, y_train)
print(fit)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)