import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn as sl
/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead. from pandas.core import datetools
# Assemble a small example dataset: two predictor columns (x0, x1)
# and a response column (y), then wrap it in a DataFrame.
data = {
    "x0": [1, 2, 3, 4, 5],
    "x1": [0.01, -0.01, 0.25, -4.1, 0.0],
    "y": [-1.5, 0.0, 3.6, 1.3, -2.0],
}
df = pd.DataFrame.from_dict(data)
# Display the frame (REPL echo).
df
x0 | x1 | y | |
---|---|---|---|
0 | 1 | 0.01 | -1.5 |
1 | 2 | -0.01 | 0.0 |
2 | 3 | 0.25 | 3.6 |
3 | 4 | -4.10 | 1.3 |
4 | 5 | 0.00 | -2.0 |
# Extract the underlying data as a 2-D NumPy array (one row per record).
df.values
array([[ 1. , 0.01, -1.5 ], [ 2. , -0.01, 0. ], [ 3. , 0.25, 3.6 ], [ 4. , -4.1 , 1.3 ], [ 5. , 0. , -2. ]])
# Inspect the column labels (a pandas Index object).
df.columns
Index(['x0', 'x1', 'y'], dtype='object')
# Relabel the columns in place; the list must match the column count,
# and order aligns positionally with the existing x0, x1, y columns.
df.columns = ["one", "two", "three"]
# Display the relabeled frame (REPL echo).
df
one | two | three | |
---|---|---|---|
0 | 1 | 0.01 | -1.5 |
1 | 2 | -0.01 | 0.0 |
2 | 3 | 0.25 | 3.6 |
3 | 4 | -4.10 | 1.3 |
4 | 5 | 0.00 | -2.0 |
import patsy
The patsy.dmatrices function takes a formula string together with a dataset and produces the design matrices for a linear model:
# Build the response (y) and predictor (X) design matrices from the
# formula "y ~ x0 + x1".
# NOTE(review): df's columns were renamed to one/two/three above, so the
# formula no longer matches df and dmatrices(data=df) would raise a
# PatsyError here. The original `data` dict still has the x0/x1/y keys,
# and patsy accepts any dict-like dataset, so use it instead.
y, X = patsy.dmatrices("y ~ x0 + x1", data=data)
# The response design matrix (shape (5, 1) per the echoed output below).
y
DesignMatrix with shape (5, 1) y -1.5 0.0 3.6 1.3 -2.0 Terms: 'y' (column 0)
# The predictor design matrix; patsy prepends an Intercept column of 1s.
X
DesignMatrix with shape (5, 3) Intercept x0 x1 1 1 0.01 1 2 -0.01 1 3 0.25 1 4 -4.10 1 5 0.00 Terms: 'Intercept' (column 0) 'x0' (column 1) 'x1' (column 2)
# Fit y ~ x0 + x1 by ordinary least squares on the patsy design
# matrices and print the full regression table.
ols = sm.OLS(y, X)
results = ols.fit()
print(results.summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.042 Model: OLS Adj. R-squared: -0.915 Method: Least Squares F-statistic: 0.04431 Date: Mon, 30 Oct 2017 Prob (F-statistic): 0.958 Time: 03:13:21 Log-Likelihood: -10.515 No. Observations: 5 AIC: 27.03 Df Residuals: 2 BIC: 25.86 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 0.3129 3.313 0.094 0.933 -13.940 14.566 x0 -0.0791 1.057 -0.075 0.947 -4.628 4.470 x1 -0.2655 0.896 -0.296 0.795 -4.122 3.592 ============================================================================== Omnibus: nan Durbin-Watson: 1.653 Prob(Omnibus): nan Jarque-Bera (JB): 0.702 Skew: 0.875 Prob(JB): 0.704 Kurtosis: 2.447 Cond. No. 8.84 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
/home/ubuntu/anaconda3/lib/python3.6/site-packages/statsmodels/stats/stattools.py:72: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given. "samples were given." % int(n), ValueWarning)
from sklearn.linear_model import LogisticRegression
# Load the Titanic training set from disk and preview the first rows.
train_path = "datasets/titanic/train.csv"
train = pd.read_csv(train_path)
# REPL echo of the first five rows.
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Prepare the Titanic features and fit a logistic-regression model of
# survival on passenger class, sex, and age.
impute_value = train['Age'].median()
# Fill missing ages with the median age.
train['Age'] = train['Age'].fillna(value=impute_value)
# Encode sex as a 0/1 indicator column (1 = female).
train['IsFemale'] = train['Sex'].eq('female').astype(int)

predictors = ['Pclass', 'IsFemale', 'Age']
X_train = train.loc[:, predictors].values
y_train = train['Survived'].values

model = LogisticRegression()
# Estimator.fit returns the estimator itself, so `fit` is the model.
fit = model.fit(X_train, y_train)
print(fit)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)