#!/usr/bin/env python
# coding: utf-8
"""Modeling in Python.

A short walk-through exported from a notebook:

1. build a small DataFrame and inspect it,
2. turn it into design matrices with a patsy formula,
3. fit an ordinary-least-squares model with statsmodels,
4. fit a logistic regression on the Titanic training data with scikit-learn.

Bare expressions such as ``df`` or ``train.head()`` are kept from the
notebook; they only display something in an interactive session and are
no-ops when the file is run as a script.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import sklearn as sl
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

# ## Example data

data = {
    "x0": [1, 2, 3, 4, 5],
    "x1": [0.01, -0.01, 0.25, -4.1, 0.0],
    "y": [-1.5, 0.0, 3.6, 1.3, -2.0],
}
df = pd.DataFrame(data)
df

df.values

df.columns

# Demonstrate relabelling the columns on a copy.  Assigning to `df.columns`
# directly would rename the columns in place, and the formula below refers
# to them by their original names ("x0", "x1", "y"), so it would fail.
renamed = df.copy()
renamed.columns = ["one", "two", "three"]
renamed

# ## patsy

# The `patsy.dmatrices` function takes a formula string along with a dataset
# and produces design matrices for a linear model:
y, X = patsy.dmatrices("y ~ x0 + x1", data=df)

y

X

# ## statsmodels

model = sm.OLS(y, X)
fit = model.fit()
print(fit.summary())

# ## scikit-learn

train = pd.read_csv("datasets/titanic/train.csv")
train.head()

# Fill missing ages with the median age of the training set.
impute_value = train["Age"].median()
train["Age"] = train["Age"].fillna(impute_value)

# Encode sex as a 0/1 indicator column (1 where Sex == 'female').
train["IsFemale"] = (train["Sex"] == "female").astype(int)

predictors = ["Pclass", "IsFemale", "Age"]
X_train = train[predictors].values
y_train = train["Survived"].values

# NOTE: `model` and `fit` are rebound here; from this point on they refer to
# the logistic regression, not the OLS model fitted above.
model = LogisticRegression()
fit = model.fit(X_train, y_train)
print(fit)