# imports import pandas as pd import matplotlib.pyplot as plt # this allows plots to appear directly in the notebook %matplotlib inline # read data into a DataFrame data = pd.read_csv('https://raw.githubusercontent.com/justmarkham/scikit-learn-videos/master/data/Advertising.csv', index_col=0) data.head() # print the shape of the DataFrame data.shape # visualize the relationship between the features and the response using scatterplots fig, axs = plt.subplots(1, 3, sharey=True) data.plot(kind='scatter', x='TV', y='Sales', ax=axs[0], figsize=(16, 8)) data.plot(kind='scatter', x='Radio', y='Sales', ax=axs[1]) data.plot(kind='scatter', x='Newspaper', y='Sales', ax=axs[2]) # this is the standard import if you're using "formula notation" (similar to R) import statsmodels.formula.api as smf # create a fitted model in one line lm = smf.ols(formula='Sales ~ TV', data=data).fit() # print the coefficients lm.params # manually calculate the prediction 7.032594 + 0.047537*50 # you have to create a DataFrame since the Statsmodels formula interface expects it X_new = pd.DataFrame({'TV': [50]}) X_new.head() # use the model to make predictions on a new value lm.predict(X_new) # create a DataFrame with the minimum and maximum values of TV X_new = pd.DataFrame({'TV': [data.TV.min(), data.TV.max()]}) X_new.head() # make predictions for those x values and store them preds = lm.predict(X_new) preds # first, plot the observed data data.plot(kind='scatter', x='TV', y='Sales') # then, plot the least squares line plt.plot(X_new, preds, c='red', linewidth=2) # print the confidence intervals for the model coefficients lm.conf_int() # print the p-values for the model coefficients lm.pvalues # print the R-squared value for the model lm.rsquared # create a fitted model with all three features lm = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=data).fit() # print the coefficients lm.params # print a summary of the fitted model lm.summary() # only include TV and Radio in the model lm = smf.ols(formula='Sales ~ TV + Radio', data=data).fit() lm.rsquared # add Newspaper to the model (which we believe has no association with Sales) lm = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=data).fit() lm.rsquared # create X and y feature_cols = ['TV', 'Radio', 'Newspaper'] X = data[feature_cols] y = data.Sales # follow the usual sklearn pattern: import, instantiate, fit from sklearn.linear_model import LinearRegression lm = LinearRegression() lm.fit(X, y) # print intercept and coefficients print lm.intercept_ print lm.coef_ # pair the feature names with the coefficients zip(feature_cols, lm.coef_) # predict for a new observation lm.predict([100, 25, 25]) # calculate the R-squared lm.score(X, y) import numpy as np # set a seed for reproducibility np.random.seed(12345) # create a Series of booleans in which roughly half are True nums = np.random.rand(len(data)) mask_large = nums > 0.5 # initially set Size to small, then change roughly half to be large data['Size'] = 'small' data.loc[mask_large, 'Size'] = 'large' data.head() # create a new Series called IsLarge data['IsLarge'] = data.Size.map({'small':0, 'large':1}) data.head() # create X and y feature_cols = ['TV', 'Radio', 'Newspaper', 'IsLarge'] X = data[feature_cols] y = data.Sales # instantiate, fit lm = LinearRegression() lm.fit(X, y) # print coefficients zip(feature_cols, lm.coef_) # set a seed for reproducibility np.random.seed(123456) # assign roughly one third of observations to each group nums = np.random.rand(len(data)) mask_suburban = (nums > 0.33) & (nums < 0.66) mask_urban = nums > 0.66 data['Area'] = 'rural' data.loc[mask_suburban, 'Area'] = 'suburban' data.loc[mask_urban, 'Area'] = 'urban' data.head() # create three dummy variables using get_dummies, then exclude the first dummy column area_dummies = pd.get_dummies(data.Area, prefix='Area').iloc[:, 1:] # concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns) data = pd.concat([data, area_dummies], axis=1) data.head() # create X and y feature_cols = ['TV', 'Radio', 'Newspaper', 'IsLarge', 'Area_suburban', 'Area_urban'] X = data[feature_cols] y = data.Sales # instantiate, fit lm = LinearRegression() lm.fit(X, y) # print coefficients zip(feature_cols, lm.coef_)