import pandas as pd import matplotlib.pyplot as plt df = pd.read_csv('/Users/danielforsyth/Desktop/salaries.csv') df.head() df = df[['Player','Cap Hit']] df.head() per = pd.read_csv('/Users/danielforsyth/Desktop/per.csv') per.head() per['MPG'] = per['MP'] / per['G'] per = per[['Player','PER','MPG']] per.head() final = pd.merge(df, per, on='Player', how='outer') final.head() final = final[final.MPG >6.09] final.dropna() final.head() pd.options.display.mpl_style = 'default' from matplotlib import rcParams rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 plt.scatter(final['PER'],final['Cap Hit']) `smaller_frame=final[['Cap Hit', 'PER', 'MPG']] from pandas.tools.plotting import scatter_matrix axeslist=scatter_matrix(smaller_frame, alpha=0.8, figsize=(12, 12), diagonal="kde") for ax in axeslist.flatten(): ax.grid(False) final.corr() final.dtypes final = final.dropna() cap = final['Cap Hit'][:, np.newaxis] per = final['PER'][:, np.newaxis] from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(per, cap) clf1 = LinearRegression() clf1.fit(X_train, y_train) predicted_train = clf1.predict(X_train) predicted_test = clf1.predict(X_test) trains=X_train.reshape(1,-1).flatten() tests=X_test.reshape(1,-1).flatten() print clf1.coef_, clf1.intercept_ plt.scatter(per,cap,c='r') plt.plot(trains, predicted_train, c='b', alpha=0.5) lr = LinearRegression() lr.fit(per,cap) b_0 = lr.intercept_ coeff = lr.coef_ pred = lr.predict(33.68) pred ncaa = pd.read_csv('/Users/danielforsyth/Desktop/ncaa.csv') ncaa = ncaa[ncaa.MPG > 6.09] ncaa.head() ncaa = ncaa[['PLAYER','PER']] ncaa.head() ncaa_per = ncaa['PER'][:, np.newaxis] ncaa_player = ncaa['PLAYER'][:, np.newaxis] predictions = [] for i in ncaa_per: pred = lr.predict(i) pred = float(pred) pred = format(pred, '.2f') predictions.append(pred) ncaa['Predicted Salary'] = predictions ncaa.head(15)