#import packages that will be used in the project.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf
#read in the raw data dowmloaded from website
#the data include over 7,000 colleges over the whole country from 2014-2015
data=pd.read_csv('data\MERGED2014_15_PP.csv', na_values = 'NULL')
data.info()
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (6,1169,1170,1171,1172,1173,1174,1175,1177,1178,1179,1180,1183,1184,1185,1186,1187,1188,1196,1199,1200,1209,1212,1213,1222,1223,1224,1225,1226,1227,1235,1236,1237,1238,1239,1240,1248,1251,1252,1253,1255,1257,1261,1264,1265,1266,1268,1270,1274,1275,1276,1277,1278,1279,1281,1287,1288,1289,1290,1291,1292,1294,1303,1304,1316,1317,1326,1327,1328,1329,1330,1331,1335,1339,1340,1341,1342,1343,1344,1348,1379,1380,1381,1382,1383,1384,1385,1386,1387,1388,1389,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1411,1426,1427,1475,1476,1479,1480,1483,1484,1487,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499,1500,1501,1502,1503,1517,1529,1530,1532,1537,1540,1541,1542,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599,1600,1601,1602,1606,1609,1610,1613,1614,1615,1708,1729) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7703 entries, 0 to 7702 Columns: 1899 entries, UNITID to OMENRUP8_PTNFT_POOLED_SUPP dtypes: float64(643), int64(11), object(1245) memory usage: 111.6+ MB
#keep the data that is useful
#ID('UNITID'), institution name, state, the highest level of award offered('ICLEVEL'), school type('CONTROL'), admission rate,
#sat scores, including reading math and writing('SATVRMID', 'SATMTMID', 'SATWRMID');
#composit ACT scores of 25% persenttile and 75 percentile('ACTCM25','ACTCM75')
#average annual cost of attendance (COSTT4_A), number of student involved in fall semaster(UGDS)
#gender ratio('UGDS_WOMEN', 'UGDS_MEN');
#race ratio('UGDS_WHITE','UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NRA')
#ratio of different levels of family income('INC_PCT_LO', 'INC_PCT_M1','INC_PCT_M2','INC_PCT_H1', 'INC_PCT_H2')
#completion rate('C100_4', 'C100_L4')
#mean/median income after first enroll in the school for 6 or 10 years
keep = ['UNITID', 'INSTNM', 'STABBR', 'ICLEVEL', 'CONTROL','ADM_RATE','SATVRMID', 'SATMTMID', 'SATWRMID', 'ACTCM25','ACTCM75',
'COSTT4_A','COSTT4_P','UGDS', 'UGDS_MEN', 'UGDS_WOMEN', 'UGDS_WHITE','UGDS_BLACK', 'UGDS_HISP',
'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NRA', 'INC_PCT_LO', 'INC_PCT_M1','INC_PCT_M2','INC_PCT_H1', 'INC_PCT_H2', 'C100_4',
'C100_L4', 'MN_EARN_WNE_P10', 'MN_EARN_WNE_P6','MD_EARN_WNE_P10', 'MD_EARN_WNE_P6']
data=data[keep]
#because the some data was suppressed, read them as na
data.to_csv('data\project_data.csv')
data = pd.read_csv('data/project_data.csv', na_values='PrivacySuppressed')
#get the data about low income families
low_earn = data[['INC_PCT_LO', 'MN_EARN_WNE_P10','MD_EARN_WNE_P10']].dropna()
low_earn.dropna(inplace=True)
low_earn.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 5957 entries, 0 to 7698 Data columns (total 3 columns): INC_PCT_LO 5957 non-null float64 MN_EARN_WNE_P10 5957 non-null float64 MD_EARN_WNE_P10 5957 non-null float64 dtypes: float64(3) memory usage: 186.2 KB
#plot the data of low income famliy
fig, ax = plt.subplots(figsize=(10,5))
ax.scatter(low_earn.INC_PCT_LO, low_earn.MN_EARN_WNE_P10, color='red', alpha=0.15, marker='o')
ax.scatter(low_earn.INC_PCT_LO, low_earn.MD_EARN_WNE_P10, color='blue', alpha=0.1, marker='o')
ax.set_title('After school earning on percentage of low income family(0-30,000$)' )
ax.set_ylabel('Mean earning after 10 years enrollment')
ax.set_xlabel('Percentage of low income family')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.savefig('data\low_income.png')
plt.show()
#get the data about high income family
hi_earn = data[['INC_PCT_H2', 'MN_EARN_WNE_P10']].dropna()
hi_earn.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4666 entries, 0 to 7698 Data columns (total 2 columns): INC_PCT_H2 4666 non-null float64 MN_EARN_WNE_P10 4666 non-null float64 dtypes: float64(2) memory usage: 109.4 KB
#plot the data and see how the relationship between them
fig, ax = plt.subplots(figsize=(10,5))
ax.scatter(hi_earn.INC_PCT_H2, hi_earn.MN_EARN_WNE_P10, color='pink', alpha=0.4, marker='^')
ax.set_title('After school earning on percentage of high income family(above $110,001)' )
ax.set_ylabel('Mean earning after 10 years enrollment')
ax.set_xlabel('Percentage of high income family')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.savefig('data\hi_income.png')
plt.show()
#plot low income family and high income family data,
#and estimate a regression with 95% confidence interval
fig, ax=plt.subplots(figsize=(12,7))
sns.regplot(x='INC_PCT_LO', y='MN_EARN_WNE_P10', data=low_earn, ax=ax,
scatter_kws = {'alpha':0.1, 'color':'black'}, color = 'blue',ci=95)
sns.regplot(x='INC_PCT_H2', y='MN_EARN_WNE_P10', data=hi_earn, ax=ax,
scatter_kws = {'alpha':0.1, 'color':'purple'}, color = 'red',ci=95)
sns.despine(ax=ax)
ax.set_title('After school earning on percentage of low income family(0-30,000$) vs. high income family(above $110,001)',
fontsize=16)
ax.set_ylabel('Mean earning after 10 years first enrollment',fontsize=14)
ax.set_xlabel('Percentage of low/high income family',fontsize=14)
ax.set_ylim(0,150000)
ax.annotate('%low_income family ', xy=(0.95, 20000), xytext=(0.9, 50000), arrowprops=dict(arrowstyle='->',facecolor='blue'),
fontsize=12)
ax.annotate('%high income family', xy=(0.8, 100000), xytext=(0.9, 130000), arrowprops=dict(arrowstyle='->',facecolor='blue'),
fontsize=12)
plt.savefig('data\low_hi_income.png')
plt.show()
#get the data on percentage of act scores of 25% and 75%
act_earn = data[['ACTCM25', 'ACTCM75','MN_EARN_WNE_P10']].dropna()
act_earn.dropna(inplace=True)
act_earn.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1239 entries, 0 to 7251 Data columns (total 3 columns): ACTCM25 1239 non-null float64 ACTCM75 1239 non-null float64 MN_EARN_WNE_P10 1239 non-null float64 dtypes: float64(3) memory usage: 38.7 KB
#plot the data
fig, ax=plt.subplots(figsize=(12,7))
sns.regplot(x='ACTCM25', y='MN_EARN_WNE_P10', data=act_earn, ax=ax, color = 'blue',ci=95)
sns.regplot(x='ACTCM75', y='MN_EARN_WNE_P10', data=act_earn, ax=ax, color = 'pink',ci=95)
ax.set_ylim(0, 150000)
ax.annotate('25th scores', xy=(10, 25000), xytext=(8, 50000), arrowprops=dict(arrowstyle='->',facecolor='blue'),
fontsize=12, color='blue')
ax.annotate('75th scores', xy=(17.5, 28000), xytext=(15, 15000), arrowprops=dict(arrowstyle='->',facecolor='blue'),
fontsize=12,color='pink')
sns.despine(ax=ax)
ax.set_title('After school earning on 25 percentile and 75 percentile of ACT scores ',fontsize=14)
ax.set_ylabel('After school earning', fontsize=12)
ax.set_xlabel('ACT scores',fontsize=12)
plt.savefig('data/score_earn.jpg')
plt.show()
#extract the data bouth admission rates, and drop NAs
ad_earn = data[['ADM_RATE','MN_EARN_WNE_P10']].dropna()
ad_earn.dropna(inplace=True)
ad_earn.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2008 entries, 0 to 7271 Data columns (total 2 columns): ADM_RATE 2008 non-null float64 MN_EARN_WNE_P10 2008 non-null float64 dtypes: float64(2) memory usage: 47.1 KB
#creat a jointplot the show the density of each variable and their joint distribution
h = sns.jointplot(x='ADM_RATE', y='MN_EARN_WNE_P10', kind='hex', data=ad_earn,
color = 'black')
h.set_axis_labels('admission rate of schools', 'earning after graduation')
#h.set_xlim(0,1)
plt.savefig('data/ad_earn.png')
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been " C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg. warnings.warn("The 'normed' kwarg is deprecated, and has been "
#see how the average after college earning conditional on states
state = pd.DataFrame(data.groupby(['STABBR'])[['MN_EARN_WNE_P10']].mean())
print(state.describe())
print(state['MN_EARN_WNE_P10'].rank(ascending=False))
#FM:Federated States of Micronesia
#PR:Puerto Rico
#MP:Northern Mariana Islands
#52 ID:Idaho
#51 AR:Arkansas
#WI 11
MN_EARN_WNE_P10 count 57.000000 mean 37506.958563 std 6049.836069 min 20400.000000 25% 35500.000000 50% 37314.545455 75% 40710.090090 max 55941.176471 STABBR AK 7.0 AL 36.0 AR 51.0 AS 54.0 AZ 33.0 CA 15.0 CO 24.0 CT 5.0 DC 1.0 DE 12.0 FL 47.0 FM 57.0 GA 32.0 GU 53.0 HI 22.0 IA 19.0 ID 52.0 IL 21.0 IN 37.0 KS 34.0 KY 49.0 LA 40.0 MA 2.0 MD 9.0 ME 20.0 MH NaN MI 46.0 MN 13.0 MO 23.0 MP 55.0 MS 50.0 MT 35.0 NC 41.0 ND 25.0 NE 16.0 NH 17.0 NJ 14.0 NM 38.0 NV 48.0 NY 3.0 OH 39.0 OK 44.0 OR 28.0 PA 8.0 PR 56.0 PW NaN RI 4.0 SC 45.0 SD 26.0 TN 42.0 TX 30.0 UT 31.0 VA 18.0 VI 43.0 VT 6.0 WA 10.0 WI 11.0 WV 29.0 WY 27.0 Name: MN_EARN_WNE_P10, dtype: float64
#{1:public, 2:privatenonprofit, 3: private for-profit}
data.groupby(['CONTROL'])[['MN_EARN_WNE_P10']].mean()
MN_EARN_WNE_P10 | |
---|---|
CONTROL | |
1 | 40594.814815 |
2 | 49414.719784 |
3 | 31203.827573 |
#further specify the variables for constructiong model later.
var=data[ ['ICLEVEL', 'ADM_RATE', 'ACTCM25','COSTT4_A','UGDS', 'UGDS_MEN', 'UGDS_WOMEN','ACTCM25',
'UGDS_WHITE','UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN',
'UGDS_NRA', 'INC_PCT_LO', 'INC_PCT_M1','INC_PCT_M2','INC_PCT_H1', 'INC_PCT_H2', 'C100_4',
'MN_EARN_WNE_P10' ]]
var.dropna(inplace=True)
var.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1186 entries, 0 to 7128 Data columns (total 21 columns): ICLEVEL 1186 non-null int64 ADM_RATE 1186 non-null float64 ACTCM25 1186 non-null float64 COSTT4_A 1186 non-null float64 UGDS 1186 non-null float64 UGDS_MEN 1186 non-null float64 UGDS_WOMEN 1186 non-null float64 ACTCM25 1186 non-null float64 UGDS_WHITE 1186 non-null float64 UGDS_BLACK 1186 non-null float64 UGDS_HISP 1186 non-null float64 UGDS_ASIAN 1186 non-null float64 UGDS_AIAN 1186 non-null float64 UGDS_NRA 1186 non-null float64 INC_PCT_LO 1186 non-null float64 INC_PCT_M1 1186 non-null float64 INC_PCT_M2 1186 non-null float64 INC_PCT_H1 1186 non-null float64 INC_PCT_H2 1186 non-null float64 C100_4 1186 non-null float64 MN_EARN_WNE_P10 1186 non-null float64 dtypes: float64(20), int64(1) memory usage: 203.8 KB
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy import sys
#TAKE OUT THE VARIABLES TAHT ARE ACTUALL USEGULL AND WITH ON COLINEARITY
#BY DROPPING ONE OF THE VARIABLE THAT WOULD ADD UP TO ONE WITH OHTERS, like UGDS_MEN, and UGSE_WOMEN
#and run a ols regression on these covariates with log(income)
res = smf.ols('np.log(MN_EARN_WNE_P10) ~ ADM_RATE +ACTCM25 + COSTT4_A+ UGDS_MEN+ UGDS_WHITE + UGDS_BLACK+ UGDS_HISP + UGDS_ASIAN+ UGDS_AIAN+ ACTCM25+ INC_PCT_LO+ INC_PCT_M1+ INC_PCT_H1+ C100_4', data=var).fit()
print(res.summary())
OLS Regression Results =================================================================================== Dep. Variable: np.log(MN_EARN_WNE_P10) R-squared: 0.703 Model: OLS Adj. R-squared: 0.700 Method: Least Squares F-statistic: 213.3 Date: Fri, 14 Dec 2018 Prob (F-statistic): 1.20e-297 Time: 15:09:18 Log-Likelihood: 702.87 No. Observations: 1186 AIC: -1378. Df Residuals: 1172 BIC: -1307. Df Model: 13 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 10.8759 0.113 96.439 0.000 10.655 11.097 ADM_RATE 0.0481 0.026 1.828 0.068 -0.004 0.100 ACTCM25[0] 0.0127 0.001 12.457 0.000 0.011 0.015 ACTCM25[1] 0.0127 0.001 12.457 0.000 0.011 0.015 COSTT4_A 1.262e-06 4.78e-07 2.637 0.008 3.23e-07 2.2e-06 UGDS_MEN 0.2352 0.035 6.804 0.000 0.167 0.303 UGDS_WHITE -0.1384 0.068 -2.030 0.043 -0.272 -0.005 UGDS_BLACK -0.1500 0.066 -2.258 0.024 -0.280 -0.020 UGDS_HISP 0.0896 0.075 1.201 0.230 -0.057 0.236 UGDS_ASIAN 1.0022 0.124 8.110 0.000 0.760 1.245 UGDS_AIAN -0.5614 0.243 -2.307 0.021 -1.039 -0.084 INC_PCT_LO -0.9325 0.088 -10.559 0.000 -1.106 -0.759 INC_PCT_M1 -0.6638 0.189 -3.521 0.000 -1.034 -0.294 INC_PCT_H1 -1.7972 0.208 -8.633 0.000 -2.206 -1.389 C100_4 -0.0490 0.039 -1.265 0.206 -0.125 0.027 ============================================================================== Omnibus: 87.004 Durbin-Watson: 1.718 Prob(Omnibus): 0.000 Jarque-Bera (JB): 330.840 Skew: 0.246 Prob(JB): 1.44e-72 Kurtosis: 5.540 Cond. No. 3.31e+19 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 1.38e-27. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
#and run a ols regression on these covariates with income for comparing
res = smf.ols('MN_EARN_WNE_P10 ~ ADM_RATE +ACTCM25 + COSTT4_A+ UGDS_MEN+ UGDS_WHITE + UGDS_BLACK+ UGDS_HISP + UGDS_ASIAN+ UGDS_AIAN+ ACTCM25+ INC_PCT_LO+ INC_PCT_M1+ INC_PCT_H1+ C100_4', data=var).fit()
print(res.summary())
OLS Regression Results ============================================================================== Dep. Variable: MN_EARN_WNE_P10 R-squared: 0.685 Model: OLS Adj. R-squared: 0.681 Method: Least Squares F-statistic: 195.7 Date: Fri, 14 Dec 2018 Prob (F-statistic): 1.43e-282 Time: 15:09:19 Log-Likelihood: -12370. No. Observations: 1186 AIC: 2.477e+04 Df Residuals: 1172 BIC: 2.484e+04 Df Model: 13 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 4.224e+04 6904.201 6.117 0.000 2.87e+04 5.58e+04 ADM_RATE -1605.1986 1612.020 -0.996 0.320 -4767.967 1557.569 ACTCM25[0] 771.2677 62.219 12.396 0.000 649.194 893.341 ACTCM25[1] 771.2677 62.219 12.396 0.000 649.194 893.341 COSTT4_A 0.1184 0.029 4.042 0.000 0.061 0.176 UGDS_MEN 1.692e+04 2116.797 7.994 0.000 1.28e+04 2.11e+04 UGDS_WHITE -2052.8408 4173.277 -0.492 0.623 -1.02e+04 6135.087 UGDS_BLACK -2797.6688 4067.548 -0.688 0.492 -1.08e+04 5182.819 UGDS_HISP 3343.3951 4568.683 0.732 0.464 -5620.315 1.23e+04 UGDS_ASIAN 6.62e+04 7565.723 8.750 0.000 5.14e+04 8.1e+04 UGDS_AIAN -2.115e+04 1.49e+04 -1.419 0.156 -5.04e+04 8083.025 INC_PCT_LO -4.327e+04 5406.603 -8.004 0.000 -5.39e+04 -3.27e+04 INC_PCT_M1 -2.168e+04 1.15e+04 -1.879 0.061 -4.43e+04 961.505 INC_PCT_H1 -1.049e+05 1.27e+04 -8.233 0.000 -1.3e+05 -7.99e+04 C100_4 -1458.4584 2372.720 -0.615 0.539 -6113.712 3196.795 ============================================================================== Omnibus: 499.855 Durbin-Watson: 1.736 Prob(Omnibus): 0.000 Jarque-Bera (JB): 5219.898 Skew: 1.655 Prob(JB): 0.00 Kurtosis: 12.730 Cond. No. 3.31e+19 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 1.38e-27. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
#TAKE OUT THE VARIABLES TAHT ARE ACTUALL USEGULL AND WITH ON COLINEARITY
#BY DROPPING ONE OF THE VARIABLE THAT WOULD ADD UP TO ONE WITH OHTERS, like UGDS_MEN, and UGSE_WOMEN
# Standardize the X vars so they have mean = 0 and std = 1
var_list = [ 'ADM_RATE','ACTCM25','COSTT4_A','UGDS_MEN',
'UGDS_WHITE','UGDS_BLACK' ,'UGDS_HISP','UGDS_ASIAN',
'UGDS_AIAN','INC_PCT_LO', 'INC_PCT_M1',
'INC_PCT_M2','INC_PCT_H1', 'INC_PCT_H2', 'C100_4' ]
X = StandardScaler().fit_transform(var[var_list])
#Run a Lasso regression through 10-fold cross validation to find ou the best alpha for the model
lasso = linear_model.LassoCV(cv=10).fit(X, var['MN_EARN_WNE_P10'])
print('The best alpha from the candidate alphas is {0}.'.format(lasso.alpha_))
The best alpha from the candidate alphas is 81.06249678321906.
#plot the alphas against mse
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(lasso.alphas_, lasso.mse_path_.mean(axis=1), marker='o', color='blue')
ax.set_title('' )
ax.set_ylabel('Average test MSE')
ax.set_xlabel('the value of α')
#ax.set_xlim(0, 2000)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.savefig('data\lasso.png')
plt.show()
#Because variable 'ACTCM25' will have 2 coefficient s after running regressions, so I call them 'ACTCM25_0' and 'ACTCM25_1'
#for better display of the coefficients.
var_list_1 = [ 'ADM_RATE','ACTCM25_0','ACTCM25_1','COSTT4_A','UGDS_MEN',
'UGDS_WHITE','UGDS_BLACK' ,'UGDS_HISP','UGDS_ASIAN',
'UGDS_AIAN','INC_PCT_LO', 'INC_PCT_M1',
'INC_PCT_M2','INC_PCT_H1', 'INC_PCT_H2', 'C100_4' ]
#display the coefficients for lasso model
best_coef_lasso = pd.DataFrame({'var':var_list_1, 'lasso': lasso.coef_})
best_coef_lasso
var | lasso | |
---|---|---|
0 | ADM_RATE | -407.912513 |
1 | ACTCM25_0 | 5155.643329 |
2 | ACTCM25_1 | 0.000000 |
3 | COSTT4_A | 1535.578561 |
4 | UGDS_MEN | 2010.041732 |
5 | UGDS_WHITE | -0.000000 |
6 | UGDS_BLACK | -420.981969 |
7 | UGDS_HISP | 412.787325 |
8 | UGDS_ASIAN | 4067.373168 |
9 | UGDS_AIAN | -232.121625 |
10 | INC_PCT_LO | -0.000000 |
11 | INC_PCT_M1 | 1312.622456 |
12 | INC_PCT_M2 | -388.766972 |
13 | INC_PCT_H1 | -1491.065808 |
14 | INC_PCT_H2 | 4516.354758 |
15 | C100_4 | 0.000000 |