#!/usr/bin/env python
# coding: utf-8

# # Exploration of Graduates Earnings Based on College Majors Using
# # Data-Visualisation Techniques
#
# Notebook export: each "# In[n]:" marker below is one notebook cell. A bare
# expression at the end of a cell is displayed by Jupyter and is a no-op when
# the file is run as a plain Python script.

# In[1]:


import pandas as pd
import matplotlib.pyplot as plt

# The inline-plotting magic only exists inside an IPython/Jupyter session.
# Guard it so the exported script also runs under a plain interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[2]:


# Read in the "recent-grads.csv" as a pandas DataFrame
recent_grads = pd.read_csv('recent-grads.csv')
# Display the top rank for review of columns
recent_grads.iloc[0]


# In[3]:


recent_grads.head()


# In[4]:


recent_grads.tail()


# In[5]:


recent_grads.describe()


# In[6]:


# Row count before cleaning, kept for comparison with the cleaned count.
raw_data_count = recent_grads.shape[0]
raw_data_count


# In[7]:


# Drop every row that contains a missing value so that all plots below are
# drawn from the same set of majors.
recent_grads = recent_grads.dropna()
cleaned_data_count = recent_grads.shape[0]
cleaned_data_count


# In[8]:


recent_grads.plot(x='Sample_size', y='Median', kind='scatter')


# Taking "Sample Size" as a proxy for "popularity", this plot shows that more
# popular majors do not make more or less money than less popular majors, but
# rather are distributed around a mean of 40,000. There are many "low
# popularity" majors with low medians, and there are also many "low
# popularity" majors with high medians.

# In[9]:


recent_grads.plot(x='Sample_size', y='Unemployment_rate', kind='scatter')


# In[10]:


recent_grads.plot(x='Full_time', y='Median', kind='scatter')


# There does not appear to be a strong correlation between the number of
# full-time positions and the median salary.

# In[11]:


recent_grads.plot(x='ShareWomen', y='Unemployment_rate', kind='scatter')


# In[12]:


recent_grads.plot(x='Men', y='Median', kind='scatter')


# In[13]:


recent_grads.plot(x='Women', y='Median', kind='scatter')


# Comparing the "Men" vs "Median" and "Women" vs "Median" plots above shows
# that majors with high numbers of females (and therefore assumed to be
# majority female) did not make more money than majors with high numbers of
# males (and therefore assumed to be majority male).

# In[14]:


recent_grads.plot(x='ShareWomen', y='Median', kind='scatter')


# Although not requested by the instructions, the "ShareWomen" vs "Median"
# plot shows that there is an inverse relationship between the median salary
# and the proportion of women completing a major.

# In[15]:


recent_grads['Sample_size'].hist(bins=50, range=(0, 5000))


# In[16]:


recent_grads['Median'].hist(bins=25)


# The "median" histogram above shows that the most common median salary is
# approximately 35k.

# In[17]:


recent_grads['Employed'].hist(bins=25)


# In[18]:


recent_grads['Full_time'].hist(bins=25)


# In[19]:


# Fix the range to (0, 1) so that the single interior bin edge sits at exactly
# 0.5. Without an explicit range the edge is placed halfway between the
# smallest and largest observed share, which is not the male/female majority
# boundary the interpretation below relies on.
recent_grads['ShareWomen'].hist(bins=2, range=(0, 1))


# By reducing the number of bins to 2 we can clearly see that there are more
# majors that are predominantly female than male, by a ratio of approximately
# 4:3.
# NOTE(review): the 4:3 figure was read off the original plot; re-confirm it
# after re-running this cell.

# In[20]:


recent_grads['Unemployment_rate'].hist(bins=25)


# In[21]:


recent_grads['Men'].hist(bins=25)


# In[22]:


recent_grads['Women'].hist(bins=25)


# In[23]:


from pandas.plotting import scatter_matrix


# In[24]:


scatter_matrix(recent_grads[['Sample_size', 'Median']], figsize=(10, 10))


# In[25]:


scatter_matrix(
    recent_grads[['Sample_size', 'Median', 'Unemployment_rate']],
    figsize=(10, 10),
)


# In[26]:


scatter_matrix(recent_grads[['ShareWomen', 'Median']], figsize=(10, 10))


# The scatter matrix above shows that the most common median salary is
# 30-40k. The scatter plot, as previously, shows that there is a weak
# correlation between increasing female participation and decreasing median
# salary.

# In[27]:


# First ten rows of the table (the highest ranking majors, per the notes
# below).
recent_grads[:10].plot.bar(x='Major', y='ShareWomen')


# In[28]:


# Last ten rows of the table (the lowest ranking majors, per the notes below).
recent_grads[-10:].plot.bar(x='Major', y='ShareWomen')


# Comparing the two bar plots above, it is evident that the highest ranking
# majors by median salary are male dominated while the lowest ranking majors
# by median salary are female dominated. It is also evident that the highest
# ranking majors are mostly from the "Engineering" category.

# In[29]:


recent_grads[:10].plot.bar(x='Major', y='Unemployment_rate')


# In[30]:


recent_grads[-10:].plot.bar(x='Major', y='Unemployment_rate')


# Comparing the two bar plots above, there is no clear difference in the
# employment rate for the highest and lowest ranking majors based on median
# salary. Combining this insight with that gained from the previous bar plot
# comparison suggests that employability is not a significant contributor to
# median salary. This insight, however, is generated without taking into
# account the number of graduates for each major.

# In[31]:


recent_grads[:10].plot.bar(x='Major', y='Total')


# In[32]:


recent_grads[-10:].plot.bar(x='Major', y='Total')


# With three exceptions, the top ranked majors by median salary are low
# attendance majors, the three exceptions being the generic engineering majors
# of chemical, mechanical and electrical engineering. This is similar to the
# lowest ranked majors by median salary, with three being "high" attendance
# and the others being relatively "low" attendance majors.

# In[33]:


major_cats = recent_grads['Major_category'].unique()
print(major_cats)


# In[45]:


# One row per major category: total men and women, plus the unweighted mean
# across that category's majors of the median salary, unemployment rate and
# share of women.
# sort=False keeps the categories in order of first appearance (the same order
# as major_cats); rename_axis(None) leaves the index unnamed so the printed
# table and the plot axes carry no extra "Major_category" label.
category_data = (
    recent_grads
    .groupby('Major_category', sort=False)
    .agg({
        'Men': 'sum',
        'Women': 'sum',
        'Median': 'mean',
        'Unemployment_rate': 'mean',
        'ShareWomen': 'mean',
    })
    .rename_axis(None)
)
category_data.columns = [
    'Men', 'Women', 'Ave Median', 'Ave Unemployment', 'Ave ShareWomen',
]
print(category_data)


# In[35]:


# Select the two columns with a list; a tuple can be mistaken for a single
# multi-level key.
category_data[['Men', 'Women']].plot.bar()


# From the above plot, there are several major categories that are female
# dominant, including Humanities & Liberal Arts, Psychology and Social Work,
# Biology and Life Science, Education, Health, Arts, and Communication and
# Journalism. There are only two major categories that are male dominated,
# these being Engineering, and Computers and Mathematics. Combining this
# observation with the previous observations suggests that Engineering and
# Computers & Mathematics have high median values.

# In[36]:


category_data['Ave Median'].plot.bar()


# In[46]:


category_data.plot(x='Ave ShareWomen', y='Ave Unemployment', kind='scatter')


# The Engineering category has the highest median salary and, with the
# Engineering category being dominated by male students, this goes some way to
# explain the higher average median salary for males when compared to females.

# In[49]:


recent_grads['Median'].plot.box()


# In[50]:


recent_grads['Unemployment_rate'].plot.box()


# Both the "Median" and "Unemployment rate" box plots show that the
# interquartile range is relatively small compared to the entire distribution.
# This signifies that the middle half of the majors are tightly clustered
# around the median (the centre line of the box), with comparatively few
# majors spread over the rest of the range. A bell-shaped distribution such as
# the normal distribution also looks like this, although a box plot alone
# cannot confirm normality.

# In[56]:


recent_grads.plot(x='ShareWomen', y='Median', kind='hexbin', gridsize=15)


# Considering there was little value in any scatter plot except the
# "ShareWomen" vs "Median", this plot has been reproduced in the hexbin format,
# showing the same negative correlation.

# # Conclusions
#
# If you want a high graduate salary:
# * Be an engineer