import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the recent-graduates dataset from CSV into a pandas DataFrame
csv_path = 'recent-grads.csv'
recent_grads = pd.read_csv(csv_path)
# Show the first row of the table to review the available columns
recent_grads.iloc[0]
Rank 1 Major_code 2419 Major PETROLEUM ENGINEERING Total 2339 Men 2057 Women 282 Major_category Engineering ShareWomen 0.120564 Sample_size 36 Employed 1976 Full_time 1849 Part_time 270 Full_time_year_round 1207 Unemployed 37 Unemployment_rate 0.0183805 Median 110000 P25th 95000 P75th 125000 College_jobs 1534 Non_college_jobs 364 Low_wage_jobs 193 Name: 0, dtype: object
# Preview the first five rows of the dataset
recent_grads.head(n=5)
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.120564 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.018381 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.101852 | 7 | 640 | ... | 170 | 388 | 85 | 0.117241 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.153037 | 3 | 648 | ... | 133 | 340 | 16 | 0.024096 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.107313 | 16 | 758 | ... | 150 | 692 | 40 | 0.050125 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.341631 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.061098 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
5 rows × 21 columns
# Preview the last five rows of the dataset
recent_grads.tail(n=5)
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.637293 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.046320 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.817099 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.065112 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.799859 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.149048 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.798746 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.053621 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.877960 | 2 | 742 | ... | 237 | 410 | 87 | 0.104946 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
5 rows × 21 columns
# Summary statistics for the numeric columns, with the quartiles spelled out
recent_grads.describe(percentiles=[0.25, 0.5, 0.75])
Rank | Major_code | Total | Men | Women | ShareWomen | Sample_size | Employed | Full_time | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 173.000000 | 173.000000 | 172.000000 | 172.000000 | 172.000000 | 172.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 |
mean | 87.000000 | 3879.815029 | 39370.081395 | 16723.406977 | 22646.674419 | 0.522223 | 356.080925 | 31192.763006 | 26029.306358 | 8832.398844 | 19694.427746 | 2416.329480 | 0.068191 | 40151.445087 | 29501.445087 | 51494.219653 | 12322.635838 | 13284.497110 | 3859.017341 |
std | 50.084928 | 1687.753140 | 63483.491009 | 28122.433474 | 41057.330740 | 0.231205 | 618.361022 | 50675.002241 | 42869.655092 | 14648.179473 | 33160.941514 | 4112.803148 | 0.030331 | 11470.181802 | 9166.005235 | 14906.279740 | 21299.868863 | 23789.655363 | 6944.998579 |
min | 1.000000 | 1100.000000 | 124.000000 | 119.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 111.000000 | 0.000000 | 111.000000 | 0.000000 | 0.000000 | 22000.000000 | 18500.000000 | 22000.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 44.000000 | 2403.000000 | 4549.750000 | 2177.500000 | 1778.250000 | 0.336026 | 39.000000 | 3608.000000 | 3154.000000 | 1030.000000 | 2453.000000 | 304.000000 | 0.050306 | 33000.000000 | 24000.000000 | 42000.000000 | 1675.000000 | 1591.000000 | 340.000000 |
50% | 87.000000 | 3608.000000 | 15104.000000 | 5434.000000 | 8386.500000 | 0.534024 | 130.000000 | 11797.000000 | 10048.000000 | 3299.000000 | 7413.000000 | 893.000000 | 0.067961 | 36000.000000 | 27000.000000 | 47000.000000 | 4390.000000 | 4595.000000 | 1231.000000 |
75% | 130.000000 | 5503.000000 | 38909.750000 | 14631.000000 | 22553.750000 | 0.703299 | 338.000000 | 31433.000000 | 25147.000000 | 9948.000000 | 16891.000000 | 2393.000000 | 0.087557 | 45000.000000 | 33000.000000 | 60000.000000 | 14444.000000 | 11783.000000 | 3466.000000 |
max | 173.000000 | 6403.000000 | 393735.000000 | 173809.000000 | 307087.000000 | 0.968954 | 4212.000000 | 307933.000000 | 251540.000000 | 115172.000000 | 199897.000000 | 28169.000000 | 0.177226 | 110000.000000 | 95000.000000 | 125000.000000 | 151643.000000 | 148395.000000 | 48207.000000 |
# Number of rows before any cleaning
raw_data_count = len(recent_grads.index)
raw_data_count
173
# Discard every row that has at least one missing value, then recount
recent_grads = recent_grads.dropna(axis=0, how='any')
cleaned_data_count = len(recent_grads.index)
cleaned_data_count
172
# Scatter plot: Median against Sample_size
recent_grads.plot.scatter(x='Sample_size', y='Median')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1977b6c88>
Taking "Sample_size" as a proxy for "popularity", this plot shows that more popular majors do not make more or less money than less popular majors; instead, their medians are distributed around a mean of roughly 40,000. There are many "low popularity" majors with low medians, and there are also many "low popularity" majors with high medians.
# Scatter plot: Unemployment_rate against Sample_size
recent_grads.plot.scatter(x='Sample_size', y='Unemployment_rate')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc195678320>
# Scatter plot: Median against Full_time
recent_grads.plot.scatter(x='Full_time', y='Median')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc195687da0>
There does not appear to be a strong correlation between the number of full-time positions and the median salary.
# Scatter plot: Unemployment_rate against ShareWomen
recent_grads.plot.scatter(x='ShareWomen', y='Unemployment_rate')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc195671048>
# Scatter plot: Median against Men
recent_grads.plot.scatter(x='Men', y='Median')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1955e04e0>
# Scatter plot: Median against Women
recent_grads.plot.scatter(x='Women', y='Median')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc19553c278>
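Comparing the "Men" vs "Median" and "Women" vs "Median" plots above shows that majors with high numbers of women (and therefore assumed to be majority female) did not make more money than majors with high numbers of men (and therefore assumed to be majority male). Note that raw counts largely reflect the overall size of a major, so the "ShareWomen" vs "Median" plot below is the more direct comparison.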
# Scatter plot: Median against ShareWomen
recent_grads.plot.scatter(x='ShareWomen', y='Median')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc195517a20>
Although not requested by the instructions, the "ShareWomen" vs "Median" plot shows that there is an inverse relationship between the median salary and the proportion of women completing a major.
# Histogram of Sample_size: 50 bins over the fixed range 0-5000
sample_sizes = recent_grads['Sample_size']
sample_sizes.hist(bins=50, range=(0, 5000))
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1954b2c18>
# Histogram of Median using 25 bins
median_values = recent_grads['Median']
median_values.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1954d90f0>
The "Median" histogram above shows that the most common median salary is approximately 35k.
# Histogram of Employed using 25 bins
employed_counts = recent_grads['Employed']
employed_counts.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1952eb710>
# Histogram of Full_time using 25 bins
full_time_counts = recent_grads['Full_time']
full_time_counts.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7fc195296320>
# Two equal-width bins over the full 0-1 share range, so the boundary between
# them falls at exactly 0.5. Without an explicit range the bins span only the
# observed minimum to maximum, which puts the boundary at their midpoint
# rather than at an even split.
recent_grads['ShareWomen'].hist(bins=2, range=(0, 1))
<matplotlib.axes._subplots.AxesSubplot at 0x7fc19513cd30>
By reducing the number of bins to 2 we can clearly see that there are more majors that are predominantly female than male by a ratio of approximately 4:3.
# Histogram of Unemployment_rate using 25 bins
unemployment_rates = recent_grads['Unemployment_rate']
unemployment_rates.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1950c9780>
# Histogram of Men using 25 bins
men_counts = recent_grads['Men']
men_counts.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7fc19500c940>
# Histogram of Women using 25 bins
women_counts = recent_grads['Women']
women_counts.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7fc194f58198>
from pandas.plotting import scatter_matrix
# Scatter matrix for the Sample_size and Median columns
size_and_median = recent_grads[['Sample_size', 'Median']]
scatter_matrix(size_and_median, figsize=(10, 10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fc194e4dc50>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194e22cc0>], [<matplotlib.axes._subplots.AxesSubplot object at 0x7fc194debef0>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194d932b0>]], dtype=object)
# Scatter matrix for Sample_size, Median and Unemployment_rate
three_way_columns = ['Sample_size', 'Median', 'Unemployment_rate']
scatter_matrix(recent_grads[three_way_columns], figsize=(10, 10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fc194cb99e8>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194cb4470>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194bfee10>], [<matplotlib.axes._subplots.AxesSubplot object at 0x7fc194bbaba8>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194b85cf8>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194b45710>], [<matplotlib.axes._subplots.AxesSubplot object at 0x7fc194b14400>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194acb550>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc194a99780>]], dtype=object)
# Scatter matrix for the ShareWomen and Median columns
share_and_median = recent_grads[['ShareWomen', 'Median']]
scatter_matrix(share_and_median, figsize=(10, 10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fc1949a7a20>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc19491d160>], [<matplotlib.axes._subplots.AxesSubplot object at 0x7fc1948e4c18>, <matplotlib.axes._subplots.AxesSubplot object at 0x7fc1948a17f0>]], dtype=object)
The scatter matrix above shows that the most common median salary is 30-40k. As before, the scatter plot shows a weak correlation between increasing female participation and decreasing median salary.
# Bar plot of ShareWomen for the first ten rows
recent_grads.head(10).plot(kind='bar', x='Major', y='ShareWomen')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc19492b390>
# Bar plot of ShareWomen for the last ten rows
recent_grads.tail(10).plot(kind='bar', x='Major', y='ShareWomen')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc194adf9b0>
Comparing the two bar plots above, it is evident that the highest ranking majors by median salary are male dominated, while the lowest ranking majors by median salary are female dominated. It is also evident that the highest ranking majors are mostly from the "Engineering" category.
# Bar plot of Unemployment_rate for the first ten rows
recent_grads.head(10).plot(kind='bar', x='Major', y='Unemployment_rate')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc194708048>
# Bar plot of Unemployment_rate for the last ten rows
recent_grads.tail(10).plot(kind='bar', x='Major', y='Unemployment_rate')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc19468f4e0>
Comparing the two bar plots above, there is no clear difference in the unemployment rate between the highest and lowest ranking majors based on median salary. Combining this insight with that gained from the previous bar plot comparison suggests that employability is not a significant contributor to median salary. This insight, however, is generated without taking into account the number of graduates for each major.
# Bar plot of Total for the first ten rows
recent_grads.head(10).plot(kind='bar', x='Major', y='Total')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc194629278>
# Bar plot of Total for the last ten rows
recent_grads.tail(10).plot(kind='bar', x='Major', y='Total')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1945360f0>
With three exceptions, the top ranked majors by median salary are low attendance majors; the three exceptions are the generic engineering majors of chemical, mechanical and electrical engineering. This is similar to the lowest ranked majors by median salary, where three are "high" attendance and the others are relatively "low" attendance majors.
# Distinct major categories, in order of first appearance in the data
major_cats = pd.unique(recent_grads['Major_category'])
print(major_cats)
['Engineering' 'Business' 'Physical Sciences' 'Law & Public Policy' 'Computers & Mathematics' 'Industrial Arts & Consumer Services' 'Arts' 'Health' 'Social Science' 'Biology & Life Science' 'Education' 'Agriculture & Natural Resources' 'Humanities & Liberal Arts' 'Psychology & Social Work' 'Communications & Journalism' 'Interdisciplinary']
# Summarise the data per major category: total Men and Women, plus the
# unweighted mean (across the majors in the category) of Median,
# Unemployment_rate and ShareWomen.
summary_columns = ['Men', 'Women', 'Median', 'Unemployment_rate', 'ShareWomen']
# sort=False keeps the categories in order of first appearance in the data
# instead of sorting them alphabetically.
category_data = recent_grads.groupby('Major_category', sort=False).agg({
    'Men': 'sum',
    'Women': 'sum',
    'Median': 'mean',
    'Unemployment_rate': 'mean',
    'ShareWomen': 'mean',
})
# Fix the column order explicitly rather than relying on dict ordering.
category_data = category_data[summary_columns]
category_data.columns = ['Men', 'Women', 'Ave Median', 'Ave Unemployment', 'Ave ShareWomen']
# Drop the index name so the printed table and plot axes carry no extra
# "Major_category" label.
category_data.index.name = None
# Keep the category -> [men, women, median, unemployment, share] mapping
# available under its original name.
cat_dist = category_data.T.to_dict(orient='list')
print(category_data)
Men Women Ave Median \ Communications & Journalism 131921.0 260680.0 34500.000000 Physical Sciences 95390.0 90089.0 41890.000000 Agriculture & Natural Resources 40357.0 35263.0 35111.111111 Interdisciplinary 2817.0 9479.0 35000.000000 Biology & Life Science 184919.0 268943.0 36421.428571 Law & Public Policy 91129.0 87978.0 42200.000000 Industrial Arts & Consumer Services 103781.0 126011.0 36342.857143 Computers & Mathematics 208725.0 90283.0 42745.454545 Psychology & Social Work 98115.0 382892.0 30100.000000 Engineering 408307.0 129276.0 57382.758621 Arts 134390.0 222740.0 33062.500000 Business 667852.0 634524.0 43538.461538 Education 103526.0 455603.0 32350.000000 Humanities & Liberal Arts 272846.0 440622.0 31913.333333 Health 75517.0 387713.0 36825.000000 Social Science 256834.0 273132.0 37344.444444 Ave Unemployment Ave ShareWomen Communications & Journalism 0.075538 0.658384 Physical Sciences 0.046511 0.508683 Agriculture & Natural Resources 0.051817 0.405267 Interdisciplinary 0.070861 0.770901 Biology & Life Science 0.060918 0.587193 Law & Public Policy 0.090805 0.483649 Industrial Arts & Consumer Services 0.048071 0.349523 Computers & Mathematics 0.084256 0.311772 Psychology & Social Work 0.072065 0.794397 Engineering 0.063334 0.238889 Arts 0.090173 0.603658 Business 0.071064 0.483198 Education 0.051702 0.748507 Humanities & Liberal Arts 0.081008 0.631790 Health 0.065920 0.795152 Social Science 0.095729 0.553962
# Side-by-side bars of the Men and Women totals for each major category
category_data[['Men', 'Women']].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc19452e710>
From the above plot, there are several major categories that are female dominated, including Humanities & Liberal Arts, Psychology & Social Work, Biology & Life Science, Education, Health, Arts, and Communications & Journalism. There are only two major categories that are male dominated: Engineering, and Computers & Mathematics. Combined with the previous observations, this suggests that Engineering and Computers & Mathematics have high median values.
# Bar plot of the average Median for each major category
category_data['Ave Median'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1943830b8>
# Scatter plot: Ave Unemployment against Ave ShareWomen, one point per category
category_data.plot.scatter(x='Ave ShareWomen', y='Ave Unemployment')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc193fcddd8>
The Engineering category has the highest median salary and with the Engineering category being dominated by male students, this goes some way to explain the higher average median salary for males when compared to females.
# Box plot of the Median column
recent_grads['Median'].plot(kind='box')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc1942b15c0>
# Box plot of the Unemployment_rate column
recent_grads['Unemployment_rate'].plot(kind='box')
<matplotlib.axes._subplots.AxesSubplot at 0x7fc193e42b70>
Both the "Median" and "Unemployment rate" box plots show that the interquartile range is relatively small compared to the entire distribution. This signifies that a large number of samples are close to the mean similar to a normal probability distribution.
# Hexbin plot of Median against ShareWomen with gridsize 15
recent_grads.plot.hexbin(x='ShareWomen', y='Median', gridsize=15)
<matplotlib.axes._subplots.AxesSubplot at 0x7fc193a89ba8>
Considering there was little value in any scatter plot except the "ShareWomen" vs "Median", this plot has been reproduced in the hexbin format showing the same negative correlation.