#!/usr/bin/env python
# coding: utf-8

# # *Visualising Earnings Based On College Majors*
#
# - This data is taken from FiveThirtyEight (GitHub).
#
# - This dataset is on the job outcomes of students who graduated from college between 2010 and 2012.
#
# ### Data Dictionary
#
# > Rank - Rank by median earnings (the dataset is ordered by this column).
# > Major_code - Major code.
# > Major - Major description.
# > Major_category - Category of major.
# > Total - Total number of people with major.
# > Sample_size - Sample size (unweighted) of full-time.
# > Men - Male graduates.
# > Women - Female graduates.
# > ShareWomen - Women as share of total.
# > Employed - Number employed.
# > Median - Median salary of full-time, year-round workers.
# > Low_wage_jobs - Number in low-wage service jobs.
# > Full_time - Number employed 35 hours or more.
# > Part_time - Number employed less than 35 hours.
#
# ### Aim
# ***Visualising data using Histograms, ScatterPlots, ScatterMatrix Plots and Bar Charts, and see if useful insights can be drawn from them.***

# In[1]:

# Render matplotlib figures inline (this script is a notebook export and needs IPython).
get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import matplotlib.pyplot as plt

# Show up to 100 columns so wide frames are not truncated when displayed.
pd.options.display.max_columns = 100

recent_grads = pd.read_csv("recent-grads.csv")
print("Recent-Grads First Row\n")
recent_grads.head()

# In[2]:

recent_grads.describe()

# ### Removing Null Values

# In[3]:

# Row count before cleaning.
raw_data_count = recent_grads.shape[0]
print(raw_data_count)

# In[4]:

# Drop every row that contains at least one missing value, then count again.
recent_grads = recent_grads.dropna()
cleaned_data_count = recent_grads.shape[0]
print(cleaned_data_count)

# ***There was only one row with null values, which has been removed***

# ### Scatterplots

# In[5]:

# Sample_size vs Unemployment_rate
sample_vs_unemployment_rate = recent_grads.plot.scatter(x="Sample_size", y="Unemployment_rate")

# ShareWomen vs Unemployment_rate
ShareWomen_vs_Unemployment_rate = recent_grads.plot.scatter(x="ShareWomen", y="Unemployment_rate")

# Sample_size vs Median
sample_vs_median = recent_grads.plot.scatter(x="Sample_size", y="Median")

# Full_time vs Median
Full_time_vs_Median = recent_grads.plot.scatter(x="Full_time", y="Median")

# Men vs Median
Men_vs_median = recent_grads.plot.scatter(x="Men", y="Median")

# Women vs Median
Women_vs_median = recent_grads.plot.scatter(x="Women", y="Median")

# ***From the 2nd plot, 'ShareWomen' vs 'Unemployment_rate', we can see that there is no correlation between these two variables.
# And from all the other scatter plots as well, there is not much useful information we can gather.
# Now we will try to get some insights from Histograms***

# ### Histograms
#
# - Sample_size
# - Median
# - Employed
# - Full_time
# - ShareWomen
# - Unemployment_rate
# - Men
# - Women

# In[6]:

cols = ["Sample_size", "Median", "Employed", "Full_time", "ShareWomen", "Unemployment_rate", "Men", "Women"]

# First figure: one histogram per row for the first four columns.
fig = plt.figure(figsize=(8, 16))
for slot, column in enumerate(cols[:4], start=1):
    ax = fig.add_subplot(4, 1, slot)
    recent_grads[column].plot.hist(ax=ax)
    ax.set_title(column)

# In[7]:

# Second figure: same layout for the remaining four columns.
fig1 = plt.figure(figsize=(8, 16))
for slot, column in enumerate(cols[4:8], start=1):
    ax = fig1.add_subplot(4, 1, slot)
    recent_grads[column].plot.hist(ax=ax)
    ax.set_title(column)

# ***From the 6th histogram, we can observe that 85% of the majors have an unemployment rate less than 10%
# The rest of the histograms don't give much useful insight, let's use a Scatter Matrix Plot***

# ### Scatter Matrix Plot
# - Sample_size vs Median
# - Median vs ShareWomen vs Unemployment_rate

# In[8]:

from pandas.plotting import scatter_matrix

# Pairwise scatter plots (with per-column distributions on the diagonal) for each column group.
sample_and_median = recent_grads[["Sample_size", "Median"]]
scatter_matrix(sample_and_median, figsize=(10, 6))

median_share_unemployment = recent_grads[["Median", "ShareWomen", "Unemployment_rate"]]
scatter_matrix(median_share_unemployment, figsize=(15, 9))

plt.show()

# ***Here we can observe a negative correlation between ShareWomen and Median salary, which means fields with a higher median salary tend to have a lower share of women.
# This is possibly due to the fact that high-paying fields like engineering tend to have a lower share of women.
# Let's see if our theory can be showcased using Bar Plots.***

# ### Grouped Bar Plots
#
# - ShareWomen vs Median (Top 10 majors by median salary)
# - ShareWomen vs Median (Bottom 10 majors by median salary)

# In[25]:

import numpy as np
import matplotlib.pyplot as plt

# Preparing Dataframes for Plotting
#
# The dataset is ordered by Rank (rank by median earnings), so the first ten
# rows are the highest-paying majors and the last ten rows are the
# lowest-paying ones. The rows are NOT selected by ShareWomen, so the frames
# and chart titles are named after what they actually contain.
LABEL_WIDTH = 14  # truncate major names so the x tick labels stay readable
plot_columns = ['ShareWomen', 'Median']

highest_paid = recent_grads.head(10)
lowest_paid = recent_grads.tail(10)

# Use the (truncated) major name as the index so it becomes the bar label.
# set_index returns a new frame, so the slices of recent_grads are not mutated.
top_10_by_median = highest_paid[plot_columns].set_index(
    pd.Series([name[:LABEL_WIDTH] for name in highest_paid['Major']])
)
bottom_10_by_median = lowest_paid[plot_columns].set_index(
    pd.Series([name[:LABEL_WIDTH] for name in lowest_paid['Major']])
)

# Plotting Data
fig = plt.figure(figsize=(14, 6))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Median is drawn against the left axis, ShareWomen against a secondary right axis.
top_10_by_median.plot.bar(ax=ax1, secondary_y='ShareWomen',
                          title='Top 10 Majors by Median Salary')
bottom_10_by_median.plot.bar(ax=ax2, secondary_y='ShareWomen',
                             title='Bottom 10 Majors by Median Salary')

# Setting up axis
#
# Pin identical limits on BOTH panels so bar heights are directly comparable
# between the two charts. The original only pinned the right-hand
# (ShareWomen) axis on the first panel and relied on a `ylim=` keyword for the
# second, leaving the two panels on different scales.
for axis in (ax1, ax2):
    axis.set_ylim(0, 115000)        # left axis: Median salary
    axis.right_ax.set_ylim(0, 1.0)  # right axis: ShareWomen is a fraction in [0, 1]
    axis.set_xlabel('')

plt.show()

# ***Here we can observe that majors with a higher share of women have a lower median salary.
# Meanwhile, majors with a lower share of women have a higher median salary.***
#
# ***Both these plots support our theory that high-paying fields like engineering tend to have a lower share of women,
# which in turn leads to a negative correlation between ShareWomen and Median salary.***