#!/usr/bin/env python
# coding: utf-8

# This notebook was created as a guided project in the "Exploratory Data Visualization" course on DataQuest.io to visualize earnings based on college majors.
#
# Data set can be downloaded from [here](https://github.com/fivethirtyeight/data/tree/master/college-majors)

# In[1]:

import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix

# The inline backend only exists under IPython/Jupyter. When this export is run
# as a plain script `get_ipython` is undefined, so the magic is skipped there.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[2]:

recent_grads = pd.read_csv('recent-grads.csv')
recent_grads.head()


# In[3]:

recent_grads.describe()


# In[4]:

recent_grads.shape


# In[5]:

recent_grads.info()


# We see that some of the columns have 172 values, so we drop rows with missing
# values so that all columns have an equal number of values.

# ### Dropping missing values

# In[6]:

recent_grads = recent_grads.dropna()
recent_grads.shape


# In[7]:

recent_grads['Major_category'].nunique()


# #### There are 16 unique major categories in data set. For each category, we will compare:
# - the number of male and female
# - the number of employed and unemployed
# - the number of college jobs, non college jobs and low wage jobs
# - the median earning
#

# In[8]:

# Every per-category chart below aggregates over the same grouping.
by_category = recent_grads.groupby('Major_category')

# Several columns must be selected with a list: indexing a GroupBy with a bare
# tuple of names (['Men', 'Women']) is rejected by current pandas releases.
by_category[['Men', 'Women']].sum().plot(kind='bar')


# We see that there is a significant gender gap in Education, Engineering, Health and Psychology & Social Work.

# In[9]:

by_category[['Employed', 'Unemployed']].sum().plot(kind='bar')


# 0 unemployment in Agriculture & Natural Resources.

# In[10]:

by_category[['College_jobs', 'Non_college_jobs', 'Low_wage_jobs']].sum().plot(kind='bar')


# In[12]:

# Average the per-major medians within each category. Summing them (as this
# cell originally did) mostly reflects how many majors a category contains
# rather than how well it pays.
# A Series plot draws on the current axes when one exists, so open a fresh
# figure to avoid painting over the previous chart outside a notebook.
plt.figure()
by_category['Median'].mean().plot(kind='bar')


# Engineering had the highest value in the original (summed) chart, with
# Business second.
# NOTE(review): re-check this ranking against the averaged chart above.

# In[17]:

recent_grads.loc[recent_grads['Major_category'] == 'Engineering', 'Median']


# In[11]:

# Histogram of the median salary of every major in the data set, with the bins
# spanning the observed minimum and maximum.
median_salary = recent_grads['Median']
plt.figure()
median_salary.hist(bins=20, range=(median_salary.min(), median_salary.max()))


# 30k - 35k is the most common median salary figure. This histogram covers all
# majors, not only Engineering as the original note stated.
# NOTE(review): confirm whether an Engineering-only histogram was intended.

# In[29]:

scatter_matrix(recent_grads[['Sample_size', 'Median']], figsize=(10, 10))


# In[ ]:

# Outside a notebook nothing is rendered until the figures are shown explicitly.
plt.show()