# Importing modules to be used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import arange
%matplotlib inline
# The %matplotlib inline magic makes plots render inside the notebook, directly below the cell that creates them
# Load the recent-grads dataset (latin-1 encoded CSV) into a dataframe
csv_path = 'recent-grads.csv'
recent_grads = pd.read_csv(csv_path, encoding='latin-1')
# Show the first row to see the column names together with sample values
recent_grads.iloc[0]
Rank 1 Major_code 2419 Major PETROLEUM ENGINEERING Total 2339 Men 2057 Women 282 Major_category Engineering ShareWomen 0.120564 Sample_size 36 Employed 1976 Full_time 1849 Part_time 270 Full_time_year_round 1207 Unemployed 37 Unemployment_rate 0.0183805 Median 110000 P25th 95000 P75th 125000 College_jobs 1534 Non_college_jobs 364 Low_wage_jobs 193 Name: 0, dtype: object
# Print the first five rows, then the last five rows, of the dataframe
for preview in (recent_grads.head(), recent_grads.tail()):
    print(preview)
Rank Major_code Major Total \ 0 1 2419 PETROLEUM ENGINEERING 2339.0 1 2 2416 MINING AND MINERAL ENGINEERING 756.0 2 3 2415 METALLURGICAL ENGINEERING 856.0 3 4 2417 NAVAL ARCHITECTURE AND MARINE ENGINEERING 1258.0 4 5 2405 CHEMICAL ENGINEERING 32260.0 Men Women Major_category ShareWomen Sample_size Employed ... \ 0 2057.0 282.0 Engineering 0.120564 36 1976 ... 1 679.0 77.0 Engineering 0.101852 7 640 ... 2 725.0 131.0 Engineering 0.153037 3 648 ... 3 1123.0 135.0 Engineering 0.107313 16 758 ... 4 21239.0 11021.0 Engineering 0.341631 289 25694 ... Part_time Full_time_year_round Unemployed Unemployment_rate Median \ 0 270 1207 37 0.018381 110000 1 170 388 85 0.117241 75000 2 133 340 16 0.024096 73000 3 150 692 40 0.050125 70000 4 5180 16697 1672 0.061098 65000 P25th P75th College_jobs Non_college_jobs Low_wage_jobs 0 95000 125000 1534 364 193 1 55000 90000 350 257 50 2 50000 105000 456 176 0 3 43000 80000 529 102 0 4 50000 75000 18314 4440 972 [5 rows x 21 columns] Rank Major_code Major Total Men Women \ 168 169 3609 ZOOLOGY 8409.0 3050.0 5359.0 169 170 5201 EDUCATIONAL PSYCHOLOGY 2854.0 522.0 2332.0 170 171 5202 CLINICAL PSYCHOLOGY 2838.0 568.0 2270.0 171 172 5203 COUNSELING PSYCHOLOGY 4626.0 931.0 3695.0 172 173 3501 LIBRARY SCIENCE 1098.0 134.0 964.0 Major_category ShareWomen Sample_size Employed ... \ 168 Biology & Life Science 0.637293 47 6259 ... 169 Psychology & Social Work 0.817099 7 2125 ... 170 Psychology & Social Work 0.799859 13 2101 ... 171 Psychology & Social Work 0.798746 21 3777 ... 172 Education 0.877960 2 742 ... 
Part_time Full_time_year_round Unemployed Unemployment_rate Median \ 168 2190 3602 304 0.046320 26000 169 572 1211 148 0.065112 25000 170 648 1293 368 0.149048 25000 171 965 2738 214 0.053621 23400 172 237 410 87 0.104946 22000 P25th P75th College_jobs Non_college_jobs Low_wage_jobs 168 20000 39000 2771 2947 743 169 24000 34000 1488 615 82 170 25000 40000 986 870 622 171 19200 26000 2403 1245 308 172 20000 22000 288 338 192 [5 rows x 21 columns]
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = recent_grads.describe()
summary_stats
Rank | Major_code | Total | Men | Women | ShareWomen | Sample_size | Employed | Full_time | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 173.000000 | 173.000000 | 172.000000 | 172.000000 | 172.000000 | 172.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 |
mean | 87.000000 | 3879.815029 | 39370.081395 | 16723.406977 | 22646.674419 | 0.522223 | 356.080925 | 31192.763006 | 26029.306358 | 8832.398844 | 19694.427746 | 2416.329480 | 0.068191 | 40151.445087 | 29501.445087 | 51494.219653 | 12322.635838 | 13284.497110 | 3859.017341 |
std | 50.084928 | 1687.753140 | 63483.491009 | 28122.433474 | 41057.330740 | 0.231205 | 618.361022 | 50675.002241 | 42869.655092 | 14648.179473 | 33160.941514 | 4112.803148 | 0.030331 | 11470.181802 | 9166.005235 | 14906.279740 | 21299.868863 | 23789.655363 | 6944.998579 |
min | 1.000000 | 1100.000000 | 124.000000 | 119.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 111.000000 | 0.000000 | 111.000000 | 0.000000 | 0.000000 | 22000.000000 | 18500.000000 | 22000.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 44.000000 | 2403.000000 | 4549.750000 | 2177.500000 | 1778.250000 | 0.336026 | 39.000000 | 3608.000000 | 3154.000000 | 1030.000000 | 2453.000000 | 304.000000 | 0.050306 | 33000.000000 | 24000.000000 | 42000.000000 | 1675.000000 | 1591.000000 | 340.000000 |
50% | 87.000000 | 3608.000000 | 15104.000000 | 5434.000000 | 8386.500000 | 0.534024 | 130.000000 | 11797.000000 | 10048.000000 | 3299.000000 | 7413.000000 | 893.000000 | 0.067961 | 36000.000000 | 27000.000000 | 47000.000000 | 4390.000000 | 4595.000000 | 1231.000000 |
75% | 130.000000 | 5503.000000 | 38909.750000 | 14631.000000 | 22553.750000 | 0.703299 | 338.000000 | 31433.000000 | 25147.000000 | 9948.000000 | 16891.000000 | 2393.000000 | 0.087557 | 45000.000000 | 33000.000000 | 60000.000000 | 14444.000000 | 11783.000000 | 3466.000000 |
max | 173.000000 | 6403.000000 | 393735.000000 | 173809.000000 | 307087.000000 | 0.968954 | 4212.000000 | 307933.000000 | 251540.000000 | 115172.000000 | 199897.000000 | 28169.000000 | 0.177226 | 110000.000000 | 95000.000000 | 125000.000000 | 151643.000000 | 148395.000000 | 48207.000000 |
From the description of the dataframe, there are 173 rows of data, although the Total, Men, Women and ShareWomen columns have only 172 non-null values.
# Row count before cleaning; len() of a dataframe is its number of rows
raw_data_count = len(recent_grads)
# Remove every row that has at least one missing value
recent_grads = recent_grads.dropna(axis=0, how='any')
# Row count after cleaning
cleaned_data_count = len(recent_grads)
# Report both counts side by side to show how many rows were removed
print(
    'Raw data count: ', raw_data_count,
    '----- Cleaned data count: ', cleaned_data_count,
)
Raw data count: 173 ----- Cleaned data count: 172
The raw data originally had 173 rows, but only 172 rows remain after cleaning, which means that only one row contained null values.
# Scatter plot of Employed (y) against Sample_size (x)
recent_grads.plot.scatter(
    x='Sample_size',
    y='Employed',
    title='Employed vs. Sample_size',
    figsize=(5, 6),
)
<matplotlib.axes._subplots.AxesSubplot at 0x24dbe079f88>
# Scatter plot of Median (y) against Sample_size (x)
recent_grads.plot.scatter(
    x='Sample_size',
    y='Median',
    title='Median vs. Sample_size',
    figsize=(5, 10),
)
<matplotlib.axes._subplots.AxesSubplot at 0x24dbe800308>
# Scatter plot of Unemployment_rate (y) against Sample_size (x)
recent_grads.plot.scatter(
    x='Sample_size',
    y='Unemployment_rate',
    title='Unemployment_rate vs. Sample_size',
    figsize=(5, 10),
)
<matplotlib.axes._subplots.AxesSubplot at 0x24dbe873248>
# Scatter plot of Median (y) against Full_time (x)
recent_grads.plot.scatter(
    x='Full_time',
    y='Median',
    title='Median vs. Full_time',
    figsize=(5, 10),
)
<matplotlib.axes._subplots.AxesSubplot at 0x24dbe8bd8c8>
The scatter plot shows no clear relationship between the number of people working full time and the median salary.
# Scatter plot of Unemployment_rate (y) against ShareWomen (x)
recent_grads.plot.scatter(
    x='ShareWomen',
    y='Unemployment_rate',
    title='Unemployment_rate vs. ShareWomen',
    figsize=(5, 10),
)
<matplotlib.axes._subplots.AxesSubplot at 0x24dbe927648>
# Scatter plot of Median (y) against Men (x)
recent_grads.plot.scatter(
    x='Men',
    y='Median',
    title='Median vs. Men',
    figsize=(5, 10),
)
<matplotlib.axes._subplots.AxesSubplot at 0x24dbf96a648>
# Scatter plot of Median (y) against Women (x)
recent_grads.plot.scatter(
    x='Women',
    y='Median',
    title='Median vs. Women',
    figsize=(7, 10),
)
<matplotlib.axes._subplots.AxesSubplot at 0x24dbf9db688>
# Histogram of the Sample_size column: 15 bins spanning 0 to 5000
sample_size = recent_grads['Sample_size']
sample_size.hist(bins=15, range=(0, 5000))
<matplotlib.axes._subplots.AxesSubplot at 0x24dbd9c4a48>
# Histogram of the Median column: 20 bins spanning 22000 to 115000
median_salary = recent_grads['Median']
median_salary.hist(bins=20, range=(22000, 115000))
<matplotlib.axes._subplots.AxesSubplot at 0x24dbda66f08>
# Histogram of the Full_time column in 25 bins.
# The upper bound is taken from the data: the previous fixed limit of 250000
# was below the column maximum (251540 in the describe() output), so the
# largest value fell outside the range and was silently left out of the plot.
full_time = recent_grads['Full_time']
full_time.hist(bins=25, range=(100, full_time.max()))
<matplotlib.axes._subplots.AxesSubplot at 0x24dbfb56248>
# Histogram of the ShareWomen column in 25 bins.
# ShareWomen is a proportion (its maximum in the describe() output is below 1),
# so the range is 0 to 1; a 0 to 2 range leaves half of the bins empty and
# halves the resolution of the populated part.
recent_grads['ShareWomen'].hist(bins=25, range=(0, 1))
<matplotlib.axes._subplots.AxesSubplot at 0x24dbfd70c88>
# Histogram of the Unemployment_rate column: 25 bins spanning 0 to 0.3
unemployment_rate = recent_grads['Unemployment_rate']
unemployment_rate.hist(bins=25, range=(0, 0.3))
<matplotlib.axes._subplots.AxesSubplot at 0x24dbfd9ec88>
# Histogram of the Men column: 25 bins spanning 100 to 200000
men_counts = recent_grads['Men']
men_counts.hist(bins=25, range=(100, 200000))
<matplotlib.axes._subplots.AxesSubplot at 0x24dbfeb81c8>
# Histogram of the Women column: 25 bins spanning 0 to 310000
women_counts = recent_grads['Women']
women_counts.hist(bins=25, range=(0, 310000))
<matplotlib.axes._subplots.AxesSubplot at 0x24dbff8e908>
from pandas.plotting import scatter_matrix

# Scatter matrix of Sample_size and Median on a 10x10 inch figure
size_and_median = recent_grads[['Sample_size', 'Median']]
scatter_matrix(size_and_median, figsize=(10, 10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000024DBFEC3D88>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC004F408>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC0084B88>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC00BF588>]], dtype=object)
# Scatter matrix of Sample_size, Median and Unemployment_rate on a 20x20 inch figure
matrix_columns = ['Sample_size', 'Median', 'Unemployment_rate']
pd.plotting.scatter_matrix(recent_grads[matrix_columns], figsize=(20, 20))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC03F5A88>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC041AA08>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC02061C8>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC023B948>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC0276208>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC02A9F88>], [<matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC02E3E88>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC031BBC8>, <matplotlib.axes._subplots.AxesSubplot object at 0x0000024DC031BDC8>]], dtype=object)
# Bar charts of ShareWomen per Major for the first ten and the last ten rows
recent_grads.head(10).plot(kind='bar', x='Major', y='ShareWomen')
recent_grads.tail(10).plot(kind='bar', x='Major', y='ShareWomen')
<matplotlib.axes._subplots.AxesSubplot at 0x24dc0757e48>
# Bar charts of Unemployment_rate per Major for the first ten and the last ten rows
recent_grads.head(10).plot(kind='bar', x='Major', y='Unemployment_rate')
recent_grads.tail(10).plot(kind='bar', x='Major', y='Unemployment_rate')
<matplotlib.axes._subplots.AxesSubplot at 0x24dc086f4c8>
# Total Men and Women per Major_category, drawn as a grouped bar chart
gender_totals = recent_grads.groupby('Major_category')[['Men', 'Women']].sum()
gender_totals.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x24dc092a3c8>
# Horizontal box plot of the Unemployment_rate column
recent_grads.boxplot(column='Unemployment_rate', vert=False)
<matplotlib.axes._subplots.AxesSubplot at 0x24dc08ea448>
# Horizontal box plot of the Median column
recent_grads.boxplot(column='Median', vert=False)
<matplotlib.axes._subplots.AxesSubplot at 0x24dc07284c8>
There are some outliers in both the Unemployment_rate and Median box plots.
# Side-by-side hexbin (left) and scatter (right) views of Median against Sample_size
fig, (hex_ax, scatter_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

recent_grads.plot(kind='hexbin', x='Sample_size', y='Median', gridsize=10, ax=hex_ax)
recent_grads.plot(kind='scatter', x='Sample_size', y='Median', ax=scatter_ax)

# Start both axes at zero on each panel
for panel in (hex_ax, scatter_ax):
    panel.set_xlim(left=0)
    panel.set_ylim(bottom=0)

hex_ax.set_title('Sample_size vs. Median hexbin')
scatter_ax.set_title('Sample_size vs. Median scatter plot')
Text(0.5, 1.0, 'Sample_size vs. Median scatter plot')
# Side-by-side hexbin (left) and scatter (right) views of Unemployment_rate against Sample_size
fig, (hex_ax, scatter_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

recent_grads.plot(kind='hexbin', x='Sample_size', y='Unemployment_rate', gridsize=10, ax=hex_ax)
recent_grads.plot(kind='scatter', x='Sample_size', y='Unemployment_rate', ax=scatter_ax)

# Start both axes at zero on each panel
for panel in (hex_ax, scatter_ax):
    panel.set_xlim(left=0)
    panel.set_ylim(bottom=0)

hex_ax.set_title('Sample_size vs. Unemployment_rate')
scatter_ax.set_title('Sample_size vs. Unemployment_rate')
Text(0.5, 1.0, 'Sample_size vs. Unemployment_rate')
# Side-by-side hexbin (left) and scatter (right) views of Median against Full_time
fig, (hex_ax, scatter_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

recent_grads.plot(kind='hexbin', x='Full_time', y='Median', gridsize=10, ax=hex_ax)
recent_grads.plot(kind='scatter', x='Full_time', y='Median', ax=scatter_ax)

# Start both axes at zero on each panel
for panel in (hex_ax, scatter_ax):
    panel.set_xlim(left=0)
    panel.set_ylim(bottom=0)

hex_ax.set_title('Full_time vs. Median')
scatter_ax.set_title('Full_time vs. Median')
Text(0.5, 1.0, 'Full_time vs. Median')
# Side-by-side hexbin (left) and scatter (right) views of Median against Men
fig, (hex_ax, scatter_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))

recent_grads.plot(kind='hexbin', x='Men', y='Median', gridsize=10, ax=hex_ax)
recent_grads.plot(kind='scatter', x='Men', y='Median', ax=scatter_ax)

# Start both axes at zero on each panel
for panel in (hex_ax, scatter_ax):
    panel.set_xlim(left=0)
    panel.set_ylim(bottom=0)

hex_ax.set_title('Median vs. Men')
scatter_ax.set_title('Median vs. Men')
Text(0.5, 1.0, 'Median vs. Men')
# Side-by-side hexbin (left) and scatter (right) views of Median against Women
fig, (hex_ax, scatter_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

recent_grads.plot(kind='hexbin', x='Women', y='Median', gridsize=10, ax=hex_ax)
recent_grads.plot(kind='scatter', x='Women', y='Median', ax=scatter_ax)

# Start both axes at zero on each panel
for panel in (hex_ax, scatter_ax):
    panel.set_xlim(left=0)
    panel.set_ylim(bottom=0)

hex_ax.set_title('Median vs. Women')
scatter_ax.set_title('Median vs. Women')
Text(0.5, 1.0, 'Median vs. Women')
# Side-by-side hexbin (left) and scatter (right) views of Employed against Sample_size
fig, (hex_ax, scatter_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

recent_grads.plot(kind='hexbin', x='Sample_size', y='Employed', gridsize=10, ax=hex_ax)
recent_grads.plot(kind='scatter', x='Sample_size', y='Employed', ax=scatter_ax)

# Start both axes at zero on each panel
for panel in (hex_ax, scatter_ax):
    panel.set_xlim(left=0)
    panel.set_ylim(bottom=0)

hex_ax.set_title('Sample_size vs. Employed')
scatter_ax.set_title('Sample_size vs. Employed')
Text(0.5, 1.0, 'Sample_size vs. Employed')
Any feedback on this project will be very helpful.