The Department of Education Statistics releases a data set annually containing the percentage of bachelor's degrees granted to women from 1970 to 2012. The data set is broken up into 17 categories of degrees, with each column as a separate category.
Randal Olson, a data scientist at the University of Pennsylvania, has cleaned the data set and made it available on his personal website. You can download the data set that Randal compiled here.
Randal compiled this data set to explore the gender gap in STEM (science, technology, engineering, and mathematics) fields. This gap is often reported in the news, and not everyone agrees that there is a gap.
Our problem statement in this project is to analyze the gender gap in college degrees using effective data visualization.
First, we will visualize the data with line charts to get an overview of the gender gap.
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
# Load the data set: a 'Year' column plus one column per degree category
# holding the percentage of bachelor's degrees awarded to women.
women_degrees = pd.read_csv('data/percent-bachelors-degrees-women-usa.csv')
# Preview the first rows to confirm the columns loaded as expected.
women_degrees.head()
Year | Agriculture | Architecture | Art and Performance | Biology | Business | Communications and Journalism | Computer Science | Education | Engineering | English | Foreign Languages | Health Professions | Math and Statistics | Physical Sciences | Psychology | Public Administration | Social Sciences and History | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1970 | 4.229798 | 11.921005 | 59.7 | 29.088363 | 9.064439 | 35.3 | 13.6 | 74.535328 | 0.8 | 65.570923 | 73.8 | 77.1 | 38.0 | 13.8 | 44.4 | 68.4 | 36.8 |
1 | 1971 | 5.452797 | 12.003106 | 59.9 | 29.394403 | 9.503187 | 35.5 | 13.6 | 74.149204 | 1.0 | 64.556485 | 73.9 | 75.5 | 39.0 | 14.9 | 46.2 | 65.5 | 36.2 |
2 | 1972 | 7.420710 | 13.214594 | 60.4 | 29.810221 | 10.558962 | 36.6 | 14.9 | 73.554520 | 1.2 | 63.664263 | 74.6 | 76.9 | 40.2 | 14.8 | 47.6 | 62.6 | 36.1 |
3 | 1973 | 9.653602 | 14.791613 | 60.2 | 31.147915 | 12.804602 | 38.4 | 16.4 | 73.501814 | 1.6 | 62.941502 | 74.9 | 77.4 | 40.9 | 16.5 | 50.4 | 64.3 | 36.4 |
4 | 1974 | 14.074623 | 17.444688 | 61.9 | 32.996183 | 16.204850 | 40.5 | 18.9 | 73.336811 | 2.2 | 62.413412 | 75.3 | 77.9 | 41.8 | 18.2 | 52.6 | 66.1 | 37.3 |
# Line colours as RGB tuples scaled to the 0-1 range matplotlib expects.
cb_dark_blue = (0/255,107/255,164/255)
cb_orange = (255/255, 128/255, 14/255)
# STEM degree columns, in the left-to-right order they are drawn.
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

# In-plot annotations (x, y, text) that replace a legend; only the first
# and the last chart of the row are annotated.
series_labels = {
    0: ((2005, 87, 'Men'), (2002, 8, 'Women')),
    5: ((2005, 62, 'Men'), (2001, 35, 'Women')),
}

fig = plt.figure(figsize=(18, 3))
for position, degree in enumerate(stem_cats):
    ax = fig.add_subplot(1, 6, position + 1)
    # Women's share is read from the column; men's share is its complement.
    ax.plot(women_degrees['Year'], women_degrees[degree], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100-women_degrees[degree], c=cb_orange, label='Men', linewidth=3)
    # Hide all four spines and the tick marks to reduce chart clutter.
    for side in ("right", "left", "top", "bottom"):
        ax.spines[side].set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(degree)
    ax.tick_params(bottom=False, top=False, left=False, right=False)
    for x_pos, y_pos, text in series_labels.get(position, ()):
        ax.text(x_pos, y_pos, text)
plt.show()
# Print a summary of the frame: row count, column names, non-null counts and dtypes.
women_degrees.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 42 entries, 0 to 41 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 42 non-null int64 1 Agriculture 42 non-null float64 2 Architecture 42 non-null float64 3 Art and Performance 42 non-null float64 4 Biology 42 non-null float64 5 Business 42 non-null float64 6 Communications and Journalism 42 non-null float64 7 Computer Science 42 non-null float64 8 Education 42 non-null float64 9 Engineering 42 non-null float64 10 English 42 non-null float64 11 Foreign Languages 42 non-null float64 12 Health Professions 42 non-null float64 13 Math and Statistics 42 non-null float64 14 Physical Sciences 42 non-null float64 15 Psychology 42 non-null float64 16 Public Administration 42 non-null float64 17 Social Sciences and History 42 non-null float64 dtypes: float64(17), int64(1) memory usage: 6.0 KB
There are 17 degree categories in total; let us group them into three categories: STEM, Liberal Arts, and Other.
# Degree columns grouped into STEM, liberal arts and other.
# The plotting cells iterate this dict in insertion order, so the order of
# the keys decides which grid column a group is drawn in, and the order
# within each list decides the row of each degree.
category = {"stem_cats":['Psychology', 'Biology', 'Math and Statistics', 'Physical Sciences', 'Computer Science', 'Engineering'],
"lib_arts_cats":['Foreign Languages', 'English', 'Communications and Journalism', 'Art and Performance', 'Social Sciences and History'],
"other_cats":['Health Professions', 'Public Administration', 'Education', 'Agriculture','Business', 'Architecture']
}
# Grid of line charts: one grid column per degree group in `category`
# (in insertion order), one row per degree within the group.
groups = list(category.values())
n_rows = max(len(degrees) for degrees in groups)
n_cols = len(groups)
# With the current groups this is 6 rows x 3 columns; squeeze=False keeps
# `ax` two-dimensional even if there is only one row or one column.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 12), squeeze=False)

# In-plot annotations (x, y, text) that replace a legend. Only the first
# and the last chart of each grid column are annotated.
line_labels = {
    0: {'first': ((2005, 25, 'Men'), (2005, 87, 'Women')),
        'last': ((2005, 90, 'Men'), (2005, 27, 'Women'))},
    1: {'first': ((2002, 32, 'Men'), (2002, 75, 'Women')),
        'last': ((1970, 70, 'Men'), (1970, 40, 'Women'))},
    2: {'first': ((2002, 23, 'Men'), (2002, 88, 'Women')),
        'last': ((2005, 60, 'Men'), (2005, 22, 'Women'))},
}

for col, degrees in enumerate(groups):
    last_row = len(degrees) - 1
    for row, degree in enumerate(degrees):
        axis = ax[row, col]
        # Women's share is read from the column; men's share is its complement.
        axis.plot(women_degrees['Year'], women_degrees[degree], c=cb_dark_blue, label='Women', linewidth=3)
        axis.plot(women_degrees['Year'], 100-women_degrees[degree], c=cb_orange, label='Men', linewidth=3)
        for side in ("right", "left", "top", "bottom"):
            axis.spines[side].set_visible(False)
        axis.set_xlim(1968, 2011)
        axis.set_ylim(0, 100)
        axis.set_title(degree)
        # The first chart wins when a column holds a single degree.
        if row == 0:
            spot = 'first'
        elif row == last_row:
            spot = 'last'
        else:
            spot = None
        if spot is not None:
            for x_pos, y_pos, text in line_labels.get(col, {}).get(spot, ()):
                axis.text(x_pos, y_pos, text)
    # Remove the empty axes left under a column that has fewer degrees
    # than the tallest one.
    for row in range(len(degrees), n_rows):
        fig.delaxes(ax[row, col])

plt.tight_layout(pad=2)
plt.show()
We will remove the x-axis labels from every chart except the bottom one in each column, as these non-data elements clutter the field of view.
# Grid of line charts: one grid column per degree group in `category`
# (in insertion order), one row per degree within the group.
groups = list(category.values())
n_rows = max(len(degrees) for degrees in groups)
n_cols = len(groups)
# With the current groups this is 6 rows x 3 columns; squeeze=False keeps
# `ax` two-dimensional even if there is only one row or one column.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 12), squeeze=False)

# In-plot annotations (x, y, text) that replace a legend. Only the first
# and the last chart of each grid column are annotated.
line_labels = {
    0: {'first': ((2005, 25, 'Men'), (2005, 87, 'Women')),
        'last': ((2005, 90, 'Men'), (2005, 27, 'Women'))},
    1: {'first': ((2002, 32, 'Men'), (2002, 75, 'Women')),
        'last': ((1970, 70, 'Men'), (1970, 40, 'Women'))},
    2: {'first': ((2002, 23, 'Men'), (2002, 88, 'Women')),
        'last': ((2005, 60, 'Men'), (2005, 22, 'Women'))},
}

for col, degrees in enumerate(groups):
    last_row = len(degrees) - 1
    for row, degree in enumerate(degrees):
        axis = ax[row, col]
        # Women's share is read from the column; men's share is its complement.
        axis.plot(women_degrees['Year'], women_degrees[degree], c=cb_dark_blue, label='Women', linewidth=3)
        axis.plot(women_degrees['Year'], 100-women_degrees[degree], c=cb_orange, label='Men', linewidth=3)
        for side in ("right", "left", "top", "bottom"):
            axis.spines[side].set_visible(False)
        axis.set_xlim(1968, 2011)
        axis.set_ylim(0, 100)
        axis.set_title(degree)
        # Hide all tick marks; keep x tick labels only on the bottom chart
        # of each column.
        axis.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=(row == last_row))
        # The first chart wins when a column holds a single degree.
        if row == 0:
            spot = 'first'
        elif row == last_row:
            spot = 'last'
        else:
            spot = None
        if spot is not None:
            for x_pos, y_pos, text in line_labels.get(col, {}).get(spot, ()):
                axis.text(x_pos, y_pos, text)
    # Remove the empty axes left under a column that has fewer degrees
    # than the tallest one.
    for row in range(len(degrees), n_rows):
        fig.delaxes(ax[row, col])

plt.tight_layout(pad=2)
plt.show()
All 17 plots share the same y-axis labels; let us simplify them by showing only the 0 and 100 ticks.
# Grid of line charts: one grid column per degree group in `category`
# (in insertion order), one row per degree within the group.
groups = list(category.values())
n_rows = max(len(degrees) for degrees in groups)
n_cols = len(groups)
# With the current groups this is 6 rows x 3 columns; squeeze=False keeps
# `ax` two-dimensional even if there is only one row or one column.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 12), squeeze=False)

# In-plot annotations (x, y, text) that replace a legend. Only the first
# and the last chart of each grid column are annotated.
line_labels = {
    0: {'first': ((2005, 25, 'Men'), (2005, 87, 'Women')),
        'last': ((2005, 90, 'Men'), (2005, 27, 'Women'))},
    1: {'first': ((2002, 32, 'Men'), (2002, 75, 'Women')),
        'last': ((1970, 70, 'Men'), (1970, 40, 'Women'))},
    2: {'first': ((2002, 23, 'Men'), (2002, 88, 'Women')),
        'last': ((2005, 60, 'Men'), (2005, 22, 'Women'))},
}

for col, degrees in enumerate(groups):
    last_row = len(degrees) - 1
    for row, degree in enumerate(degrees):
        axis = ax[row, col]
        # Women's share is read from the column; men's share is its complement.
        axis.plot(women_degrees['Year'], women_degrees[degree], c=cb_dark_blue, label='Women', linewidth=3)
        axis.plot(women_degrees['Year'], 100-women_degrees[degree], c=cb_orange, label='Men', linewidth=3)
        for side in ("right", "left", "top", "bottom"):
            axis.spines[side].set_visible(False)
        axis.set_xlim(1968, 2011)
        axis.set_ylim(0, 100)
        axis.set_title(degree)
        # Only the two extremes of the percentage scale are labelled.
        axis.set_yticks([0, 100])
        # Hide all tick marks; keep x tick labels only on the bottom chart
        # of each column.
        axis.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=(row == last_row))
        # The first chart wins when a column holds a single degree.
        if row == 0:
            spot = 'first'
        elif row == last_row:
            spot = 'last'
        else:
            spot = None
        if spot is not None:
            for x_pos, y_pos, text in line_labels.get(col, {}).get(spot, ()):
                axis.text(x_pos, y_pos, text)
    # Remove the empty axes left under a column that has fewer degrees
    # than the tallest one.
    for row in range(len(degrees), n_rows):
        fig.delaxes(ax[row, col])

plt.tight_layout(pad=2)
plt.show()
Let us add a horizontal line at 50% to see which degrees are close to a 50:50 gender split.
# Grid of line charts: one grid column per degree group in `category`
# (in insertion order), one row per degree within the group.
groups = list(category.values())
n_rows = max(len(degrees) for degrees in groups)
n_cols = len(groups)
# With the current groups this is 6 rows x 3 columns; squeeze=False keeps
# `ax` two-dimensional even if there is only one row or one column.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 12), squeeze=False)

# In-plot annotations (x, y, text) that replace a legend. Only the first
# and the last chart of each grid column are annotated.
line_labels = {
    0: {'first': ((2005, 25, 'Men'), (2005, 87, 'Women')),
        'last': ((2005, 90, 'Men'), (2005, 27, 'Women'))},
    1: {'first': ((2002, 32, 'Men'), (2002, 75, 'Women')),
        'last': ((1970, 70, 'Men'), (1970, 40, 'Women'))},
    2: {'first': ((2002, 23, 'Men'), (2002, 88, 'Women')),
        'last': ((2005, 60, 'Men'), (2005, 22, 'Women'))},
}

for col, degrees in enumerate(groups):
    last_row = len(degrees) - 1
    for row, degree in enumerate(degrees):
        axis = ax[row, col]
        # Women's share is read from the column; men's share is its complement.
        axis.plot(women_degrees['Year'], women_degrees[degree], c=cb_dark_blue, label='Women', linewidth=3)
        axis.plot(women_degrees['Year'], 100-women_degrees[degree], c=cb_orange, label='Men', linewidth=3)
        for side in ("right", "left", "top", "bottom"):
            axis.spines[side].set_visible(False)
        axis.set_xlim(1968, 2011)
        axis.set_ylim(0, 100)
        axis.set_title(degree)
        # Only the two extremes of the percentage scale are labelled.
        axis.set_yticks([0, 100])
        # Faint grey reference line at 50%, where both shares are equal.
        axis.axhline(50, c=(171/255, 171/255, 171/255), alpha=0.3)
        # Hide all tick marks; keep x tick labels only on the bottom chart
        # of each column.
        axis.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=(row == last_row))
        # The first chart wins when a column holds a single degree.
        if row == 0:
            spot = 'first'
        elif row == last_row:
            spot = 'last'
        else:
            spot = None
        if spot is not None:
            for x_pos, y_pos, text in line_labels.get(col, {}).get(spot, ()):
                axis.text(x_pos, y_pos, text)
    # Remove the empty axes left under a column that has fewer degrees
    # than the tallest one.
    for row in range(len(degrees), n_rows):
        fig.delaxes(ax[row, col])

plt.tight_layout(pad=2)
plt.show()
We will save the figure containing all charts to `images/gender_degrees.png`.
import os

# Grid of line charts: one grid column per degree group in `category`
# (in insertion order), one row per degree within the group.
groups = list(category.values())
n_rows = max(len(degrees) for degrees in groups)
n_cols = len(groups)
# With the current groups this is 6 rows x 3 columns; squeeze=False keeps
# `ax` two-dimensional even if there is only one row or one column.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 12), squeeze=False)

# In-plot annotations (x, y, text) that replace a legend. Only the first
# and the last chart of each grid column are annotated.
line_labels = {
    0: {'first': ((2005, 25, 'Men'), (2005, 87, 'Women')),
        'last': ((2005, 90, 'Men'), (2005, 27, 'Women'))},
    1: {'first': ((2002, 32, 'Men'), (2002, 75, 'Women')),
        'last': ((1970, 70, 'Men'), (1970, 40, 'Women'))},
    2: {'first': ((2002, 23, 'Men'), (2002, 88, 'Women')),
        'last': ((2005, 60, 'Men'), (2005, 22, 'Women'))},
}

for col, degrees in enumerate(groups):
    last_row = len(degrees) - 1
    for row, degree in enumerate(degrees):
        axis = ax[row, col]
        # Women's share is read from the column; men's share is its complement.
        axis.plot(women_degrees['Year'], women_degrees[degree], c=cb_dark_blue, label='Women', linewidth=3)
        axis.plot(women_degrees['Year'], 100-women_degrees[degree], c=cb_orange, label='Men', linewidth=3)
        for side in ("right", "left", "top", "bottom"):
            axis.spines[side].set_visible(False)
        axis.set_xlim(1968, 2011)
        axis.set_ylim(0, 100)
        axis.set_title(degree)
        # Only the two extremes of the percentage scale are labelled.
        axis.set_yticks([0, 100])
        # Faint grey reference line at 50%, where both shares are equal.
        axis.axhline(50, c=(171/255, 171/255, 171/255), alpha=0.3)
        # Hide all tick marks; keep x tick labels only on the bottom chart
        # of each column.
        axis.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=(row == last_row))
        # The first chart wins when a column holds a single degree.
        if row == 0:
            spot = 'first'
        elif row == last_row:
            spot = 'last'
        else:
            spot = None
        if spot is not None:
            for x_pos, y_pos, text in line_labels.get(col, {}).get(spot, ()):
                axis.text(x_pos, y_pos, text)
    # Remove the empty axes left under a column that has fewer degrees
    # than the tallest one.
    for row in range(len(degrees), n_rows):
        fig.delaxes(ax[row, col])

plt.tight_layout(pad=2)
# savefig does not create missing directories, so make sure the target
# folder exists before writing. Save before show() so the figure is still
# the current one when it is written out.
os.makedirs("images", exist_ok=True)
plt.savefig("images/gender_degrees.png")
plt.show()
The following are the observations from the data visualization as of the year 2010: