Visualizing The Gender Gap In College Degrees

Here we have a dataset compiled by Randal Olson, a data scientist at the University of Pennsylvania. Through this dataset we will explore the gender gap in STEM fields (Science, Technology, Engineering, and Mathematics). This gap is reported on often in the news, and not everyone agrees that there is a gap.

Here, we'll explore how we can communicate the nuanced narrative of the gender gap using effective data visualization.

Introduction to Data

In [16]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset: one row per year, one column per degree category.
women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')
# Preview the first rows to check the columns and the starting year.
women_degrees.head()
Out[16]:
Year Agriculture Architecture Art and Performance Biology Business Communications and Journalism Computer Science Education Engineering English Foreign Languages Health Professions Math and Statistics Physical Sciences Psychology Public Administration Social Sciences and History
0 1970 4.229798 11.921005 59.7 29.088363 9.064439 35.3 13.6 74.535328 0.8 65.570923 73.8 77.1 38.0 13.8 44.4 68.4 36.8
1 1971 5.452797 12.003106 59.9 29.394403 9.503187 35.5 13.6 74.149204 1.0 64.556485 73.9 75.5 39.0 14.9 46.2 65.5 36.2
2 1972 7.420710 13.214594 60.4 29.810221 10.558962 36.6 14.9 73.554520 1.2 63.664263 74.6 76.9 40.2 14.8 47.6 62.6 36.1
3 1973 9.653602 14.791613 60.2 31.147915 12.804602 38.4 16.4 73.501814 1.6 62.941502 74.9 77.4 40.9 16.5 50.4 64.3 36.4
4 1974 14.074623 17.444688 61.9 32.996183 16.204850 40.5 18.9 73.336811 2.2 62.413412 75.3 77.9 41.8 18.2 52.6 66.1 37.3
In [2]:
# Preview the last rows to see where the data ends.
women_degrees.tail()
Out[2]:
Year Agriculture Architecture Art and Performance Biology Business Communications and Journalism Computer Science Education Engineering English Foreign Languages Health Professions Math and Statistics Physical Sciences Psychology Public Administration Social Sciences and History
37 2007 47.605026 43.100459 61.4 59.411993 49.000459 62.5 17.6 78.721413 16.8 67.874923 70.2 85.4 44.1 40.7 77.1 82.1 49.3
38 2008 47.570834 42.711730 60.7 59.305765 48.888027 62.4 17.8 79.196327 16.5 67.594028 70.2 85.2 43.3 40.7 77.2 81.7 49.4
39 2009 48.667224 43.348921 61.0 58.489583 48.840474 62.8 18.1 79.532909 16.8 67.969792 69.3 85.1 43.3 40.7 77.1 82.0 49.4
40 2010 48.730042 42.066721 61.3 59.010255 48.757988 62.5 17.6 79.618625 17.2 67.928106 69.0 85.0 43.1 40.2 77.0 81.7 49.3
41 2011 50.037182 42.773438 61.2 58.742397 48.180418 62.2 18.2 79.432812 17.5 68.426730 69.5 84.8 43.1 40.1 76.7 81.9 49.2

The data above shows the percentage of bachelor's degrees awarded to women in each subject from 1970 to 2011.

Visualizing the Gender Gap

In [3]:
# Draw both genders' share of Biology degrees on a single line chart.
fig = plt.figure()

years = women_degrees["Year"]
women_share = women_degrees['Biology']

# Only the women's percentage is stored; the remainder is plotted as men.
for share, colour, gender in ((women_share, 'blue', 'Women'),
                              (100 - women_share, 'green', "Men")):
    plt.plot(years, share, c=colour, label=gender)

plt.title("Percentage of Biology Degrees Awarded By Gender")
plt.legend(loc="upper right")
plt.show()

Hiding Tick-Marks

In [6]:
# Same Biology chart, now with the tick marks switched off on every side.
fig = plt.figure()

years = women_degrees['Year']
biology = women_degrees['Biology']
plt.plot(years, biology, label='Women', c='blue')
plt.plot(years, 100 - biology, c='green', label='Men')

# One False flag per side of the plot; tick labels are not affected.
hidden_ticks = dict.fromkeys(('top', 'bottom', 'right', 'left'), False)
plt.tick_params(**hidden_ticks)

plt.title("Percentage of Biology Degrees Awarded By Gender")
plt.legend(loc="upper right")
plt.show()

Hiding Spines

In [10]:
# Biology chart built through the Axes API, with tick marks and spines removed.
fig, ax = plt.subplots()

years = women_degrees['Year']
biology = women_degrees['Biology']
ax.plot(years, biology, label='Women', c='blue')
ax.plot(years, 100 - biology, label='Men', c='green')
ax.tick_params(top=False, bottom=False, right=False, left=False)

# Hide all four spines in one pass. The long-hand alternative is one call per
# side, e.g. ax.spines['right'].set_visible(False).
for spine in ax.spines.values():
    spine.set_visible(False)

ax.legend(loc='upper right')
ax.set_title('Percentage of Biology Degrees Awarded By Gender')

# Write the chart to disk, then display it.
plt.savefig('new.png')
plt.show()

Comparing Gender Gap Across Categories

In [11]:
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))

# One panel per category, placed on a 2x2 grid (subplot positions start at 1).
for position, category in enumerate(major_cats, start=1):
    ax = fig.add_subplot(2, 2, position)
    women_share = women_degrees[category]
    ax.plot(women_degrees['Year'], women_share, c='blue', label='Women')
    ax.plot(women_degrees['Year'], 100 - women_share, c='green', label='Men')

    # Same axis ranges on every panel so the charts can be compared directly.
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.tick_params(top=False, bottom=False, right=False, left=False)
    ax.set_title(category)

# pyplot.legend() targets the most recently created subplot, so only the last
# panel receives a legend.
plt.legend(loc='upper right')
plt.show()

We can conclude that Computer Science and Engineering have big gender gaps, while the gap in Biology and Math and Statistics is quite small. In addition, the first two degree categories are dominated by men while the latter degree categories are much more balanced.


In order to publish the data visualizations that we create, we have to be mindful of color blindness.

Thankfully, there are color palettes we can use that are friendly for people with color blindness. One of them is called Color Blind 10 and was released by Tableau, the company that makes the data visualization platform of the same name. We can navigate to this page and select the Color Blind 10 option from the list of palettes to see the ten colors included in the palette.

Setting Line Color using RGB

In [14]:
# Line colors as RGB tuples, each channel scaled from 0-255 down to 0-1.
# They do not change between panels, so they are defined once up front instead
# of being rebuilt on every loop iteration.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))

# One panel per entry of major_cats on a 2x2 grid (subplot positions start at 1).
for sp, category in enumerate(major_cats):
    ax = fig.add_subplot(2, 2, sp + 1)

    # Women's percentage comes from the data; men's is the remainder to 100.
    ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women')
    ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men')

    # Formatting Axes Objects
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    ax.tick_params(top=False, bottom=False, right=False, left=False)

# The legend is attached to the last subplot that was created.
plt.legend(loc='upper right')
plt.show()

Setting Line Width

In [17]:
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))

for panel in range(4):
    ax = fig.add_subplot(2, 2, panel + 1)
    category = major_cats[panel]
    women_pct = women_degrees[category]

    # linewidth is passed per plot call, so each line can be styled on its own.
    ax.plot(women_degrees['Year'], women_pct, c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_pct, c=cb_orange, label='Men', linewidth=3)

    # Strip spines and tick marks, and fix both axis ranges.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    ax.tick_params(top=False, bottom=False, right=False, left=False)

plt.legend(loc='upper right')
plt.show()

Improving the Layout and Ordering

In [29]:
stem_cats = ['Engineering', 'Computer Science', 'Physical Sciences', 'Math and Statistics', 'Biology', 'Psychology']

# A wide, short figure: the six charts sit side by side in a single row.
fig = plt.figure(figsize=(18, 3))

for slot, category in enumerate(stem_cats, start=1):
    ax = fig.add_subplot(1, 6, slot)
    women_pct = women_degrees[category]

    # Draw the women's line first, then the men's (the remainder to 100).
    for series, colour, gender in ((women_pct, cb_dark_blue, 'Women'),
                                   (100 - women_pct, cb_orange, 'Men')):
        ax.plot(women_degrees['Year'], series, c=colour, label=gender, linewidth=3)

    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    ax.tick_params(top=False, bottom=False, right=False, left=False)

plt.legend(loc='upper right')
plt.show()

Replacing the Legend with Annotations

In [30]:
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(18, 3))

# Text labels stand in for the legend. Only the first and last panels are
# annotated; each entry is (x, y, text) in data coordinates.
panel_labels = {
    0: ((2005, 87, 'Men'), (2002, 8, 'Women')),
    5: ((2004, 13, 'Men'), (1999, 83, 'Women')),
}

for sp in range(6):
    ax = fig.add_subplot(1, 6, sp + 1)
    category = stem_cats[sp]
    ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)

    # Panels without an entry get no annotation.
    for x_pos, y_pos, caption in panel_labels.get(sp, ()):
        ax.text(x_pos, y_pos, caption)

    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    ax.tick_params(top=False, bottom=False, right=False, left=False)

plt.show()

Comparing Across All Degrees

  • We will generate a 6 row by 3 column grid of subplots.
  • We will generate line charts for both male and female percentages:
    • In the first column, for every degree in stem_cats.
    • In the second column, for every degree in lib_arts_cats.
    • In the third column, for every degree in other_cats.
    • Add text annotations for Women and Men in the topmost and bottommost plots of each column.
In [47]:
stem_cats = ['Engineering', 'Computer Science', 'Psychology',
             'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']

other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

fig = plt.figure(figsize=(15, 18))


def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of gender-gap line charts on the 6x3 grid of ``fig``.

    Uses the notebook-level ``fig``, ``women_degrees``, ``cb_dark_blue`` and
    ``cb_orange``.

    axes_range -- number of charts to draw, starting from the top row.
    column     -- 1-based grid column (1, 2 or 3) that receives the charts.
    data       -- column names of ``women_degrees``, one per chart.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    last_row = axes_range - 1
    for row in range(axes_range):
        # Grid cells are numbered row by row, so stepping by 3 stays in one column.
        ax = fig.add_subplot(6, 3, row * 3 + column)
        category = data[row]
        women_pct = women_degrees[category]
        ax.plot(women_degrees['Year'], women_pct, c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_pct, c=cb_orange, label='Men', linewidth=3)

        # Strip spines and tick marks, and fix both axis ranges.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(category)
        ax.tick_params(bottom=False, top=False, left=False, right=False)

        # Only the first and last charts of the column carry text labels.
        if row == 0:
            men_y, women_y = Annot_1_M, Annot_1_F
        elif row == last_row:
            men_y, women_y = Annot_2_M, Annot_2_F
        else:
            continue
        ax.text(2006, men_y, 'Men')
        ax.text(2003, women_y, 'Women')


# One call per column: STEM, liberal arts, everything else.
for column_args in (
    (6, 1, stem_cats, 87, 8, 62, 35),
    (5, 2, lib_arts_cats, 20, 75, 56, 38),
    (6, 3, other_cats, 7, 90, 62, 36),
):
    create_axes_objects(*column_args)

Hiding x-axis Labels

  • We will disable the x-axis labels for all line charts except the bottommost line charts in each column.
In [48]:
stem_cats = ['Engineering', 'Computer Science', 'Psychology',
             'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']

other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

fig = plt.figure(figsize=(15, 18))


def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of line charts, with x-axis labels on the bottom chart only.

    Uses the notebook-level ``fig``, ``women_degrees``, ``cb_dark_blue`` and
    ``cb_orange``.

    axes_range -- number of charts to draw, starting from the top row.
    column     -- 1-based grid column (1, 2 or 3) that receives the charts.
    data       -- column names of ``women_degrees``, one per chart.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    for num in range(axes_range):
        is_bottom = num == axes_range - 1

        # Plotting Data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        category = data[num]
        ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)

        # Formatting Axes Objects
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(category)
        # x-axis labels are kept only on the bottom chart of the column. This is
        # decided independently of the annotation branch below, so a column
        # holding a single chart still gets its labels.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=is_bottom)

        if num == 0:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif is_bottom:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')


create_axes_objects(6, 1, stem_cats, 87, 8, 62, 35)
create_axes_objects(5, 2, lib_arts_cats, 20, 75, 56, 38)
create_axes_objects(6, 3, other_cats, 7, 90, 62, 36)

Setting y-axis Labels

  • For all plots, we will enable just the y-axis labels at 0 and 100.
In [49]:
stem_cats = ['Engineering', 'Computer Science', 'Psychology',
             'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']

other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

fig = plt.figure(figsize=(15, 18))


def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of line charts with y-axis ticks at 0 and 100 only.

    Uses the notebook-level ``fig``, ``women_degrees``, ``cb_dark_blue`` and
    ``cb_orange``.

    axes_range -- number of charts to draw, starting from the top row.
    column     -- 1-based grid column (1, 2 or 3) that receives the charts.
    data       -- column names of ``women_degrees``, one per chart.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    for num in range(axes_range):
        is_bottom = num == axes_range - 1

        # Plotting Data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        category = data[num]
        ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)

        # Formatting Axes Objects
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(category)
        # Keep y-axis labels at just the two ends of the range.
        ax.set_yticks([0, 100])
        # x-axis labels are kept only on the bottom chart of the column. This is
        # decided independently of the annotation branch below, so a column
        # holding a single chart still gets its labels.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=is_bottom)

        if num == 0:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif is_bottom:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')


create_axes_objects(6, 1, stem_cats, 87, 8, 62, 35)
create_axes_objects(5, 2, lib_arts_cats, 20, 75, 56, 38)
create_axes_objects(6, 3, other_cats, 7, 90, 62, 36)

Adding A Horizontal Line

  • For all plots, we will generate a horizontal line with the following properties:
    • Starts at the y-axis position 50
    • Set to the third color (light gray) in the Color Blind 10 palette
    • Has a transparency of 0.3
In [50]:
stem_cats = ['Engineering', 'Computer Science', 'Psychology',
             'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']

other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

fig = plt.figure(figsize=(15, 18))


def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of line charts, each with a faint reference line at 50.

    Uses the notebook-level ``fig``, ``women_degrees``, ``cb_dark_blue`` and
    ``cb_orange``.

    axes_range -- number of charts to draw, starting from the top row.
    column     -- 1-based grid column (1, 2 or 3) that receives the charts.
    data       -- column names of ``women_degrees``, one per chart.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    for num in range(axes_range):
        is_bottom = num == axes_range - 1

        # Plotting Data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        category = data[num]
        ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)

        # Formatting Axes Objects
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(category)
        ax.set_yticks([0, 100])
        # Light-gray horizontal line at y=50, drawn at 0.3 opacity.
        ax.axhline(50, c=(171/255, 171/255, 171/255), alpha=0.3)
        # x-axis labels are kept only on the bottom chart of the column. This is
        # decided independently of the annotation branch below, so a column
        # holding a single chart still gets its labels.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=is_bottom)

        if num == 0:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif is_bottom:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')


create_axes_objects(6, 1, stem_cats, 87, 8, 62, 35)
create_axes_objects(5, 2, lib_arts_cats, 20, 75, 56, 38)
create_axes_objects(6, 3, other_cats, 7, 90, 62, 36)

Exporting A File

  • Finally, we will export the figure containing all of the line charts to "gender_degrees.png".
In [51]:
stem_cats = ['Engineering', 'Computer Science', 'Psychology',
             'Biology', 'Physical Sciences', 'Math and Statistics']

lib_arts_cats = ['Foreign Languages', 'English', 'Communications and Journalism',
                 'Art and Performance', 'Social Sciences and History']

other_cats = ['Health Professions', 'Public Administration', 'Education',
              'Agriculture', 'Business', 'Architecture']

# Only needed for the optional backend check near the end of this cell.
import matplotlib as mpb

fig = plt.figure(figsize=(15, 18))


def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of the final gender-gap figure on the 6x3 grid of ``fig``.

    Uses the notebook-level ``fig``, ``women_degrees``, ``cb_dark_blue`` and
    ``cb_orange``.

    axes_range -- number of charts to draw, starting from the top row.
    column     -- 1-based grid column (1, 2 or 3) that receives the charts.
    data       -- column names of ``women_degrees``, one per chart.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    for num in range(axes_range):
        is_bottom = num == axes_range - 1

        # Plotting Data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        category = data[num]
        ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)

        # Formatting Axes Objects
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(category)
        ax.set_yticks([0, 100])
        # Light-gray horizontal line at y=50, drawn at 0.3 opacity.
        ax.axhline(50, c=(171/255, 171/255, 171/255), alpha=0.3)
        # x-axis labels are kept only on the bottom chart of the column. This is
        # decided independently of the annotation branch below, so a column
        # holding a single chart still gets its labels.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=is_bottom)

        if num == 0:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif is_bottom:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')


create_axes_objects(6, 1, stem_cats, 87, 8, 62, 35)
create_axes_objects(5, 2, lib_arts_cats, 20, 75, 56, 38)
create_axes_objects(6, 3, other_cats, 7, 90, 62, 36)

# Uncomment to check which matplotlib backend is rendering the figure.
# print(mpb.get_backend())

# Export the finished figure, then display it.
plt.savefig("gender_degrees.png")
plt.show()