#!/usr/bin/env python
# coding: utf-8

# # Guided Project No. 05: Visualizing the Gender Gap in College Degrees
#
# Welcome to my notebook for the fifth guided project for Dataquest's Data Scientist in Python path. This time, we're going to practice improving plot aesthetics and making our visualizations as effective as possible.
#
# We will be using [data compiled by Randal Olson](http://www.randalolson.com/wp-content/uploads/percent-bachelors-degrees-women-usa.csv) on the percentage of bachelor's degrees awarded to women in the USA. The [raw data can be found at the website of the National Center for Education Statistics](https://nces.ed.gov/programs/digest/2013menu_tables.asp).
#
# First, we start by loading the modules and the data.

# In[1]:


# Turn on inline figure rendering. `get_ipython` only exists inside an
# IPython/Jupyter session, so this line is what ties the script to a notebook.
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import matplotlib.pyplot as plt

# Read the table of degree shares; the cells below rely on a 'Year' column
# plus one column per major.
women_degrees = pd.read_csv(filepath_or_buffer='percent-bachelors-degrees-women-usa.csv')


# In presenting colored visualizations, we may want to consider whether the colors we use can be differentiated by color-blind people. To take this into account, we'll use a color-blind-friendly color palette. Let's load in the adjusted RGB values for those.

# In[2]:


# Palette colors as RGB tuples; matplotlib expects each channel scaled from
# the 0-255 range down to 0-1.
cb_dark_blue = tuple(channel / 255 for channel in (0, 107, 164))
cb_orange = tuple(channel / 255 for channel in (255, 128, 14))
cb_gray = tuple(channel / 255 for channel in (171, 171, 171))


# Now, let's begin generating our first graph. We only consider the STEM degrees to set up our first few lines of code for generating the visualizations.
# In[3]:


# STEM majors in the order their panels appear, left to right.
stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']

# Only the first and last panels carry in-plot line labels. Each entry maps a
# panel index to its (x, y, text) annotations in data coordinates.
edge_labels = {
    0: [(2005, 87, 'Men'), (2002, 8, 'Women')],
    5: [(2005, 62, 'Men'), (2001, 35, 'Women')],
}

fig = plt.figure(figsize=(18, 3))

for sp, major in enumerate(stem_cats):
    ax = fig.add_subplot(1, len(stem_cats), sp + 1)

    # Women's share is read from the data; men's share is its complement.
    ax.plot(women_degrees['Year'], women_degrees[major], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[major], c=cb_orange, label='Men', linewidth=3)

    # Strip the frame around each panel.
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major)

    # Hide the tick marks. These arguments must be real booleans: the old
    # 'off' strings are truthy in current Matplotlib and would leave the
    # ticks visible.
    ax.tick_params(right=False, left=False, bottom=False, top=False)

    for x, y, text in edge_labels.get(sp, ()):
        ax.text(x, y, text)


# Let's also have a quick look at our data set first before moving forward.

# In[4]:


women_degrees.head()


# In[5]:


print(women_degrees.shape)
women_degrees.describe()


# We see that we have 17 college degrees with varying patterns in terms of female representation. In the next section, we will generate a similar graph but containing all the college degrees in our data set.

# ## Comparing across all degrees
#
# Before we begin generating the charts for all seventeen majors, we will categorize them into three groups:
# - STEM
# - Liberal Arts
# - Other
#
# After organizing them into the different categories, we will arrange them in descending order based on the percentage of women for the last year in our data set.
# In[6]:


# Assign every degree column to one of the three categories.
stem = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics', 'Physical Sciences', 'Psychology']
lib_arts = ['Art and Performance', 'Communications and Journalism', 'English', 'Foreign Languages', 'Social Sciences and History']
other = ['Agriculture', 'Architecture', 'Business', 'Education', 'Health Professions', 'Public Administration']


def _order_by_final_share(majors):
    """Return the given degree columns reordered by their last-row value, highest first."""
    subset = women_degrees[majors]
    # Sorting along axis=1 reorders columns by the values in one row; the row
    # is picked by label, which is row-count - 1 for the final row of a
    # 0-based integer index.
    final_row_label = subset.shape[0] - 1
    return subset.sort_values(final_row_label, axis=1, ascending=False)


# Reorder the columns inside each category by the ending share of degrees awarded to women.
stem_sorted = _order_by_final_share(stem)
lib_arts_sorted = _order_by_final_share(lib_arts)
other_sorted = _order_by_final_share(other)

# The sorted column names drive the panel order in the figures below.
stem_cats = stem_sorted.columns.tolist()
lib_arts_cats = lib_arts_sorted.columns.tolist()
other_cats = other_sorted.columns.tolist()


# Let's see the categories and their ordering.

# In[7]:


print(stem_cats)
print(lib_arts_cats)
print(other_cats)


# Let's now generate our figure with subplots arranged in six rows and three columns (6 by 3). The first column is where we will graph the STEM degrees, the second column is for the liberal arts degrees, and the third column for the other degrees.
# In[8]:


# In-plot line labels for each column, keyed by the panel's row index within
# that column. Each entry is an (x, y, text) annotation in data coordinates.
stem_labels = {
    0: [(2005, 10, 'Men'), (2005, 85, 'Women')],
    5: [(2005, 90, 'Men'), (2005, 5, 'Women')],
}
lib_arts_labels = {
    0: [(2005, 15, 'Men'), (2005, 80, 'Women')],
}
other_labels = {
    0: [(2005, 0, 'Men'), (2005, 95, 'Women')],
    5: [(2005, 65, 'Men'), (2005, 30, 'Women')],
}

# One (majors, labels) pair per figure column, left to right:
# STEM, liberal arts, other.
grid_columns = [
    (stem_cats, stem_labels),
    (lib_arts_cats, lib_arts_labels),
    (other_cats, other_labels),
]


def plot_gender_gap_grid(columns, *, hide_inner_xlabels=False, yticks=None, midline=False):
    """Draw the women/men degree-share grid and return the figure.

    Each element of ``columns`` fills one column of subplots from top to
    bottom. Every panel plots the women's share for one major and its
    complement (100 minus that share) for men, using the module-level
    ``women_degrees`` table and color palette.

    Args:
        columns: Sequence of ``(majors, labels)`` pairs. ``majors`` is the
            ordered list of degree column names for that figure column.
            ``labels`` maps a row index to a list of ``(x, y, text)``
            annotations to place on that panel.
        hide_inner_xlabels: When true, x tick labels are shown only on the
            last panel of each column.
        yticks: Optional list of y tick positions; ``None`` keeps
            matplotlib's default ticks.
        midline: When true, draw a faint gray horizontal line at y=50.

    Returns:
        The created ``matplotlib.figure.Figure``.
    """
    n_cols = len(columns)
    n_rows = max(len(majors) for majors, _ in columns)
    fig = plt.figure(figsize=(18, 12))

    for col, (majors, labels) in enumerate(columns):
        bottom_row = len(majors) - 1
        for row, major in enumerate(majors):
            # Subplot positions are numbered row by row, starting at 1.
            ax = fig.add_subplot(n_rows, n_cols, row * n_cols + col + 1)
            ax.plot(women_degrees['Year'], women_degrees[major], c=cb_dark_blue, label='Women', linewidth=3)
            ax.plot(women_degrees['Year'], 100 - women_degrees[major], c=cb_orange, label='Men', linewidth=3)

            for side in ('right', 'left', 'top', 'bottom'):
                ax.spines[side].set_visible(False)

            ax.set_xlim(1968, 2011)
            ax.set_ylim(0, 100)
            ax.set_title(major)

            # Visibility flags must be real booleans: the old 'off'/'on'
            # strings are both truthy in current Matplotlib, so they would
            # neither hide the ticks nor hide the labels.
            ax.tick_params(right=False, left=False, bottom=False, top=False)
            if hide_inner_xlabels:
                ax.tick_params(labelbottom=(row == bottom_row))

            if yticks is not None:
                ax.set_yticks(yticks)
            if midline:
                ax.axhline(50, c=cb_gray, alpha=0.3)

            for x, y, text in labels.get(row, ()):
                ax.text(x, y, text)

    return fig


# Generating the figure: one column each for STEM, liberal arts and other degrees
fig = plot_gender_gap_grid(grid_columns)
fig.tight_layout()


# ## Hiding x-axis labels
#
# The repeating x-axis labels (years) on every subplot clutter our figure, so we will remove them except on the bottom subplot of each column.

# In[9]:


# Same grid, with labelbottom=False on every panel except the bottom one of each column
fig = plot_gender_gap_grid(grid_columns, hide_inner_xlabels=True)
fig.tight_layout()


# ## Setting y-axis labels
#
# In order to further reduce the amount of clutter in our figure, we will also remove most y-axis tick labels and retain only zero (0) and one hundred (100).

# In[10]:


# Same grid, keeping only the 0 and 100 y ticks
fig = plot_gender_gap_grid(grid_columns, hide_inner_xlabels=True, yticks=[0, 100])
fig.tight_layout()


# ## Adding a horizontal line
#
# Since we removed most of the y-axis tick labels, it's now more difficult to eyeball the values for the line graphs. In order to remedy this, we will add a horizontal gray line at the 50-mark in the y-axis. This will help our readers see much more easily the degree of discrepancy in terms of gender gaps.

# In[11]:


# Same grid, with a horizontal reference line at y=50
fig = plot_gender_gap_grid(grid_columns, hide_inner_xlabels=True, yticks=[0, 100], midline=True)


# ## Exporting to a file
#
# Finally, we may want to save the figure we generated in a separate file. Documentation for saving figures can be found [here](https://matplotlib.org/3.3.3/api/_as_gen/matplotlib.pyplot.savefig.html).

# In[12]:


# Regenerate the final figure and write it to disk
fig = plot_gender_gap_grid(grid_columns, hide_inner_xlabels=True, yticks=[0, 100], midline=True)

# Saving figure
fig.savefig('gender_gaps_college.png', dpi=300)


# ## Conclusion and Learnings
#
# In this project, we learned how to manipulate or adjust various chart elements to improve readability of our visualizations. As for the data itself, we showed that gender gaps vary across different degrees although the general trend is that the percentage of women awarded degrees in different majors has been somewhat increasing.