#!/usr/bin/env python
# coding: utf-8

# # Percentage of Bachelor’s degrees conferred to women in the U.S., by major
#
# This notebook provides the methodology and code used in the blog post, [Percentage of Bachelor’s degrees conferred to women in the U.S., by major (1970-2014)](http://www.randalolson.com/2014/06/14/percentage-of-bachelors-degrees-conferred-to-women-by-major-1970-2012/).
#
# ### Notebook by [Randal S. Olson](http://www.randalolson.com)
#
# Please see the [repository README file](https://github.com/rhiever/Data-Analysis-and-Machine-Learning-Projects#license) for the licenses and usage terms for the instructional material and code in this notebook. In general, I have licensed this material so that it is as widely useable and shareable as possible.
#
# ### Required Python libraries
#
# If you don't have Python on your computer, you can use the [Anaconda Python distribution](http://continuum.io/downloads) to install most of the Python packages you need. Anaconda provides a simple double-click installer for your convenience.
#
# This code uses base Python libraries except for the `BeautifulSoup`, `requests`, `lxml`, `pandas`, and `matplotlib` packages. You can install these packages using `pip` by typing the following command into your command line:
#
# > pip install beautifulsoup4 requests lxml pandas matplotlib
#
# If you're on a Mac, Linux, or Unix machine, you may need to type `sudo` before the command to install the package with administrator privileges.
#
# ### Scraping the NCES database
#
# To acquire the data used in the blog post, we need to scrape the [NCES database](http://nces.ed.gov/programs/digest/current_tables.asp). The NCES database is available for download as Excel files, but we didn't want to deal with a bunch of Excel files. Instead, let's scrape the NCES database web pages directly using [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/).
#
# Running the code below will create a file called `gender_degree_data.tsv` that contains all data about the gender breakdowns of various degree majors in the US.

# In[1]:

from bs4 import BeautifulSoup
import requests

# Prefix that every table link's href is appended to
NCES_DIGEST_URL = 'http://nces.ed.gov/programs/digest/'

# Seconds to wait on each HTTP request before giving up instead of hanging
REQUEST_TIMEOUT = 30

# Header row of the output file
TSV_COLUMNS = [
    'Year',
    'Degree_Major',
    'Total_Bachelors',
    'Percent_change_Bachelors',
    'Male_Bachelors',
    'Female_Bachelors',
    'Female_percent_Bachelors',
    'Total_Masters',
    'Male_Masters',
    'Female_Masters',
    'Total_Doctorates',
    'Male_Doctorates',
    'Female_Doctorates',
]

# Single-pass cell clean-up: drop the ',' separators and write the '†' and '#'
# placeholder symbols as 0 so that every cell holds a plain number
_CELL_CLEANUP = str.maketrans({',': None, '†': '0', '#': '0'})


def _fetch_soup(url):
    """Download `url` and return the page parsed with the lxml parser.

    Raises `requests.HTTPError` for a 4xx/5xx answer and
    `requests.Timeout` when the server does not respond within
    `REQUEST_TIMEOUT` seconds, so a failed download stops the scrape
    instead of silently producing an empty or partial data file.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


# Write UTF-8 explicitly: pandas reads the file back as UTF-8 by default
with open('gender_degree_data.tsv', 'w', encoding='utf-8') as out_file:
    out_file.write('\t'.join(TSV_COLUMNS) + '\n')

    table_list = _fetch_soup(NCES_DIGEST_URL + 'current_tables.asp')

    for link in table_list.find_all('a', href=True):
        # We only want the tables that stratify the data by degree and gender,
        # which are in table group 325
        if 'dt15_325' not in link['href']:
            continue

        # Of those, keep only the tables whose number after the dot is a
        # multiple of 5; links whose text is not of the form "325.NN" are skipped
        try:
            table_number = int(link.text.split('.')[1])
        except (IndexError, ValueError):
            continue
        if table_number % 5 != 0:
            continue

        table_page = _fetch_soup(NCES_DIGEST_URL + link['href'])

        # The major is the part of the page title between "Degrees in" and "conferred"
        page_title = table_page.find('title').text
        degree_major = page_title.split('Degrees in')[1].split('conferred')[0].strip()

        for tr in table_page.find_all('tr'):
            # We only want to parse entries that correspond to a certain year
            year_header = tr.find('th')
            if year_header is None:
                continue
            header_text = year_header.text

            # Stop parsing after all of the years are listed
            if 'Percent change' in header_text:
                break

            # Years always have a dash (-) in them
            if '-' not in header_text:
                continue

            # The row is labelled with the number before the dash; record the
            # following year
            year = str(int(header_text.split('-')[0]) + 1)
            year_vals = [td.text.translate(_CELL_CLEANUP) for td in tr.find_all('td')]

            out_file.write('\t'.join([year, degree_major] + year_vals) + '\n')


# ### Visualizing the gender breakdowns for the various degree programs
#
# Next, let's use [matplotlib](http://matplotlib.org) to visualize the trends in the gender breakdowns.
# In[2]:

# The inline-backend magic only exists when running under IPython/Jupyter;
# skip it when this file is executed as a plain Python script
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import matplotlib.pyplot as plt
import pandas as pd

# This is my custom style that does most of the plot formatting
plt.style.use('https://gist.githubusercontent.com/rhiever/a4fb39bfab4b33af0018/raw/b25b4ba478c2e163dd54fd5600e80ca7453459da/tableau20.mplstyle')

# Load the scraped data, keep 1970 onwards and index the rows by year.
# Chaining the steps yields a fresh frame, so the column added below is not
# written into a slice of the original one.
degree_gender_data = pd.read_csv('gender_degree_data.tsv', sep='\t')
degree_gender_data = (
    degree_gender_data.loc[degree_gender_data['Year'] >= 1970]
    .set_index('Year')
)

# Last value in each degree major's time series; it drives both the colour
# ranking and the vertical position of the labels
last_female_pct = degree_gender_data.groupby('Degree_Major')['Female_percent_Bachelors'].last()

# Create a list of the degree majors ranked by their last value in the time series
# We'll use this list to determine what colors the degree majors are assigned
degree_major_order = last_female_pct.sort_values(ascending=False).index.values
degree_major_order_dict = {major: rank for rank, major in enumerate(degree_major_order)}
degree_gender_data['Degree_Major_Order'] = degree_gender_data['Degree_Major'].map(degree_major_order_dict)

# One line per major, drawn in rank order so that the rank picks the colour
degree_gender_data.groupby('Degree_Major_Order')['Female_percent_Bachelors'].plot(figsize=(10, 12))

plt.xlabel('')
plt.yticks(range(0, 91, 10), ['{}%'.format(x) for x in range(0, 91, 10)])
plt.xlim(1969, 2014)
plt.ylim(-1, 90)
plt.title('Percentage of Bachelor\'s degrees conferred to women '
          'in the U.S.A., by major (1970-2014)\n', fontsize=14)
plt.grid(False, axis='x')

degree_major_pcts = last_female_pct.to_dict()
degree_major_color_map = [x['color'] for x in plt.rcParams['axes.prop_cycle']]

# We use this dictionary to rename the degree majors to shorter names
degree_major_name_map = {
    'the social sciences and history': 'Social Sciences and History',
    'the health professions and related programs': 'Health Professions',
    'visual and performing arts': 'Art and Performance',
    'foreign languages and literatures': 'Foreign Languages',
    'engineering and engineering technologies': 'Engineering',
    'the biological and biomedical sciences': 'Biology',
    'mathematics and statistics': 'Math and Statistics',
    'agriculture and natural resources': 'Agriculture',
    'the physical sciences and science technologies': 'Physical Sciences',
    'communication, journalism, and related '
    'programs and in communications technologies': 'Communications\nand Journalism',
    'public administration and social services': 'Public Administration',
    'psychology': 'Psychology',
    'English language and literature/letters': 'English',
    'computer and information sciences': 'Computer Science',
    'education': 'Education',
    'business': 'Business',
    'architecture and related services': 'Architecture',
}

# We use these offsets to prevent the degree major labels from overlapping
degree_major_offset_map = {
    'foreign languages and literatures': 1.0,
    'English language and literature/letters': -0.5,
    'agriculture and natural resources': 0.5,
    'business': -0.5,
    'architecture and related services': 0.75,
    'mathematics and statistics': -0.75,
    'engineering and engineering technologies': 0.75,
    'computer and information sciences': -0.75,
}

# Draw the degree major labels at the end of the time series lines
for degree_major, last_pct in degree_major_pcts.items():
    # Wrap around the colour list so a label still gets a colour when there
    # are more majors than colours in the cycle
    color_index = degree_major_order_dict[degree_major] % len(degree_major_color_map)
    plt.text(2014.5,
             last_pct - 0.5 + degree_major_offset_map.get(degree_major, 0),
             # Fall back to the scraped name for a major without a short name
             degree_major_name_map.get(degree_major, degree_major),
             color=degree_major_color_map[color_index])

plt.text(1967, -9,
         '\nData source: nces.ed.gov/programs/digest/current_tables.asp (Tables 325.*)\n'
         'Author: Randy Olson (randalolson.com / @randal_olson)',
         ha='left', fontsize=10)

plt.savefig('pct-bachelors-degrees-women-usa-1970-2014.png')