#!/usr/bin/env python
# coding: utf-8

# # Preliminary Analysis of Gender Participation in AAS Talks
# Initial analysis led by Brett Morris and Jim Davenport.

# In[25]:

try:
    get_ipython().run_line_magic('pylab', 'inline')
except NameError:
    # get_ipython only exists inside IPython/Jupyter. When the file is run as
    # a plain script the magic is skipped; numpy and pyplot are imported
    # explicitly below.
    pass

from urllib.request import urlopen
import datetime
import numpy as np
import matplotlib
matplotlib.rcParams['font.size'] = 15
from matplotlib import pyplot as plt

stableURL = 'https://raw.githubusercontent.com/jradavenport/aas225-gender/master/data.csv'

# Download the survey responses. The payload is bytes, so it is decoded before
# splitting: calling str() on a bytes line would embed the b'...' wrapper in
# the first and last fields and inflate the question strings by one character.
with urlopen(stableURL) as response:
    raw = response.read().decode('utf-8', errors='replace').splitlines()

# Parallel lists with one entry per unique talk ID. Columns of each row are
# used as: 0 -> num, 1 -> time, 2 -> sid, 3 -> speaker, 4 -> qs, 5+ -> comments.
num = []
time = []
sid = []
speaker = []
qs = []
comments = []

# Talk ID -> position in the lists above, so duplicate detection does not
# rescan `sid` for every row.
_sid_position = {}

for line in raw:
    # Keep only the text before the first ';', split it into columns and
    # discard empty / single-space fields.
    splitline = [j for j in line.split(';')[0].split(',')
                 if j != ' ' and j != '']

    # Require all columns filled and only one entry (M, F) for speaker gender.
    if len(splitline) < 5 or len(splitline[3]) != 1:
        continue
    questions = splitline[4].upper()
    if 'M' not in questions and 'F' not in questions:
        continue

    talk_id = splitline[2]
    if not talk_id.replace('.', '').isdigit():
        continue

    # Fix sid if necessary: zero-pad the part after the '.' to two digits.
    if len(talk_id) != 6:
        splitsid = talk_id.split('.')
        if len(splitsid) < 2:
            # No '.' at all: there is nothing to pad, skip the malformed row
            # instead of failing with an IndexError.
            continue
        talk_id = '.'.join([splitsid[0], splitsid[1].zfill(2)])

    entry_comments = ','.join(splitline[5:])
    position = _sid_position.get(talk_id)
    if position is None:
        # This talk ID has not been added yet.
        _sid_position[talk_id] = len(sid)
        num.append(splitline[0])
        time.append(splitline[1])
        sid.append(talk_id)
        speaker.append(splitline[3].upper())
        qs.append(questions)
        comments.append(entry_comments)
    elif len(qs[position]) < len(questions):
        # This talk ID was added already and the new entry records more
        # questions: replace the earlier entry with the longer one.
        num[position] = splitline[0]
        time[position] = splitline[1]
        sid[position] = talk_id
        speaker[position] = splitline[3].upper()
        qs[position] = questions
        comments[position] = entry_comments


# ## Overall demographics

# In[26]:

def annotateformat(number):
    """Return the text used to label a bar of height *number*.

    Integers and values of at least 1 are rendered as whole numbers; other
    values (fractions below 1) are rendered with two decimals. Checking for
    ``int`` first makes a zero count print as ``0`` rather than ``0.00``.
    """
    if isinstance(number, int) or number >= 1:
        return '%d' % number
    return '%.2f' % number


def nicehist(axis, female, male, title):
    """Draw a two-bar female/male chart on *axis*.

    Parameters
    ----------
    axis : matplotlib axes to draw on.
    female, male : bar heights (counts or fractions).
    title : axes title.
    """
    # align='edge' pins the bars to [0, 0.5] and [1, 1.5], so the ticks and
    # annotations at 0.25 / 1.25 sit on the bar centres regardless of the
    # matplotlib default bar alignment.
    axis.bar([0, 1], [female, male], color='k', width=0.5, align='edge')
    # Fix the tick positions before assigning their labels.
    axis.set_xticks([0.25, 1.25])
    axis.set_xticklabels(['Female', 'Male'])
    axis.set_xlim([-0.2, 1.7])
    axis.annotate(annotateformat(female), xy=(0.25, female),
                  va='bottom', ha='center')
    axis.annotate(annotateformat(male), xy=(1.25, male),
                  va='bottom', ha='center')
    axis.set_title(title)
    # Hide the top/right spines and keep ticks on the bottom/left only.
    axis.spines['right'].set_color('none')
    axis.spines['top'].set_color('none')
    axis.xaxis.set_ticks_position('bottom')
    axis.yaxis.set_ticks_position('left')


Nfemalespeakers = speaker.count('F')
Nmalespeakers = speaker.count('M')

fig, ax = plt.subplots(1)
nicehist(ax, Nfemalespeakers, Nmalespeakers,
         'Gender of Speakers ({0} Total)'.format(Nfemalespeakers+Nmalespeakers))
plt.show()

print('Male speakers/All speakers= {0:.1f}%'
      .format(100*float(Nmalespeakers)/(Nfemalespeakers + Nmalespeakers)))
print('Female speakers/All speakers = {0:.1f}%'
      .format(100*float(Nfemalespeakers)/(Nfemalespeakers + Nmalespeakers)))


# In[27]:

# One long string holding every recorded question-asker, in order.
allqs = ''.join([i for i in qs if 'M' in i or 'F' in i])
Nfemaleqs = allqs.count('F')
Nmaleqs = allqs.count('M')

fig, ax = plt.subplots(1)
nicehist(ax, Nfemaleqs, Nmaleqs,
         'Gender of Question-Askers (%d Total)' % (Nfemaleqs+Nmaleqs))
plt.show()

print('Male qs/All qs = {0:.1f}%'.format((100*float(Nmaleqs)/(Nfemaleqs + Nmaleqs))))
print('Female qs/All qs = {0:.1f}%'.format((100*float(Nfemaleqs)/(Nfemaleqs + Nmaleqs))))


# In[28]:

fig, ax = plt.subplots(1)
Nbins = 7
# Distribution of the number of questions recorded per talk. The fill is
# white, so an explicit edge colour is needed for the outline to be drawn
# when patches have no edge by default.
ax.hist([len(i) for i in qs], Nbins, histtype='stepfilled', color='w',
        edgecolor='k', lw=2)
ax.set_title('Number of Questions Per Talk:')
ax.set_xlabel('Number of Questions')
plt.show()

# Same distribution, split by the gender of the speaker. `speaker` and `qs`
# are parallel lists, so they can be walked together.
malespeaker_Nqs = [len(q) for s, q in zip(speaker, qs) if s == 'M']
femalespeaker_Nqs = [len(q) for s, q in zip(speaker, qs) if s == 'F']

fig, ax = plt.subplots(1)
ax.hist(malespeaker_Nqs, Nbins, histtype='step', color='b', lw=2,
        label='Male Speaker', range=[1,8])
ax.hist(femalespeaker_Nqs, Nbins, histtype='step', color='r', lw=2,
        label='Female Speaker', range=[1,8])
ax.set_title('Number of Questions Per Talk:')
ax.set_xlabel('Number of Questions')
ax.legend()
plt.show()


# # Who asks questions first?

# In[29]:

# startswith() gives the same answer as i[0] == ... for non-empty strings and
# does not raise on an empty one.
Nmalesfirst = sum(1 for i in qs if i.startswith('M'))
Nfemalesfirst = sum(1 for i in qs if i.startswith('F'))

fig, ax = plt.subplots(1)
nicehist(ax, Nfemalesfirst, Nmalesfirst,
         'Gender of First Question-Askers ({0} Total)'.format(Nfemalesfirst+Nmalesfirst))
plt.show()

print('Male first qs/All qs = {0:.1f}%'.format((100*float(Nmalesfirst)/(Nfemalesfirst + Nmalesfirst))))
print('Female first qs/All qs = {0:.1f}%'.format((100*float(Nfemalesfirst)/(Nfemalesfirst + Nmalesfirst))))


# ## When males/females ask first Q, are subsequent Qs affected?
# In[30]:

def _equal_weights(values):
    """Return weights that make a histogram of *values* sum to 1.

    Returns None for an empty sequence so that no division by zero occurs.
    """
    count = len(values)
    if count == 0:
        return None
    return [1. / count] * count


def _fraction_female_after_first(question_strings):
    """Per talk, the fraction of questions after the first asked by females.

    Talks with fewer than two questions have no subsequent question and are
    left out. The slice is applied to each question string (dropping the
    opening question), not to the list of talks.
    """
    return [i[1:].count('F') / float(len(i) - 1)
            for i in question_strings if len(i) >= 2]


# Question strings of the talks whose first question came from a male/female.
malesfirst = [i for i in qs if i[0] == 'M']
malesfirst_percentagefemaleafter = _fraction_female_after_first(malesfirst)

femalesfirst = [i for i in qs if i[0] == 'F']
femalesfirst_percentagefemaleafter = _fraction_female_after_first(femalesfirst)

fig, ax = plt.subplots(1, 2, figsize=(16,8))
Nbins = 5
ax[0].hist(malesfirst_percentagefemaleafter, Nbins, color='k', range=[0,1],
           histtype='step', lw=2,
           weights=_equal_weights(malesfirst_percentagefemaleafter))
ax[0].set_title('Male Asks First Question')

ax[1].hist(femalesfirst_percentagefemaleafter, Nbins, color='k', range=[0,1],
           histtype='step', lw=2,
           weights=_equal_weights(femalesfirst_percentagefemaleafter))
ax[1].set_title('Female Asks First Question')

for axes in ax:
    axes.set_xlim([0, 1])
    axes.set_xlabel("Fraction subsequent Q's asked by females")
    axes.set_ylabel("Fraction of all talks")
    axes.set_ylim([0, 1])
plt.show()


# In[31]:

# Same two distributions overlaid on a single set of axes.
fig, ax = plt.subplots(1, figsize=(10,8))
Nbins = 5
ax.hist(malesfirst_percentagefemaleafter, Nbins, color='b', range=[0,1],
        histtype='step', lw=3,
        weights=_equal_weights(malesfirst_percentagefemaleafter),
        label='Male Asks First Question')

ax.hist(femalesfirst_percentagefemaleafter, Nbins, color='r', range=[0,1],
        histtype='step', lw=3,
        weights=_equal_weights(femalesfirst_percentagefemaleafter),
        label='Female Asks First Question')

ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel("Fraction subsequent Q's asked by females")
ax.set_ylabel("Fraction of all talks")
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.legend()
plt.show()


# ## When M/F asks first Q, who asks following questions?
# In[32]:

# Questions asked after the opening one, pooled over all talks of each group.
# The slice is applied to every question string (i[1:]) so the opening
# question is excluded from both the female count and the denominator.
malesfirst_after = ''.join([i[1:] for i in malesfirst])
malesfirst_lenafter = float(len(malesfirst_after))
malesfirst_Nfemaleafter = malesfirst_after.count('F')/malesfirst_lenafter

femalesfirst_after = ''.join([i[1:] for i in femalesfirst])
femalesfirst_lenafter = float(len(femalesfirst_after))
femalesfirst_Nfemaleafter = femalesfirst_after.count('F')/femalesfirst_lenafter

# The "Total" in each title is the number of follow-up questions the plotted
# fractions are taken over.
fig, ax = plt.subplots(1, 2, figsize=(16,8))
nicehist(ax[0], malesfirst_Nfemaleafter, 1-malesfirst_Nfemaleafter,
         'Male Asks First Q ({0} Total)'.format(int(malesfirst_lenafter)))
nicehist(ax[1], femalesfirst_Nfemaleafter, 1-femalesfirst_Nfemaleafter,
         'Female Asks First Q ({0} Total)'.format(int(femalesfirst_lenafter)))
for axes in ax:
    axes.set_ylim([0,1])
plt.show()


# In[33]:

# Question strings grouped by the gender of the speaker. `speaker` and `qs`
# are parallel lists.
malespeaker_qs = [q for s, q in zip(speaker, qs) if s == 'M']
femalespeaker_qs = [q for s, q in zip(speaker, qs) if s == 'F']

malespeaker_allqs = ''.join(malespeaker_qs)
malespeaker_Nfemaleqs = malespeaker_allqs.count('F')
malespeaker_Nquestions = len(malespeaker_allqs)
malespeaker_percentagefemaleqs = malespeaker_Nfemaleqs/float(malespeaker_Nquestions)

femalespeaker_allqs = ''.join(femalespeaker_qs)
femalespeaker_Nfemaleqs = femalespeaker_allqs.count('F')
femalespeaker_Nquestions = len(femalespeaker_allqs)
femalespeaker_percentagefemaleqs = femalespeaker_Nfemaleqs/float(femalespeaker_Nquestions)

fig, ax = plt.subplots(1, 2, figsize=(16,8))
nicehist(ax[0], malespeaker_percentagefemaleqs, 1-malespeaker_percentagefemaleqs,
         'Male Speaker ({0} Total)'.format(malespeaker_Nquestions))
nicehist(ax[1], femalespeaker_percentagefemaleqs, 1-femalespeaker_percentagefemaleqs,
         'Female Speaker ({0} Total)'.format(femalespeaker_Nquestions))
for axes in ax:
    axes.set_xlabel("Fraction Q's asked")
    axes.set_ylim([0,1])
plt.show()


# ## When is first question asked by female?
# In[34]:

def getfirstindex(searchstring, letter):
    """Return the 1-based position of the first *letter* in *searchstring*.

    Returns None when *letter* does not occur.
    """
    position = searchstring.find(letter)
    if position == -1:
        return None
    return position + 1


# Position of the first female question, for talks that had at least one.
malespeaker_orderfemale = [getfirstindex(i, "F") for i in malespeaker_qs if 'F' in i]
femalespeaker_orderfemale = [getfirstindex(i, "F") for i in femalespeaker_qs if 'F' in i]

fig, ax = plt.subplots(1, figsize=(10,8))
Nbins = 3
ax.hist(malespeaker_orderfemale, Nbins, color='b', range=[1,3],
        histtype='step', lw=3, label='Male Speaker')

ax.hist(femalespeaker_orderfemale, Nbins, color='r', range=[1,3],
        histtype='step', lw=3, label='Female Speaker')

ax.set_xlim([1, 3])
ax.set_xticks([1,2,3])
#ax.set_ylim([0, 1])
ax.set_xlabel("First Female Question Position in Sequence")
#ax.set_ylabel("Fraction of all talks")
ax.set_title('In talks when a female asked a question:')
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.legend()
plt.show()


# ## # Qs asked versus % of Female Qs asked

# In[35]:

N_qs = np.array([len(i) for i in qs])
percentagefemaleqs = np.array([i.count('F')/float(len(i)) for i in qs])
order = np.argsort(N_qs)

#plt.plot(N_qs, percentagefemaleqs, '.')
# The upper bound is max + 1 so that the talks with the largest number of
# questions are plotted as well.
for i in range(1, max(N_qs) + 1):
    considertalks = i == N_qs
    if not considertalks.any():
        # No talk had exactly i questions; median/std of an empty selection
        # would only produce NaN and runtime warnings.
        continue
    medianpercentagefemaleqs = np.median(percentagefemaleqs[considertalks])
    stdpercentagefemaleqs = np.std(percentagefemaleqs[considertalks])
    plt.errorbar(i, medianpercentagefemaleqs, yerr=stdpercentagefemaleqs,
                 fmt='o', color='k')
plt.ylim([0, 1.0])
plt.xlabel('Number of questions asked')
plt.ylabel('Fraction of Qs asked by females')
plt.show()


# ## Correlations by genre

# In[36]:

infile = 'aas225-all-talks.txt'
# Read the whole file inside a context manager so the handle is closed.
with open(infile) as talksfile:
    raw = talksfile.read().splitlines()

rawchairgender = []
catagory = []
sessionID = []
# The file is read in groups of five lines: the first line of a group starts
# with the three-character session ID and the fourth line holds
# "<category>,<chair gender>".
for i, line in enumerate(raw):
    if i % 5 == 3:
        a, b, = line.split(',')
        catagory.append(a)
        rawchairgender.append(b.strip())
    if i % 5 == 0:
        sessionID.append(line[0:3])

rawchairgender = np.array(rawchairgender)
sessionID = np.array(sessionID)

# For every talk, select the chair gender(s) of the session whose ID matches
# the part of the talk ID before the '.'.
chair = []
for talknumber in sid:
    session = talknumber.split('.')[0]
    chair.append(rawchairgender[sessionID == session])


# In[ ]:

# #Questions to ask the data:
# # Show how reliable responses are by looking for simultaneous responses from individual talks?
# # Does the order of questions asked matter?
# # Are females more likely to ask follow-up questions rather than leading questions?
# # Do women ask N questions proportionate to their size in the population?
# # If a woman asks a question, are others more likely to follow? Clustering?
# # Are the speaker genders similar in M/F to the overall population?
# # What are the biases in the talks that get survey responses?
# # Do people submit responses more often for talks given by women compared to the general pop?
# # How late in Q session is the first female Q?
#

# In[ ]: