#!/usr/bin/env python # coding: utf-8 # # 2. Gender Detection # # ## Figuring out genders from names # # We're going to use 3 different methods, all of which use a similar philosophy. Essentially, each of these services has built databases from datasets where genders are known or can be identified. For example, national census data and social media profiles. # # - [GenderDetector](https://pypi.python.org/pypi/gender-detector) can be run locally, but only provides "male", "female" or "unknown", and has a limited number of names in the database. # - [genderize.io](http://genderize.io) and [Gender API](http://gender-api.com) are web services that allow us to query names and return genders # - Each of these services provides a "probability" that the gender is correct (so if "Jamie" shows up 80 times in their data as a female name, and 20 times as a male name, they'll say it's "female" with a probability of 0.8) # - They also tell us how certain we can be of that gender by telling us how many times that name shows up (in the above example, the `count` would be 100). This is useful because some names might only have 1 or 2 entries, in which case a 100% probability of being male would be less reliable than a name that has 1000 entries. # # The web APIs have superior data, but the problem is that they are services that require you to pay if you make more than a certain number of queries in a short period of time. The owners of both services have generously provided me with enough queries to do this research for free. # ## Getting names to query # # First, we'll take the names from our pubmed queries and collapse them into sets. We don't really need to query the # name "John" a thousand times - once will do. I'm going to loop through the csv we wrote out in the [last section](../xml_parsing.ipynb) and pull the sixth column (index 5), which contains our author name. 
# In[ ]:

import os

os.chdir("../data/pubdata")

# In[ ]:

# Collect the author names (6th csv column, index 5) from the first dataset.
names = []
with open("comp.csv") as infile:
    for line in infile:
        names.append(line.split(",")[5])

# Then we'll convert the list to a set, which is an unordered array of unique
# values (so it removes duplicates)

# In[ ]:

print(len(names))
names = set(names)
print(len(names))

# Here's a function that does the same thing.

# In[ ]:


def get_unique_names(csv_file):
    """Return the set of unique author names (csv column index 5) in csv_file."""
    unique = []
    with open(csv_file) as infile:
        for line in infile:
            unique.append(line.split(",")[5])
    return set(unique)


# The `set.union()` function will merge 2 sets into a single set, so we'll do
# this with our other datasets.

# In[ ]:

names = names.union(get_unique_names("bio.csv"))
# BUG FIX: the original printed len(all_names), but `all_names` is never
# defined at this point (the merged set is called `names`), so it raised
# a NameError.
print(len(names))

# ## Getting genders from names
#
# ### GenderDetector
# First up - `GenderDetector`. The usage is pretty straightforward:

# In[ ]:

from gender_detector import GenderDetector

detector = GenderDetector('us')
print(detector.guess("kevin"))
print(detector.guess("melanie"))
print(detector.guess("ajasja"))

# In[ ]:

gender_dict = {}
counter = 0
for name in names:
    try:
        gender = detector.guess(name)
        gender_dict[name] = gender
    # BUG FIX: narrowed the bare `except:` (which would also swallow
    # KeyboardInterrupt/SystemExit) to `except Exception:`. Some names make
    # GenderDetector raise; report them rather than crashing the loop.
    except Exception:
        print(name)

# In[ ]:

print(len(gender_dict))

# In[ ]:

# GenderDetector returns the string 'unknown' for names it can't classify.
print(sum([1 for x in gender_dict if gender_dict[x] == 'unknown']))
print(sum([1 for x in gender_dict if gender_dict[x] != 'unknown']))

# ### Output datasets

# In[ ]:

import json

with open("GenderDetector_genders.json", "w+") as outfile:
    outfile.write(json.dumps(gender_dict, indent=4))

# ### Genderize.io
#
# This one is a bit more complicated, since we have to make a call to the web
# api, and then parse the json that's returned. Happily, someone already wrote
# [a python package](https://pypi.python.org/pypi/Genderize) to do most of the
# work.
We can query 10 names at a time rather than each one individually, and we'll get back a list of dictionaries, one for each query: # ``` # [{u'count': 1037, u'gender': u'male', u'name': u'James', u'probability': 0.99}, # {u'count': 234, u'gender': u'female', u'name': u'Eva', u'probability': 1.0}, # {u'gender': None, u'name': u'Thunderhorse'}] # ``` # # I will turn that into a dictionary of dictionaries, where the name is the key, and the other elements are stored under them. Eg: # # ``` # { # u'James':{ # u'count': 1037, # u'gender': u'male', # u'probability': 0.99 # }, # u'Eva':{ # u'count': 234, # u'gender': u'female', # u'probability': 1.0 # }, # u'Thunderhorse':{ # u'count': 0, # u'gender': None, # u'probability': None # } # } # ``` # # **Note**: # # I've got an API key stored in a separate file called `api_keys.py` (that I'm not putting on git because you can't have my queries!) that looks like this: # # ``` # genderize_key = "s0m3numb3rsandl3tt3rs" # genderAPI_key = "0th3rnumb3rsandl3tt3rs" # ``` # # You can get a key from both services for free, but you'll be limited in the number of queries you can make. Just make a similar file, or add them in below in place of the proper variables. 
#

# In[ ]:

from api_keys import genderize_key
from genderize import Genderize

# BUG FIX: the unique-name set built earlier is bound to `names`; `all_names`
# was never defined, so `list(all_names)` raised a NameError. Materialize the
# set as a list so it can be sliced into batches.
all_names = list(names)

genderize = Genderize(
    user_agent='Kevin_Bonham',
    api_key=genderize_key)

genderize_dict = {}
# genderize.io accepts up to 10 names per request, so query in batches of 10.
for i in range(0, len(all_names), 10):
    query = all_names[i:i+10]
    genders = genderize.get(query)
    for gender in genders:
        n = gender["name"]
        g = gender["gender"]
        if g is not None:
            p = gender["probability"]
            c = gender["count"]
        else:
            # Name not in genderize's database: no probability, zero samples.
            p = None
            c = 0
        genderize_dict[n] = {"gender": g, "probability": p, "count": c}

with open("genderize_genders.json", "w+") as outfile:
    outfile.write(json.dumps(genderize_dict, indent=4))

# In[ ]:

print(len(genderize_dict))
# BUG FIX: genderize.io reports unclassifiable names with gender None (see the
# batch loop above), not the string 'unknown', so the original comparisons
# against 'unknown' always counted 0 unknowns and len(dict) knowns. Compare
# against None instead.
print(sum([1 for x in genderize_dict if genderize_dict[x]["gender"] is None]))
print(sum([1 for x in genderize_dict if genderize_dict[x]["gender"] is not None]))

# ## Gender-API
#
# This is a similar service, but I didn't find a python package for it.
# Thankfully, it's pretty easy too (the [website](http://gender-api.com) has
# example code). The value that gets returned comes in the form of a
# dictionary as well:
#
# ```
# {u'accuracy': 99,
#  u'duration': u'26ms',
#  u'gender': u'male',
#  u'name': u'markus',
#  u'samples': 26354}
# ```
#
# Which I'll convert to the same keys and value types used from genderize
# above (eg.
# "probability" instead of "accuracy", "count" instead of "samples", and
# `0.99` instead of `99`),

# In[ ]:

from api_keys import genderAPI_key
# BUG FIX: urllib2 is Python-2-only; every other cell in this notebook is
# Python-3-compatible, so use the stdlib urllib.request equivalent.
from urllib.request import urlopen

genderAPI_dict = {}
counter = 0  # bump this to resume a run that was interrupted partway through
for i in range(counter, len(all_names), 20):
    # Gender-API accepts up to 20 semicolon-separated names per request.
    # (Renamed from `names` to avoid shadowing the module-level name set.)
    batch = all_names[i:i+20]
    query = ";".join(batch)
    data = json.load(urlopen(
        "https://gender-api.com/get?key={}&name={}".format(genderAPI_key, query)))
    for r in data['result']:
        n = r["name"]
        g = r["gender"]
        if g != u"unknown":
            # Convert the integer percentage (e.g. 99) to a probability (0.99)
            # to match the genderize output format.
            p = float(r["accuracy"]) / 100
            c = r["samples"]
        else:
            p = None
            c = 0
        genderAPI_dict[n] = {"gender": g, "probability": p, "count": c}

# NOTE(review): this writes to ../data/pubs/ while the other two outputs go to
# the current directory (../data/pubdata after the chdir at the top) — confirm
# which destination is intended.
with open("../data/pubs/genderAPI_genders.json", "w+") as outfile:
    outfile.write(json.dumps(genderAPI_dict, indent=4))

# If you want to do this without going through this notebook and you have a
# python2 installation, you can use the included `gender_detection.py`. The
# first argument should be `genderize` or `genderapi` depending on which
# method you want to use, (or if nothing, it will try to use GenderDetector).
# The second argument should be a path to an output file (like
# `genders.json`), and then the rest of the arguments should be the csv files
# output from the previous notebook. The script will pull all the names
# together into a set, and then use the relevant API or GenderDetector.
#
# ```
# $ python2 gender_detection.py genderize genders.json path/to/dataset1.csv path/to/dataset2.csv
# ```