#!/usr/bin/env python # coding: utf-8 # ## NSF Funding History 1990 - 2016 # Ths script requests zipped files containing annual funding history from the NSF website, unpacks, parses and loads the resulting xml files (in-memory) to 4 relational data sets into SQLite, and finally performs some analysis using Pandas, Seaborn and Plotly. # In[1]: import os import sys import json import re import csv import datetime as dt import requests from collections import OrderedDict from io import BytesIO from zipfile import ZipFile from lxml import etree as ET import pandas as pd import numpy as np import sqlalchemy as sql import matplotlib.pylab as plt import seaborn as sns import plotly.plotly as py from plotly.graph_objs import Bar, Scatter, Marker, Layout from plotly.tools import FigureFactory as FF from wordcloud import WordCloud, STOPWORDS from bs4 import BeautifulSoup from nltk.corpus import stopwords from IPython.display import HTML get_ipython().run_line_magic('matplotlib', 'inline') # # Engineering # ## Scraping the NSF # downloaded from: http://www.nsf.gov/awardsearch/download.jsp # In[2]: # Constants data_in = 'data_in/' data_out = 'data_out/' # URL for http-requests req_root = 'http://www.nsf.gov/awardsearch/download?DownloadFileName={YR}&All=true' # When to start looking at awards, and when to end. It is the YR in the root year_start = 1990 year_end = 2017 csv_out = [ 'NSF_AWARDS.csv', 'NSF_ABSTRACTS.csv', 'NSF_INSTITUTIONS.csv', 'NSF_PI.csv' ] department_normalization = { 'Polar Progrms': 'Polar Programs', 'Arctic Sciences Division' : 'Polar Programs', 'Research On Learning In Formal And Informal Settings (DRL)': 'Research On Learning', 'Information Systems': 'Information & Intelligent Systems', 'Civil, Mechanical, And Manufacturing Innovation': 'Civil, Mechanical, And Manufacturing Inn', 'Behavioral And Cognitive Sci' : 'Behavioral And Cognitive Sciences', 'Integrative Organismal System' : 'Integrative Organismal Sys', 'Information Systems': 'Information & Intelligent Systems' } modifiers_to_remove = { 'Division ': '', 'Of ': '', 'Div ': '', 'Directorate ':'', 'Office ': '', 'Direct ': '', 'Divn ': '', 'For ': '', } # Each annual zipped file contains one xml file per award: # In[621]: with open('data_in/2016/1623912.xml', 'r') as xml_view: print(xml_view.read()) # In[626]: def division_cleanse(raw_division): """ Use normalization dicts to standardize legacy division naming. """ rep = dict((re.escape(k), v) for k, v in modifiers_to_remove.items()) pattern = re.compile("|".join(rep.keys())) division = pattern.sub(lambda m: rep[re.escape(m.group(0))], raw_division.title()) return department_normalization.get(division, division) def get_PIs(list_investigators): """ Returns a semi-colon joined list of PI names (first and last) """ return ';'.join( ['{} {}'.format(PI.find('FirstName').text, PI.find('LastName').text) for PI in list_investigators]) # Builder functions def awards_builder(tree): global awards list_investigators = tree.findall('.//Award/Investigator') return awards.append( OrderedDict([ ('AwardID',tree.find('Award/AwardID').text), ('AwardTitle',tree.find('Award/AwardTitle').text), ('AwardEffectiveDate',tree.find('Award/AwardEffectiveDate').text), ('AwardExpirationDate',tree.find('Award/AwardExpirationDate').text), ('AwardAmount',tree.find('Award/AwardAmount').text), ('InstitutionName',tree.find('Award/Institution/Name').text), ('Division', division_cleanse(tree.find('Award/Organization/Division')[0].text)), ('Directorate', division_cleanse(tree.find('Award/Organization/Directorate')[0].text)), ('EmailAddress',tree.find('Award/Investigator/EmailAddress').text), ('InvestigatorNames', get_PIs(list_investigators)), ('NumInvestigators', len(list_investigators)) ])) def abstracts_builder(tree): global abstracts return abstracts.append( OrderedDict([ ('AwardID',tree.find('Award/AwardID').text), ('Abstract',tree.find('Award/AbstractNarration').text) ])) def institutions_builder(tree): global institutions return institutions.append( OrderedDict([ ('InstitutionName',tree.find('Award/Institution/Name').text), ('StreetAddress',tree.find('Award/Institution/StreetAddress').text), ('CityName',tree.find('Award/Institution/CityName').text), ('StateCode',tree.find('Award/Institution/StateCode').text), ('ZipCode',tree.find('Award/Institution/ZipCode').text), ('CountryName',tree.find('Award/Institution/CountryName').text) ])) def PI_builder(tree): global PI return PI.append( OrderedDict([ ('EmailAddress',tree.find('Award/Investigator/EmailAddress').text), ('FirstName',tree.find('Award/Investigator/FirstName').text), ('LastName',tree.find('Award/Investigator/LastName').text) ])) def xml_parse(file): ''' Calls all four data set builders after parsing a xml file into a tree. This function is called for each unzipped-file. ''' try: tree = ET.parse(file) except: return("empty") try: awards_builder(tree) except: return(tree.find('Award/AwardID').text,"is missing an award key") abstracts_builder(tree) institutions_builder(tree) PI_builder(tree) return def curl(path): ''' Requests zip file of a year's Award info from the NSf site. The zip file is unzipped and stored as a stream of bytes in memory. Each of the unzipped xml files is parsed by xml_parse for all 4 data set builders. ''' file_url = req_root.format(YR = str(path)) print("*** {} ***\ndownloading {}".format(path,file_url)) url = requests.get(file_url) zipfile = ZipFile(BytesIO(url.content)) [xml_parse(zipfile.open(file)) for file in zipfile.namelist()] return def remove_duplicates(dict_list): return [dict(tupleized) for tupleized in set(tuple(item.items()) for item in dict_list)] def json_dump(): ''' Iterates through each relational dataset, and dumps each to json. ''' dict_NSF = [awards, abstracts, institutions, PI] out_json = ["NSF_AWARDS.json", "NSF_ABSTRACTS.json", "NSF_INSTITUTIONS.json", "NSF_PI.json"] for list_dicts, json_file in zip(dict_NSF, out_json): with open(os.path.join(data_out, json_file), 'w') as outfile: json.dump(list_dicts, outfile, indent=4) def csv_dump(): ''' Iterates through each relational dataset, and dumps each to csv. ''' for list_dicts, file in zip(dict_NSF, csv_out): with open(os.path.join(data_out,file), 'w') as outfile: dict_writer = csv.DictWriter(outfile, list_dicts[0].keys()) dict_writer.writeheader() dict_writer.writerows(list_dicts) # In[629]: awards = [] abstracts = [] institutions = [] PI = [] dict_NSF = [awards, abstracts, institutions, PI] start = dt.datetime.now() [curl(year) for year in range(year_start, year_end+1)] print("{ROWS} records scraped in {TIME} sec".format( ROWS=len(awards), TIME=(dt.datetime.now()-start).total_seconds())) # remove doops start = dt.datetime.now() [remove_duplicates(data_set) for data_set in [awards, PI, institutions]] print("Duplicates removed in %s sec." % (dt.datetime.now()-start).total_seconds()) # dump'em start = dt.datetime.now() csv_dump() print("Dumped into CSV in %s sec." % (dt.datetime.now()-start).total_seconds()) # In[22]: pd.read_csv(os.path.join(data_out,"NSF_AWARDS.csv"), nrows=5) # ## Getting the data into a SQLite database # We could have gone straight from the stream of bytes to the SQLite db, but there's nothing wrong with keeping the processed back-up files in disk :) # In[631]: def SQLite_dump(infile, Type='csv'): # open SQlite connection disk_engine = sql.create_engine('sqlite:///NSF.db') start = dt.datetime.now() tableName = infile.replace("NSF_","")\ .replace(".csv","")\ .replace(".json","")\ .replace(data_out,"") print("*********************************************************************") if Type == 'csv': ''' Set Type to csv after csv_dump. Break each CSV into chunks to reduce in memory stoarge of dataset. Each chunk is sent to a SQLite table (tableName). ''' chunksize = 20000 j = 0 index_start = 1 for df in pd.read_csv(infile, chunksize=chunksize, iterator=True, encoding='utf-8'): if tableName == 'AWARDS': for col in ['AwardEffectiveDate','AwardExpirationDate']: df[col] = pd.to_datetime(df[col], format='%m/%d/%Y') df.index += index_start index_start = df.index[-1] + 1 j += 1 print("{X} seconds: {Y} records dumped into SQLite table {Z}".format( X=(dt.datetime.now() - start).total_seconds(), Y=j*chunksize, Z=tableName )) if j == 1: df.to_sql(tableName, disk_engine, if_exists='replace', index=False) else: df.to_sql(tableName, disk_engine, if_exists='append', index=False) elif Type == 'json': ''' After json_dump, file is opened in a dataframe and dumped into a sqlite table ''' df = pd.read_json(infile) if tableName == 'AWARDS': for col in ['AwardEffectiveDate','AwardExpirationDate']: df[col] = pd.to_datetime(df[col], format='%m/%d/%Y') print("{X} seconds: {Y} records dumped into SQLite table {Z}".format( X=(dt.datetime.now() - start).seconds, Y=len(df), Z=tableName )) df.to_sql(tableName, disk_engine, if_exists='replace', index=False) # In[632]: for infile in csv_out: SQLite_dump(os.path.join(data_out,infile), Type='csv') # # Analysis # ## Filtering SQL tables into Pandas. # There is a read_sql() function in Pandas, but who wants to open connections, and write querries every time? # This function takes care of that in a way that is SQL-native, with the advantage of default values and auto formatting to spend more time analyzing data and less time writing querries! # In[10]: def do(query): ''' Direct sql queries to a Pandas dataframe. ''' disk_engine = sql.create_engine('sqlite:///NSF.db') df = pd.read_sql_query(query, disk_engine) for col in df.columns: if "Date" in col: df[col] = pd.to_datetime(df[col]) return df # In[29]: df = do("SELECT * FROM AWARDS") df.head() # In[33]: q = """ SELECT strftime(\'%Y\',AwardEffectiveDate) as year, Division, SUM(AwardAmount) as 'AwardSum' FROM AWARDS GROUP BY Division ORDER BY AwardSum DESC;""" do(q).head() # In[34]: q = """ SELECT strftime(\'%Y\',AwardEffectiveDate) as year, Directorate, SUM(AwardAmount) as 'AwardSum' FROM AWARDS GROUP BY 2 ORDER BY 3 DESC;""" do(q).head() # # Analysis # Let's use Plotly and Seaborn for some Analysis # In[6]: plotly.offline.init_notebook_mode() with open("/Users/leonyin/Documents/creds/plotly_credentials.json",'r') as data_file: ''' Get username credentials. ''' creds = json.load(data_file)['credentials'] plotly.tools.set_credentials_file( username=creds['USER'], api_key=creds['API_KEY'] ) with open("/Users/leonyin/Documents/creds/plotly_presets.json",'r') as data_file: ''' Load graphy presets. ''' layout_dict = json.load(data_file)['layout'] # In[15]: def top_funds(df,groupby,n=5): return df.groupby(groupby) \ .apply(lambda x:x['AwardAmount'].sum()) \ .sort_values(ascending=False) \ .head(n) # In[16]: years = list(range(year_start,year_end)) top_n = 10 # In[22]: annual_funds = {} annual_funds['Other'] = {} for year in years: df = do("""SELECT strftime(\'%Y\',AwardEffectiveDate) as year, Division, SUM(AwardAmount) as 'AwardSum' FROM AWARDS WHERE YEAR='{}' GROUP BY Division ORDER BY AwardSum DESC""".format(year)) annual_funds['Other'][year] = sum(df.AwardSum) # get top 10 to plot for i in range(top_n): division = df.get_value(i,'Division') _sum =df.get_value(i,'AwardSum') annual_funds['Other'][year] -= _sum if division in annual_funds: annual_funds[division][year] = _sum else: annual_funds[division] = {year:_sum} tops = [] for division in top_funds(do("SELECT * FROM AWARDS"),groupby='Division', n=top_n).keys(): tops.append({ 'x': years, 'y': [annual_funds[division].get(year,None) for year in years], 'name': division, 'type': 'bar' }) py.iplot({ 'data': tops, 'layout': { 'barmode': 'stack', 'xaxis': layout_dict['axis'], 'yaxis': layout_dict['axis'], 'plot_bgcolor': layout_dict['plot_bgcolor'], 'title': '{} Highest Funded NSF Divisions'.format(top_n) }}) # The plot can be viewed here. #
Ocean sciences happen to be the highest funded division from the NSF.
# We can also look at how different divisions' funding varies interannual and interdivisionally. # In[11]: df = do('SELECT * FROM ABSTRACTS ' 'LEFT JOIN AWARDS ' 'USING(AwardID)' 'WHERE Abstract NOT NULL') df['year'] = df['AwardEffectiveDate'].dt.year df['AwardDuration'] = (abs(((df['AwardExpirationDate'] - df['AwardEffectiveDate']).dt.days)) + 1) / 365 df['AwardAmountStandardized'] = df['AwardAmount'] / (df['NumInvestigators'] * df['AwardDuration']) df.replace({'AwardAmountStandardized' : {np.nan : 0}}, inplace=True) # In[272]: def division_history(divisions_to_compare, start=1990, end=2016, agg_key='Division', num_bins=3): metric1 = 'AwardAmount' metric2 = 'AwardAmountStandardized' df_plot = df[(df[agg_key].isin(divisions_to_compare)) & (df['year'] >= start) & (df['year'] <= end)] fig, ax = plt.subplots(2,2, figsize=(18,12)) # Top two plots for label, df_div in df_plot.groupby([agg_key,'year'])[metric1].sum() \ .reset_index().groupby(agg_key): df_div[[metric1, 'year']].plot(x='year', y=metric1, logy=False, ax=ax[0][0], label=label, title='Annual Funding per Division (USD)') for label, df_div in df_plot.groupby([agg_key,'year'])[metric2].mean() \ .reset_index().groupby(agg_key): df_div[[metric2,'year']].plot(x='year', y=metric2, logy=False, ax=ax[0][1], label=label, title='Average Annual Funding Available ' 'per Investigator per Year of Research (USD)') # Bottom two plots if len(divisions_to_compare) > 2: # Density distro for more than two divisions for label, df_div in df_plot[df_plot['year']==end].groupby(agg_key): df_div['AwardDuration'].plot(kind='kde', logx=False, xlim=(0,10), ax=ax[1][0], label=label, legend=True, title='{} Distribution of ' 'Funding Length (Years)'.format(end)) for label, df_div in df_plot[df_plot['year']==end].groupby(agg_key): df_div['NumInvestigators'].plot(kind='kde', logx=False, xlim=(0,8), ax=ax[1][1], label=label, title='{} Distribution of Number of ' 'Research Collaborators'.format(end)) else: # Violins for 2 or less pd.options.mode.chained_assignment = None bins = np.linspace(start, end, num=num_bins+1) labels = ['{:.0f}-{:.0f}'.format( bins[i], bins[i+1] ) for i in range(len(bins) - 1)] df_plot['YearRange'] = pd.cut(df['year'], bins, labels=labels) ax[1, 0].set_title('Distribution of Funding Length (Years)') sns.violinplot(data=df_plot, y='YearRange', x='AwardDuration', hue=agg_key, split=True, inner=None, ax=ax[1][0]) ax[1, 1].set_title('Distribution of Research Collaborators Per Award') g = sns.violinplot(data=df_plot, y='YearRange', x='NumInvestigators', hue=agg_key, split=True, inner=None, ax=ax[1][1]) g.set(ylabel='') g.set(yticks=[]) ax[0, 1].legend_.remove() ax[1, 0].legend_.remove() ax[1, 1].legend_.remove() # In[1030]: divisions_to_compare = ['Molecular And Cellular Bioscience', 'Social And Economic Sciences'] # In[1170]: def get_history(full_name): ''' Returns a PI-centric SQLite query to a Pandas dataframe. ''' return do( "SELECT DISTINCT AwardID, AwardTitle, AwardEffectiveDate," "InstitutionName, Division, AwardAmount FROM AWARDS " "INNER JOIN PI using(EmailAddress) " "WHERE FirstName||' '||LastName = '{}' " "ORDER BY AwardEffectiveDate ASC".format(full_name) ) # In[1031]: division_history(divisions_to_compare, start=2000, num_bins=3) # Let's look at a PI's history in more depth... # In[1171]: get_history('Cynthia Rudin') # To compare funding history of multiple PIs, we can recycle the `get_history` for PI funding timeseries. # In[12]: def trace(PI): ''' Generates trace for a given PI ''' def build_annotation(fund_history): ''' Returns a list of break delimtied json-looking-strings for html rendering on the plotly plot. More meta-data! ''' A = ["Award ID: {}".format(str(award)) for award in fund_history['AwardID'].tolist()] I = ["Institution : {}".format(uni) for uni in fund_history['InstitutionName'].tolist()] M = ["Award Amount: ${}".format(str(award)) for award in fund_history['AwardAmount'].tolist()] return ['
'.join([award, inst, mone]) for (award, inst, mone) in zip(A,I,M)] fund_history = get_history(PI) return Scatter( x = pd.to_datetime(fund_history['AwardEffectiveDate'], format='%Y/%m/%d'), y = fund_history['AwardAmount'].cumsum(), text = build_annotation(fund_history), name = PI, connectgaps = True ) # In[14]: py.iplot({ 'data': [trace('Scott Doney'), trace('Jorge Sarmiento'), trace('Hugh Ducklow')], 'layout': {'title': "Marine Biogeochemists' NSF Funding History in USD 2000-2016", 'xaxis': layout_dict['axis'], 'yaxis': layout_dict['axis'], 'plot_bgcolor': layout_dict['plot_bgcolor'] } }) # The plot can be viewed here. # With the annotations, we can backtrack see see which awards were fullfilled at what institution. This provides an accurate track record into each scientist's funding history. Unfortunately this plot oversimplified an award as being given to one recipeient, when in reality these rewards are split into many slices and may last several years. # In[1173]: def read_abstract(AwardID): ''' Renders the abstract in HTML so we can learn more! ''' abstract = do("SELECT * FROM ABSTRACTS " "WHERE AwardID={}".format(AwardID)) return HTML(abstract['Abstract'].loc[0]) # We see a 20M huge spike for Jorge Sarmiento on Sept 2014.
# We can quickly addess the abstract of using the awardID _1425989_. # In[ ]: read_abstract(1425989) # We can also split the data by Abstract topics.
# Currently this is only supported by substrings.
# An unsupervised learning algorithm might be interesting to use here. # In[1024]: def topic_history(topic): abstract_df = do( 'SELECT * FROM AWARDS ' 'INNER JOIN ( ' 'SELECT * FROM ABSTRACTS ' "WHERE ABSTRACT LIKE '%{}%' " 'AND Abstract NOT NULL ' "AND Abstract != 'None'" ') USING(AwardID)'.format(topic)) return abstract_df # In[1175]: topic_history("ocean acidification").head() # ## Natural Language Processing # How can we leverage the abstracts to understand what each field within a Division is _all about_.
# We can count words within abstracts and visualize them with sizr proportional to frequency as a world cloud. # # First we need to ignore common words called stopwords. # In[5]: def download_english_stopwords(): ''' Stopwords are words that don't give any unique insight across different texts. ''' stop_url = 'https://raw.githubusercontent.com/nltk/nltk_data/' \ 'gh-pages/packages/corpora/stopwords.zip' url = requests.get(stop_url) zipfile = ZipFile(BytesIO(url.content)) with zipfile.open('stopwords/english', 'r') as f: return f.read().decode('utf8').split('\n') # In[173]: stops = download_english_stopwords() # In[174]: stops[:5] # In[210]: stops += [ 'lab', 'abstract', 'project', 'career', 'research', 'give', 'given', 'gives', 'giving', 'get', 'fund', 'funding', 'funds', 'gain', 'gained', 'gaining', 'gains', 'go', 'good', 'goods', 'great', 'greater', 'greatest', 'greatly', 'high', 'higher', 'improve', 'improves', 'improvement', 'improving', 'inrease', 'increased', 'increasing', 'increasingly', 'know', 'large', 'larger', 'largely', 'learn', 'learned', 'learner', 'less', 'like', 'investigator', 'also', 'university', 'student', 'students', 'undergraduate', 'graduate', 'br', 'new', 'pi', 'one', 'two', 'three', 'four', 'five', 'year', 'well', 'studies', 'things', 'using', 'many', 'use', 'used','using',' users', 'school', 'work', 'nsf', 'science', 'professor', 'researcher', 'researchers' ] # In[1015]: def display_cloud(words, title, exclude=list(), save=False): wc = WordCloud(max_words=300, stopwords=stops + exclude, margin=10, background_color="white", random_state=1, scale=.75, width=800, height=400).generate(words) plt.title(title) plt.imshow(wc.to_array()) plt.axis("off") plt.figure() if save: wc.to_file("clouds/{}_cloud.png".format(title.replace(' ', '_').lower())) # We can compare most common terms across divisions. # In[1032]: for division in divisions_to_compare: words = df[(df['year'] == 2016) & (df['Division'] == division)] \ .groupby('Division')['Abstract'] \ .apply(lambda x: x.sum()).iloc[0] display_cloud(words, division, exclude=['data'], save=True) # We can create a wordcloud for a specific topic (substring) # In[1033]: topic = 'policy' words = topic_history(topic) \ .groupby('Division')['Abstract'] \ .apply(lambda x: x.sum()).iloc[0] display_cloud(words, topic, save=False) # ## Network Graphs # How are researchers connected?
# Let's make a network of collaborators based on who has been award grants together. # # Work in progress... # But a beautiful mistake!. # In[1176]: # df['Division'].value_counts().iloc[:40] def count_PIS(PI_STRING): """ split PI string """ PI_list = [x.rstrip().lstrip().title() for x in PI_STRING.split(';')] for PI in PI_list: if PI_count.get(PI): PI_count[PI]['count'] += 1 else: PI_count[PI] = {'count': 1} def parse_PIS(PI_STRING): """ split PI string """ PI_list = [x.rstrip().lstrip().title() for x in PI_STRING.split(';')] if len(PI_list) > 1: for PI in PI_list: if PI_dict.get(PI): # check if PI exists continue else: # if PI not in dict, add it PI_dict[PI] = dict() #check other PIs remaining_PI = [x for x in PI_list if x != PI] for sub_PI in remaining_PI: # for PI already collaborated add one if PI_dict[PI].get(sub_PI): PI_dict[PI][sub_PI] += 1 else: PI_dict[PI][sub_PI] = 1 else: return PI_dict = dict() PI_count = dict() # df[ # (df['Division'].isin(['Ocean Sciences'])) # & (df['year'] >= 2000) # ]['InvestigatorNames'].apply(parse_PIS) df[df['year'] >= 2000]['InvestigatorNames'].apply(parse_PIS) # df[(df['Division'].isin(['Ocean Sciences'])) & (df['year'] >= 2000)]['InvestigatorNames'].apply(count_PIS) df[df['year'] >= 2000]['InvestigatorNames'].apply(count_PIS) # PI_exclude = [PI for (PI, count) in PI_count.items() if count <= 2] df_connection = pd.DataFrame([ {'source' : source.replace(' ', '_'), 'target': target.replace(' ', '_'), 'value': value} for (source, targets) in PI_dict.items() for (target, value) in targets.items()]) # if source not in PI_exclude # or target not in PI_exclude]) force_directed_graph = {} force_directed_graph['links'] = df_connection.to_dict(orient='records') force_directed_graph['nodes'] = [{'id': source.replace(' ', '_'), 'group': 1}\ for source in df_connection['source'].append(df_connection['target']).unique()] with open('data_out/nsf.json', 'w') as fp: json.dump(force_directed_graph, fp) # In[1184]: PI_count # In[1180]: def get_edges(PI): ''' look through dataframes for PI and all PIs who have shared an award with that PI. ''' df_PI = df[df['InvestigatorNames'].str.contains(PI)] PI_count[PI]['Children'] = set(';'.join(df_PI['InvestigatorNames'].tolist()).split(';')) # In[1183]: [get_edges(PI) for PI in PI_count.keys()] # 1. check other PI's children for new subchildren # 2. append other PI's child to subchildren # 3. seach other children # In[1166]: def depth(first_PI, PI): if not PI_count[PI].get('subChildren'): PI_count[PI]['subChildren'] = set() diff_set = PI_count[PI]['Children'] - PI_count[first_PI]['subChildren'] if diff_set: print("difference found between {} and {}".format(first_PI, PI)) for new_PI in diff_set: # add new PI to OG PI's subchildren PI_count[first_PI]['subChildren'].update(set(new_PI)) # add OG PI to new PI's subchildren PI_count[new_PI]['subChildren'].update(set(first_PI)) # recursive call depth(first_PI, new_PI) else: # base case return # In[1090]: ex = set(';'.join(df[df['InvestigatorNames'].str.lower().str.contains('Sarah Gille'.lower())]['InvestigatorNames'].tolist()).split(';')) # In[1161]: PI_count['Sarah Gille'].get('Children') # In[1158]: for PI in PI_count.keys(): PI_count[PI]['subChildren'] = PI_count[PI]['Children'] # check each set of Children, append their Children to dict, and start generating graph # # AKA links, all permutations of investigators # In[1159]: PI_count['Shawn Campagna']['subChildren']