#!/usr/bin/env python
# coding: utf-8

# ## NSF Funding History 1990 - 2016
# Ths script requests zipped files containing annual funding history from the NSF website, unpacks, parses and loads the resulting xml files (in-memory) to 4 relational data sets into SQLite, and finally performs some analysis using Pandas, Seaborn and Plotly.

# In[1]:


import os
import sys
import json
import re
import csv
import datetime as dt
import requests
from collections import OrderedDict
from io import BytesIO
from zipfile import ZipFile
from lxml import etree as ET

import pandas as pd
import numpy as np
import sqlalchemy as sql
import matplotlib.pylab as plt
import seaborn as sns
import plotly.plotly as py
from plotly.graph_objs import Bar, Scatter, Marker, Layout
from plotly.tools import FigureFactory as FF
from wordcloud import WordCloud, STOPWORDS
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from IPython.display import HTML

get_ipython().run_line_magic('matplotlib', 'inline')


# # Engineering

# ## Scraping the NSF
# downloaded from: http://www.nsf.gov/awardsearch/download.jsp

# In[2]:


# Constants
data_in  = 'data_in/'
data_out = 'data_out/' 

# URL for http-requests
req_root = 'http://www.nsf.gov/awardsearch/download?DownloadFileName={YR}&All=true'

# When to start looking at awards, and when to end. It is the YR in the root
year_start = 1990
year_end = 2017

csv_out = [
    'NSF_AWARDS.csv',
    'NSF_ABSTRACTS.csv',
    'NSF_INSTITUTIONS.csv',
    'NSF_PI.csv'
]

department_normalization = {
    'Polar Progrms': 'Polar Programs',
    'Arctic Sciences Division' : 'Polar Programs',
    'Research On Learning In Formal And Informal Settings (DRL)': 'Research On Learning',
    'Information Systems': 'Information & Intelligent Systems',
    'Civil, Mechanical, And Manufacturing Innovation': 'Civil, Mechanical, And Manufacturing Inn',
    'Behavioral And Cognitive Sci' : 'Behavioral And Cognitive Sciences',
    'Integrative Organismal System' : 'Integrative Organismal Sys',
    'Information Systems': 'Information & Intelligent Systems'

}

modifiers_to_remove = {
    'Division ': '', 
    'Of ': '',
    'Div ': '',
    'Directorate ':'',
    'Office ': '',
    'Direct ': '',
    'Divn ': '',
    'For ': '',
}


# Each annual zipped file contains one xml file per award:

# In[621]:


with open('data_in/2016/1623912.xml', 'r') as xml_view:
    print(xml_view.read())


# In[626]:


def division_cleanse(raw_division):
    """
    Use normalization dicts to standardize legacy division naming.
    """

    rep = dict((re.escape(k), v) for k, v in modifiers_to_remove.items())
    pattern = re.compile("|".join(rep.keys()))

    division = pattern.sub(lambda m: rep[re.escape(m.group(0))],
                           raw_division.title())

    return department_normalization.get(division, division)

def get_PIs(list_investigators):
    """
    Returns a semi-colon joined list of PI names (first and last)
    """
    return ';'.join(
        ['{} {}'.format(PI.find('FirstName').text, 
                        PI.find('LastName').text) 
         for PI in list_investigators])


# Builder functions
def awards_builder(tree):
    global awards
    list_investigators = tree.findall('.//Award/Investigator')
    return awards.append(
            OrderedDict([
                    ('AwardID',tree.find('Award/AwardID').text),
                    ('AwardTitle',tree.find('Award/AwardTitle').text),
                    ('AwardEffectiveDate',tree.find('Award/AwardEffectiveDate').text),
                    ('AwardExpirationDate',tree.find('Award/AwardExpirationDate').text),
                    ('AwardAmount',tree.find('Award/AwardAmount').text),
                    ('InstitutionName',tree.find('Award/Institution/Name').text),
                    ('Division', division_cleanse(tree.find('Award/Organization/Division')[0].text)),
                    ('Directorate', division_cleanse(tree.find('Award/Organization/Directorate')[0].text)),
                    ('EmailAddress',tree.find('Award/Investigator/EmailAddress').text),
                    ('InvestigatorNames', get_PIs(list_investigators)),
                    ('NumInvestigators', len(list_investigators))
            ]))

def abstracts_builder(tree):
    global abstracts
    return abstracts.append(
            OrderedDict([
                    ('AwardID',tree.find('Award/AwardID').text),
                    ('Abstract',tree.find('Award/AbstractNarration').text)
             ]))

def institutions_builder(tree):
    global institutions
    return institutions.append(
            OrderedDict([
                    ('InstitutionName',tree.find('Award/Institution/Name').text),
                    ('StreetAddress',tree.find('Award/Institution/StreetAddress').text),
                    ('CityName',tree.find('Award/Institution/CityName').text),
                    ('StateCode',tree.find('Award/Institution/StateCode').text),
                    ('ZipCode',tree.find('Award/Institution/ZipCode').text),
                    ('CountryName',tree.find('Award/Institution/CountryName').text)
            ]))

def PI_builder(tree):
    global PI
    return PI.append(
            OrderedDict([
                    ('EmailAddress',tree.find('Award/Investigator/EmailAddress').text),
                    ('FirstName',tree.find('Award/Investigator/FirstName').text),
                    ('LastName',tree.find('Award/Investigator/LastName').text)
            ]))

def xml_parse(file):
    '''
    Calls all four data set builders after parsing a xml file into a tree.
    This function is called for each unzipped-file.
    '''
    try:
        tree = ET.parse(file)
    except:
        return("empty")
    try:
        awards_builder(tree)
    except:
        return(tree.find('Award/AwardID').text,"is missing an award key")
    
    abstracts_builder(tree)
    institutions_builder(tree)
    PI_builder(tree)
    return

def curl(path):
    '''
    Requests zip file of a year's Award info from the NSf site.
    The zip file is unzipped and stored as a stream of bytes in memory.
    Each of the unzipped xml files is parsed by xml_parse for all 4 data set builders.
    '''
    file_url = req_root.format(YR = str(path))
    print("*** {} ***\ndownloading {}".format(path,file_url))
    url = requests.get(file_url)
    zipfile = ZipFile(BytesIO(url.content))
    [xml_parse(zipfile.open(file)) for file in zipfile.namelist()]
    return

def remove_duplicates(dict_list):
    return [dict(tupleized) for tupleized in 
            set(tuple(item.items()) for item in dict_list)]

def json_dump():
    '''
    Iterates through each relational dataset, and dumps each to json.
    '''
    dict_NSF = [awards,
                abstracts,
                institutions,
                PI]

    out_json = ["NSF_AWARDS.json",
                "NSF_ABSTRACTS.json",
                "NSF_INSTITUTIONS.json",
                "NSF_PI.json"]

    for list_dicts, json_file in zip(dict_NSF, out_json):
        with open(os.path.join(data_out, json_file), 'w') as outfile:
            json.dump(list_dicts, outfile, indent=4)

def csv_dump():
    '''
    Iterates through each relational dataset, and dumps each to csv.
    '''
    for list_dicts, file in zip(dict_NSF, csv_out):
        with open(os.path.join(data_out,file), 'w') as outfile:
            dict_writer = csv.DictWriter(outfile, list_dicts[0].keys())
            dict_writer.writeheader()
            dict_writer.writerows(list_dicts)


# In[629]:


awards = []
abstracts = []
institutions = []
PI = []
dict_NSF = [awards, abstracts, institutions, PI]

start = dt.datetime.now()
[curl(year) for year in range(year_start, year_end+1)] 
print("{ROWS} records scraped in {TIME} sec".format(
        ROWS=len(awards),
        TIME=(dt.datetime.now()-start).total_seconds()))

# remove doops
start = dt.datetime.now()
[remove_duplicates(data_set) for data_set in [awards, PI, institutions]]
print("Duplicates removed in %s sec." % (dt.datetime.now()-start).total_seconds())

# dump'em
start = dt.datetime.now()
csv_dump()
print("Dumped into CSV in %s sec." % (dt.datetime.now()-start).total_seconds())


# In[22]:


pd.read_csv(os.path.join(data_out,"NSF_AWARDS.csv"), nrows=5)


# ## Getting the data into a SQLite database
# We could have gone straight from the stream of bytes to the SQLite db, but there's nothing wrong with keeping the processed back-up files in disk :)

# In[631]:


def SQLite_dump(infile, Type='csv'):
    # open SQlite connection
    disk_engine = sql.create_engine('sqlite:///NSF.db')
    start = dt.datetime.now()
    tableName = infile.replace("NSF_","")\
                      .replace(".csv","")\
                      .replace(".json","")\
                      .replace(data_out,"")
    
    print("*********************************************************************")
    if Type == 'csv':
        '''
        Set Type to csv after csv_dump.
        Break each CSV into chunks to reduce in memory stoarge of dataset.
        Each chunk is sent to a SQLite table (tableName).
        '''
        chunksize = 20000
        j = 0
        index_start = 1
        
        for df in pd.read_csv(infile, chunksize=chunksize, iterator=True, encoding='utf-8'):
            if tableName == 'AWARDS':
                for col in ['AwardEffectiveDate','AwardExpirationDate']:
                    df[col] = pd.to_datetime(df[col], format='%m/%d/%Y')
            df.index += index_start
            index_start = df.index[-1] + 1
            j += 1

            print("{X} seconds: {Y} records dumped into SQLite table {Z}".format(
                    X=(dt.datetime.now() - start).total_seconds(),
                    Y=j*chunksize,
                    Z=tableName
                ))
            
            if j == 1:
                df.to_sql(tableName, disk_engine, if_exists='replace', index=False)
            
            else:
                df.to_sql(tableName, disk_engine, if_exists='append', index=False)
            
    elif Type == 'json':
        '''
        After json_dump, file is opened in a dataframe and dumped into a sqlite table
        '''
        df = pd.read_json(infile)
        if tableName == 'AWARDS':
            for col in ['AwardEffectiveDate','AwardExpirationDate']:
                df[col] = pd.to_datetime(df[col], format='%m/%d/%Y')
                
        print("{X} seconds: {Y} records dumped into SQLite table {Z}".format(
                X=(dt.datetime.now() - start).seconds,
                Y=len(df),
                Z=tableName
            ))
        
        df.to_sql(tableName, disk_engine, if_exists='replace', index=False)


# In[632]:


for infile in csv_out:
    SQLite_dump(os.path.join(data_out,infile), Type='csv')


# # Analysis

# ## Filtering SQL tables into Pandas.
# There is a read_sql() function in Pandas, but who wants to open connections, and write querries every time?
# This function takes care of that in a way that is SQL-native, with the advantage of default values and auto formatting to spend more time analyzing data and less time writing querries!

# In[10]:


def do(query):
    '''
    Direct sql queries to a Pandas dataframe.
    '''
    disk_engine = sql.create_engine('sqlite:///NSF.db')
    df = pd.read_sql_query(query, disk_engine)
    
    for col in df.columns:
        if "Date" in col:
            df[col] = pd.to_datetime(df[col])
    return df


# In[29]:


df = do("SELECT * FROM AWARDS")
df.head()


# In[33]:


q = """
SELECT 
    strftime(\'%Y\',AwardEffectiveDate) as year,
    Division,
    SUM(AwardAmount) as 'AwardSum'
FROM AWARDS GROUP BY Division ORDER BY AwardSum DESC;"""
do(q).head()


# In[34]:


q = """
SELECT 
    strftime(\'%Y\',AwardEffectiveDate) as year,
    Directorate,
    SUM(AwardAmount) as 'AwardSum'
FROM AWARDS GROUP BY 2 ORDER BY 3 DESC;"""
do(q).head()


# # Analysis
# Let's use Plotly and Seaborn for some Analysis

# In[6]:


plotly.offline.init_notebook_mode()
with open("/Users/leonyin/Documents/creds/plotly_credentials.json",'r') as data_file:
    '''
    Get username credentials.
    '''
    creds = json.load(data_file)['credentials']
    
    plotly.tools.set_credentials_file(
        username=creds['USER'],
        api_key=creds['API_KEY']
    )
with open("/Users/leonyin/Documents/creds/plotly_presets.json",'r') as data_file:
    '''
    Load graphy presets.
    '''
    layout_dict = json.load(data_file)['layout']


# In[15]:


def top_funds(df,groupby,n=5):
    return df.groupby(groupby)                         \
               .apply(lambda x:x['AwardAmount'].sum())  \
               .sort_values(ascending=False)             \
               .head(n)


# In[16]:


years = list(range(year_start,year_end))
top_n = 10


# In[22]:


annual_funds = {}
annual_funds['Other'] = {}

for year in years:
    df = do("""SELECT strftime(\'%Y\',AwardEffectiveDate) as year,
               Division, SUM(AwardAmount) as 'AwardSum'
               FROM AWARDS WHERE YEAR='{}'
               GROUP BY Division ORDER BY AwardSum DESC""".format(year))

    annual_funds['Other'][year] = sum(df.AwardSum)
    # get top 10 to plot
    for i in range(top_n):
        division = df.get_value(i,'Division')
        _sum =df.get_value(i,'AwardSum')
        annual_funds['Other'][year] -= _sum
        if division in annual_funds:
            annual_funds[division][year] = _sum
        else:
            annual_funds[division] = {year:_sum}

tops = []
for division in top_funds(do("SELECT * FROM AWARDS"),groupby='Division', n=top_n).keys():
    tops.append({
            'x': years,
            'y': [annual_funds[division].get(year,None) for year in years],
            'name': division,
            'type': 'bar'
        })

py.iplot({
    'data': tops, 
    'layout': {
        'barmode': 'stack',
        'xaxis': layout_dict['axis'],
        'yaxis': layout_dict['axis'],
        'plot_bgcolor': layout_dict['plot_bgcolor'],
        'title': '{} Highest Funded NSF Divisions'.format(top_n)
    }})


# The plot can be viewed <a html='https://plot.ly/~leonyin/56'>here</a>.
# <br>Ocean sciences happen to be the highest funded division from the NSF.<br>
# We can also look at how different divisions' funding varies interannual and interdivisionally.

# In[11]:


df = do('SELECT * FROM ABSTRACTS '
        'LEFT JOIN AWARDS '
        'USING(AwardID)'
        'WHERE Abstract NOT NULL')

df['year'] = df['AwardEffectiveDate'].dt.year
df['AwardDuration'] = (abs(((df['AwardExpirationDate'] - df['AwardEffectiveDate']).dt.days)) + 1) / 365
df['AwardAmountStandardized'] = df['AwardAmount'] / (df['NumInvestigators'] * df['AwardDuration'])
df.replace({'AwardAmountStandardized' : {np.nan : 0}}, inplace=True)


# In[272]:


def division_history(divisions_to_compare, start=1990, end=2016, 
                     agg_key='Division', num_bins=3):
    
    metric1 = 'AwardAmount'
    metric2 = 'AwardAmountStandardized'
    
    df_plot = df[(df[agg_key].isin(divisions_to_compare)) &
                 (df['year'] >= start) & (df['year'] <= end)]
    
    fig, ax = plt.subplots(2,2, figsize=(18,12))

    # Top two plots
    for label, df_div in df_plot.groupby([agg_key,'year'])[metric1].sum() \
                                .reset_index().groupby(agg_key):
        
        df_div[[metric1, 'year']].plot(x='year', y=metric1, logy=False,
                                       ax=ax[0][0], label=label,
                                       title='Annual Funding per Division (USD)')
        
    for label, df_div in df_plot.groupby([agg_key,'year'])[metric2].mean() \
                             .reset_index().groupby(agg_key):
        
        df_div[[metric2,'year']].plot(x='year', y=metric2, logy=False,
                                      ax=ax[0][1], label=label,
                                      title='Average Annual Funding Available '
                                            'per Investigator per Year of Research (USD)')
        
    # Bottom two plots
    if len(divisions_to_compare) > 2:
        # Density distro for more than two divisions
        for label, df_div in df_plot[df_plot['year']==end].groupby(agg_key):
            df_div['AwardDuration'].plot(kind='kde', logx=False, xlim=(0,10),
                                         ax=ax[1][0], label=label, legend=True,
                                         title='{} Distribution of '
                                               'Funding Length (Years)'.format(end))

        for label, df_div in df_plot[df_plot['year']==end].groupby(agg_key):
            df_div['NumInvestigators'].plot(kind='kde', logx=False, xlim=(0,8), 
                                            ax=ax[1][1], label=label,
                                            title='{} Distribution of Number of '
                                                  'Research Collaborators'.format(end))
    else:
        # Violins for 2 or less
        pd.options.mode.chained_assignment = None
        
        bins = np.linspace(start, end, num=num_bins+1) 
        labels = ['{:.0f}-{:.0f}'.format(
                bins[i], bins[i+1]
            ) for i in range(len(bins) - 1)]
        
        df_plot['YearRange'] = pd.cut(df['year'], bins, labels=labels)
        
        ax[1, 0].set_title('Distribution of Funding Length (Years)')
        sns.violinplot(data=df_plot, y='YearRange', x='AwardDuration', hue=agg_key, 
                       split=True, inner=None, ax=ax[1][0])
        
        ax[1, 1].set_title('Distribution of Research Collaborators Per Award')
        g = sns.violinplot(data=df_plot, y='YearRange', x='NumInvestigators', hue=agg_key,
                       split=True, inner=None, ax=ax[1][1])
        
        g.set(ylabel='')
        g.set(yticks=[])
        ax[0, 1].legend_.remove()
        ax[1, 0].legend_.remove()
        ax[1, 1].legend_.remove()


# In[1030]:


divisions_to_compare = ['Molecular And Cellular Bioscience', 'Social And Economic Sciences']


# In[1170]:


def get_history(full_name):
    '''
    Returns a PI-centric SQLite query to a Pandas dataframe.
    '''
    return do(
        "SELECT DISTINCT AwardID, AwardTitle, AwardEffectiveDate,"
        "InstitutionName, Division, AwardAmount FROM AWARDS "
        "INNER JOIN PI using(EmailAddress) "
        "WHERE FirstName||' '||LastName = '{}' "
        "ORDER BY AwardEffectiveDate ASC".format(full_name)
    )


# In[1031]:


division_history(divisions_to_compare, start=2000, num_bins=3)


# Let's look at a PI's history in more depth...

# In[1171]:


get_history('Cynthia Rudin')


# To compare funding history of multiple PIs, we can recycle the `get_history` for PI funding timeseries.

# In[12]:


def trace(PI):
    '''
    Generates trace for a given PI
    '''
    def build_annotation(fund_history):
        '''
        Returns a list of break delimtied json-looking-strings for html rendering on the plotly plot.
        More meta-data!
        '''
        A = ["Award ID: {}".format(str(award)) for award in fund_history['AwardID'].tolist()]
        I = ["Institution : {}".format(uni) for uni in fund_history['InstitutionName'].tolist()]
        M = ["Award Amount: ${}".format(str(award)) for award in fund_history['AwardAmount'].tolist()]
        return ['<br>'.join([award, inst, mone]) for (award, inst, mone) in zip(A,I,M)]
    
    fund_history = get_history(PI)
    
    return Scatter(
        x = pd.to_datetime(fund_history['AwardEffectiveDate'], format='%Y/%m/%d'),
        y = fund_history['AwardAmount'].cumsum(),
        text = build_annotation(fund_history),
        name = PI,
        connectgaps = True
    )


# In[14]:


py.iplot({
    'data': [trace('Scott Doney'), trace('Jorge Sarmiento'), trace('Hugh Ducklow')],
    'layout': 
        {'title': "Marine Biogeochemists' NSF Funding History in USD 2000-2016",
         'xaxis': layout_dict['axis'],
         'yaxis': layout_dict['axis'],
         'plot_bgcolor': layout_dict['plot_bgcolor']
        }
    })


# The plot can be viewed <a html='https://plot.ly/~leonyin/54/'>here</a>.

# With the annotations, we can backtrack see see which awards were fullfilled at what institution. This provides an accurate track record into each scientist's funding history. Unfortunately this plot oversimplified an award as being given to <i>one</i> recipeient, when in reality these rewards are split into many slices and may last several years.

# In[1173]:


def read_abstract(AwardID):
    '''
    Renders the abstract in HTML so we can learn more!
    '''
    abstract = do("SELECT * FROM ABSTRACTS "
                  "WHERE AwardID={}".format(AwardID))
    
    return HTML(abstract['Abstract'].loc[0])


# We see a 20M huge spike for Jorge Sarmiento on Sept 2014. <br>
# We can quickly addess the abstract of using the awardID _1425989_.

# In[ ]:


read_abstract(1425989)


# We can also split the data by Abstract topics.<br>
# Currently this is only supported by substrings.<br>
# An unsupervised learning algorithm might be interesting to use here.

# In[1024]:


def topic_history(topic):
    
    abstract_df = do(
        'SELECT * FROM AWARDS '
        'INNER JOIN ( '
            'SELECT * FROM ABSTRACTS '
            "WHERE ABSTRACT LIKE '%{}%' "
            'AND Abstract NOT NULL '
            "AND Abstract != 'None'"
        ') USING(AwardID)'.format(topic))
    
    return abstract_df


# In[1175]:


topic_history("ocean acidification").head()


# ## Natural Language Processing
# How can we leverage the abstracts to understand what each field within a Division is _all about_.<br>
# We can count words within abstracts and visualize them with sizr proportional to frequency as a world cloud.
# 
# First we need to ignore common words called stopwords.

# In[5]:


def download_english_stopwords():
    '''
    Stopwords are words that don't give any unique insight
    across different texts.
    '''
    stop_url = 'https://raw.githubusercontent.com/nltk/nltk_data/' \
               'gh-pages/packages/corpora/stopwords.zip'
    url = requests.get(stop_url)
    zipfile = ZipFile(BytesIO(url.content))
    with zipfile.open('stopwords/english', 'r') as f:
        return f.read().decode('utf8').split('\n')


# In[173]:


stops = download_english_stopwords()


# In[174]:


stops[:5]


# In[210]:


stops += [
    'lab', 'abstract', 'project', 'career', 'research', 
    'give', 'given', 'gives', 'giving', 'get', 'fund', 
    'funding', 'funds', 'gain', 'gained', 'gaining', 
    'gains', 'go', 'good', 'goods', 'great', 'greater', 
    'greatest', 'greatly', 'high', 'higher', 'improve', 
    'improves', 'improvement', 'improving', 'inrease', 
    'increased', 'increasing', 'increasingly', 'know', 
    'large', 'larger', 'largely', 'learn', 'learned', 
    'learner', 'less', 'like', 'investigator', 'also', 
    'university', 'student', 'students', 'undergraduate', 
    'graduate', 'br', 'new', 'pi', 'one', 'two', 'three',
    'four', 'five', 'year', 'well', 'studies', 'things', 
    'using', 'many', 'use', 'used','using',' users', 
    'school', 'work', 'nsf', 'science', 'professor', 
    'researcher', 'researchers'
]


# In[1015]:


def display_cloud(words, title, exclude=list(), save=False):    
    wc = WordCloud(max_words=300, stopwords=stops + exclude, margin=10, background_color="white",
                   random_state=1, scale=.75, width=800, height=400).generate(words)
       
    plt.title(title)
    plt.imshow(wc.to_array())
    plt.axis("off")
    plt.figure()
    
    if save:
        wc.to_file("clouds/{}_cloud.png".format(title.replace(' ', '_').lower()))


# We can compare most common terms across divisions.

# In[1032]:


for division in divisions_to_compare:
    
    words = df[(df['year'] == 2016) & (df['Division'] == division)] \
                .groupby('Division')['Abstract'] \
                .apply(lambda x: x.sum()).iloc[0]
        
    display_cloud(words, division, exclude=['data'], save=True)


# We can create a wordcloud for a specific topic (substring)

# In[1033]:


topic = 'policy'

words = topic_history(topic) \
            .groupby('Division')['Abstract'] \
            .apply(lambda x: x.sum()).iloc[0]

display_cloud(words, topic, save=False)


# ## Network Graphs
# How are researchers connected?<br>
# Let's make a network of collaborators based on who has been award grants together.
# 
# Work in progress...
# But a <a href='http://bl.ocks.org/yinleon/raw/54c4133a8d234522dc50038fda2cc810/'>beautiful mistake</a>!.

# In[1176]:


# df['Division'].value_counts().iloc[:40]
def count_PIS(PI_STRING):
    """
    split PI string
    """
    PI_list = [x.rstrip().lstrip().title() for x in PI_STRING.split(';')]
    
    for PI in PI_list:
        if PI_count.get(PI):
            PI_count[PI]['count'] += 1
        else:
            PI_count[PI] = {'count': 1}

def parse_PIS(PI_STRING):
    """
    split PI string
    """
    PI_list = [x.rstrip().lstrip().title() for x in PI_STRING.split(';')]
    
    if len(PI_list) > 1:
        for PI in PI_list:
            if PI_dict.get(PI):
                # check if PI exists
                continue
            else:
                # if PI not in dict, add it
                PI_dict[PI] = dict()
            
            #check other PIs
            remaining_PI = [x for x in PI_list if x != PI]
            for sub_PI in remaining_PI:
                # for PI already collaborated add one
                if PI_dict[PI].get(sub_PI):
                    PI_dict[PI][sub_PI] += 1
                else:
                    PI_dict[PI][sub_PI] = 1
    else:
        return

PI_dict = dict()
PI_count = dict()

# df[
#     (df['Division'].isin(['Ocean Sciences'])) 
#   & (df['year'] >= 2000)
# ]['InvestigatorNames'].apply(parse_PIS)

df[df['year'] >= 2000]['InvestigatorNames'].apply(parse_PIS)

# df[(df['Division'].isin(['Ocean Sciences'])) & (df['year'] >= 2000)]['InvestigatorNames'].apply(count_PIS)

df[df['year'] >= 2000]['InvestigatorNames'].apply(count_PIS)


# PI_exclude = [PI for (PI, count) in PI_count.items() if count <= 2]

df_connection = pd.DataFrame([
    {'source' : source.replace(' ', '_'), 
     'target': target.replace(' ', '_'), 
     'value': value} 
    for (source, targets) in PI_dict.items() 
    for (target, value)  in targets.items()])
#     if source not in PI_exclude 
#     or target not in PI_exclude])

force_directed_graph = {}
force_directed_graph['links'] = df_connection.to_dict(orient='records')
force_directed_graph['nodes'] = [{'id': source.replace(' ', '_'), 'group': 1}\
                                 for source in df_connection['source'].append(df_connection['target']).unique()]

with open('data_out/nsf.json', 'w') as fp:
    json.dump(force_directed_graph, fp)


# In[1184]:


PI_count


# In[1180]:


def get_edges(PI):
    '''
    look through dataframes for PI and all PIs who have shared an award with that PI.
    '''
    df_PI = df[df['InvestigatorNames'].str.contains(PI)]
    PI_count[PI]['Children'] = set(';'.join(df_PI['InvestigatorNames'].tolist()).split(';'))


# In[1183]:


[get_edges(PI) for PI in PI_count.keys()]


# 1. check other PI's children for new subchildren
# 2. append other PI's child to subchildren
# 3. seach other children

# In[1166]:


def depth(first_PI, PI):
    if not PI_count[PI].get('subChildren'):
        PI_count[PI]['subChildren'] = set()
        
    diff_set = PI_count[PI]['Children'] -  PI_count[first_PI]['subChildren']
    if diff_set:
        print("difference found between {} and {}".format(first_PI, PI))
        for new_PI in diff_set:
            # add new PI to OG PI's subchildren
            PI_count[first_PI]['subChildren'].update(set(new_PI))
            # add OG PI to new PI's subchildren
            PI_count[new_PI]['subChildren'].update(set(first_PI))
            # recursive call
            depth(first_PI, new_PI)
    else: # base case
        return


# In[1090]:


ex = set(';'.join(df[df['InvestigatorNames'].str.lower().str.contains('Sarah Gille'.lower())]['InvestigatorNames'].tolist()).split(';'))


# In[1161]:


PI_count['Sarah Gille'].get('Children')


# In[1158]:


for PI in PI_count.keys():
    PI_count[PI]['subChildren'] = PI_count[PI]['Children']


# check each set of Children, append their Children to dict, and start generating graph
# 
# AKA links, all permutations of investigators 

# In[1159]:


PI_count['Shawn Campagna']['subChildren']