#!/usr/bin/env python
# coding: utf-8

# In[1]:

import difflib
import json

import networkx as nx
from networkx.readwrite import json_graph
from tqdm import tqdm

# Load our preprocessed abstract data.  The encoding is pinned to UTF-8 so
# that non-ASCII author names decode the same way on every platform instead
# of depending on the locale's default encoding.
with open('visvssrelationships_data_2016.json', 'r', encoding='utf-8') as f:
    abstracts = json.load(f)


# In[ ]:

# Build a pruned set of author names: names that are nearly identical to an
# already-selected name (similarity above the cutoff) are treated as spelling
# variants of it and dropped.

# Two names whose SequenceMatcher ratio exceeds this are considered the same
# author.
SIMILARITY_CUTOFF = 0.9

list_of_lists = [abstracts[a]['author'] for a in abstracts]
originalList = {val for sublist in list_of_lists for val in sublist}
authorList = set()
while originalList:
    # Take an arbitrary remaining name as the canonical spelling ...
    authorA = originalList.pop()
    deleteSet = set()
    # ... and collect every remaining name that is a near-duplicate of it.
    for authorB in originalList:
        matcher = difflib.SequenceMatcher(None, authorA, authorB)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); when either is already at or below the cutoff the
        # expensive ratio() cannot exceed it, so the pair is skipped.
        if (matcher.real_quick_ratio() <= SIMILARITY_CUTOFF
                or matcher.quick_ratio() <= SIMILARITY_CUTOFF):
            continue
        ratio = matcher.ratio()
        if ratio > SIMILARITY_CUTOFF and ratio != 1.0:
            print('authorA:', authorA)
            print('authorB:', authorB)
            print(' Ratio:', ratio)
            deleteSet.add(authorB)
    # Names are removed after the scan: the set must not change size while
    # it is being iterated.
    originalList.difference_update(deleteSet)
    authorList.add(authorA)
    # Progress: canonical names so far + names still to examine.
    print(len(authorList),'+',len(originalList),'=',len(authorList)+len(originalList))


# In[ ]:

# Use this to find the closest authors (assuming the authorList is pre-pruned)
def findClosestAuthor(name, authorList):
    """Return the entry of ``authorList`` most similar to ``name``.

    Similarity is ``difflib.SequenceMatcher(None, author, name).ratio()``.
    When several candidates share the best ratio, the first one encountered
    while iterating ``authorList`` wins.

    Args:
        name: The author name to look up.
        authorList: Iterable of candidate (canonical) author names.

    Returns:
        The best-matching candidate, or ``None`` when ``authorList`` is empty
        or no candidate has a similarity ratio above 0.0.
    """
    # SequenceMatcher caches its analysis of the *second* sequence, so the
    # constant ``name`` is set once as seq2 and only seq1 changes per
    # candidate.  The computed ratios are the same as building a fresh
    # matcher for every pair.
    matcher = difflib.SequenceMatcher(None, '', name)
    maxRatio = 0.0
    outAuthor = None
    for author in authorList:
        matcher.set_seq1(author)
        # Cheap upper bounds first: a candidate that cannot beat the current
        # best is skipped without running the full comparison.
        if (matcher.real_quick_ratio() <= maxRatio
                or matcher.quick_ratio() <= maxRatio):
            continue
        ratio = matcher.ratio()
        if ratio > maxRatio:
            outAuthor = author
            maxRatio = ratio
    return outAuthor


# Loop through each abstract and add edges between the authors' names
# and the title of the abstract.
# Build a bipartite-style author/title graph.  Every node carries a 'group'
# attribute: 1 = co-author, 2 = first author of at least one abstract,
# 3 = abstract title.
# Requires NetworkX >= 2.1: node attributes are accessed through ``G.nodes``
# and self loops through ``nx.selfloop_edges`` (the older ``G.node`` and
# ``G.selfloop_edges()`` were removed in NetworkX 2.4).
G = nx.Graph()

# Raw author name -> canonical name.  Names recur across abstracts and the
# fuzzy lookup scans the whole author list, so each name is resolved once.
resolvedAuthors = {}

for a in tqdm(abstracts):
    title = abstracts[a]['title']
    names = abstracts[a]['author']
    for name in names:
        if name not in resolvedAuthors:
            resolvedAuthors[name] = findClosestAuthor(name, authorList)
        author = resolvedAuthors[name]
        if author is None:
            # No canonical name could be found (e.g. empty authorList);
            # None is not a valid NetworkX node, so the name is skipped.
            continue
        G.add_edge(author, title)
        # Only assign the co-author group if the node has no group yet, so
        # an earlier first-author marking is not overwritten.
        G.nodes[author].setdefault('group', 1)

    # The first listed name is the abstract's first author.
    firstAuthor = resolvedAuthors[names[0]] if names else None
    if firstAuthor is not None:
        G.nodes[firstAuthor]['group'] = 2

    # add_node is a no-op when the title already exists; it keeps abstracts
    # without any usable author as isolated title nodes instead of failing.
    G.add_node(title)
    G.nodes[title]['group'] = 3

# Loop through all the nodes and add the name property for D3.js
for n in G:
    G.nodes[n]['name'] = n

# Remove parallel edges
G = nx.Graph(G)

# Remove self loops (materialised first so the graph is not mutated while
# the self-loop iterator is still being consumed).
G.remove_edges_from(list(nx.selfloop_edges(G)))

# Export for D3.js
d = json_graph.node_link_data(G)  # node-link format to serialize
with open('html/force.json', 'w') as outFile:
    json.dump(d, outFile)

# Use NetworkX to plot the data (I've had limited success)
#import matplotlib.pyplot as plt
#pos=nx.spring_layout(G) # positions for all nodes
#nx.draw_networkx_nodes(G,pos,node_size=2,alpha=0.5)
#nx.draw_networkx_edges(G,pos,alpha=0.25)
#nx.draw_networkx_labels(G,pos,font_size=7,font_family='sans-serif')
#plt.axis('off')
#plt.show() # display


# In[ ]: