%matplotlib inline from matplotlib import pyplot from bulbs.rexster import Graph, Config, REXSTER_URI REXSTER_URI = 'http://localhost:8182/graphs/plos' config = Config(REXSTER_URI) g = Graph(config) # Bulbs Models from bulbs.model import Node, Relationship from bulbs.property import String, Integer, DateTime, List class Author(Node): element_type = 'author' name = String(nullable=False) class Article(Node): element_type = 'article' title = String(nullable=False) published = DateTime() doi = String() class Authorship(Relationship): label = 'authored' class Citation(Relationship): label = 'cites' reference_count = Integer(nullable=False) tag = String() g.add_proxy('authors', Author) g.add_proxy('articles', Article) g.add_proxy('authored', Authorship) g.add_proxy('cites', Citation) nodes = g.V edges = g.E authors = {n.name: n for n in nodes if n.element_type == 'author'} authors.keys()[:10] articles = {n.doi: n for n in nodes if n.element_type == 'article'} articles.keys()[:10] len(articles.keys()) article = articles['10.1371/journal.pbio.1000584'] article.title list(article.inE()) for author in article.inV(): print author.name list(article.outE()) for citation in article.outV(): print citation.title print [n.name for n in citation.inV() if n.element_type == 'author'] print citation.doi sum(1 for n in nodes if n.element_type == 'article' and n.outV() > 0) for citation in article.outE(): print citation.reference_count citation_counts = [] for doi in articles.keys(): if articles[doi].outE(): for e in articles[doi].outE(): if e.label == 'cites': citation_counts.append(e.reference_count) pyplot.hist(citation_counts, bins=range(20)) pyplot.xlabel('number of times cited') pyplot.ylabel('count') def article_pp(article): authors = unicode(', '.join([n.name for n in article.inV() if n.element_type == 'author'])) s = ('Title: %s\n' 'Authors: %s\n' 'DOI: %s' % (article.title, authors, article.doi)) return s for edge in edges: if edge.label == 'cites': if edge.reference_count >= 21: print('Citer:') print(article_pp(edge.outV())) print('') print('Citee') print(article_pp(edge.inV())) print('') print('Citer cites citee %d times.' % edge.reference_count) print('-----------------------------------------------------') def are_different_authors(article_1, article_2): authors_1 = [] authors_2 = [] for n in article_1.inV(): if n.element_type == 'author': authors_1.append(n.name) for n in article_2.inV(): if n.element_type == 'author': authors_2.append(n.name) authors_1 = set(authors_1) authors_2 = set(authors_2) return len(authors_1.intersection(authors_2)) == 0 citation_counts = [] for edge in edges: if edge.label == 'cites': if are_different_authors(edge.inV(), edge.outV()): citation_counts.append(edge.reference_count) pyplot.hist(citation_counts, bins=range(20)) pyplot.xlabel('number of times cited') pyplot.ylabel('count') for edge in edges: if edge.label == 'cites': if edge.reference_count >= 16 and are_different_authors(edge.inV(), edge.outV()): print('Citer:') print(article_pp(edge.outV())) print('') print('Citee') print(article_pp(edge.inV())) print('') print('Citer cites citee %d times.' % edge.reference_count) print('-----------------------------------------------------') inspirators = [] for article in articles.values(): in_nodes = [] if article.inE(): for edge in article.inE(): if edge.label == 'cites': if are_different_authors(edge.inV(), edge.outV()) and edge.reference_count >= 3: in_nodes.append([edge.outV(), edge.reference_count]) if len(in_nodes) >= 3: inspirators.append([article, in_nodes]) len(inspirators) for inspirator in inspirators: print('Inspirator') print article_pp(inspirator[0]) print('') for el in inspirator[1]: print('Inspired Article') print article_pp(el[0]) print('Cites inspirator %d times.' % el[1]) print('') print('--------------------------------------') print('')