# Notebook-style script: show a few images, download the Santa Barbara Data
# Science Meetup member list, count proper nouns in member bios, and plot the
# most frequent ones as a bar chart.

# Image must be imported before its first use, otherwise running the script
# top to bottom raises NameError.
from IPython.display import Image

Image(url='http://it-ebooks.info/images/ebooks/3/agile_data_science.jpg')
Image(url='http://www.cloudpointasia.com/var/site/storage/images/media/images/site-images/logos/citrixonline_logo_web/96233-1-eng-GB/citrixonline_logo_web.gif')
Image(url='http://www.theactivityexchange.com/images/logo_small.png')

# The Data Science Venn Diagram (1.0), by Drew Conway
Image(url='http://static.squarespace.com/static/5150aec6e4b0e340ec52710a/t/51525c33e4b0b3e0d10f77ab/1364352052403/Data_Science_VD.png?format=750w')

# Venn Diagram 2.0, by Steven Geringer
Image('http://2.bp.blogspot.com/-Qi-0utjhySM/UsteLrV6NyI/AAAAAAAACNQ/AdkizQfS8l8/s1600/moz-screenshot-3-729576.png')

import json
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request exposes the same urlopen() entry point, and
    # binding it to the old name keeps any later `urllib2.` references working.
    import urllib.request as urllib2

# Download all info from all the Data Science Meetup members.
# NOTE(review): the request is a pre-signed URL (sig_id/sig query parameters).
members_url = (
    "http://api.meetup.com/2/members?order=name"
    "&group_urlname=Santa-Barbara-Data-Science&offset=0&format=json&page=150"
    "&sig_id=66734052&sig=f7fc02b7069092e6775332b25f01b69e21346b92"
)
# closing() guarantees the HTTP response is released even if parsing fails;
# decoding explicitly keeps json.loads happy on both Python 2 and 3.
with closing(urllib2.urlopen(members_url)) as response:
    members = json.loads(response.read().decode('utf-8'))

# Keep only name and bio. A missing or null bio becomes '' so that a single
# incomplete record does not abort the whole run.
bios = [
    {'name': member['name'], 'bio': member.get('bio') or ''}
    for member in members['results']
]
bios[-2:]

import nltk
from collections import Counter

# Find all the named entities: every token tagged 'NNP' by the POS tagger is
# counted under its capitalized form.
entity_counter = Counter()
for entry in bios:
    tokens = nltk.wordpunct_tokenize(entry['bio'])
    for token, tag in nltk.pos_tag(tokens):
        if tag == 'NNP':
            entity_counter[token.capitalize()] += 1
entity_counter.most_common(5)

import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure(figsize=(15, 9))
ax = fig.add_subplot(111)
ax.set_xticks([])

# Words excluded from the chart.
ignored_entities = {'Data', 'Hi', 'Santa', 'Barbara', 'D', 'Ph', 'My', 'Science'}
top_entities = [
    (name, count)
    for name, count in entity_counter.most_common(20)
    if name not in ignored_entities
]
# zip(*[]) cannot be unpacked into two names, so handle the empty case.
names, counts = zip(*top_entities) if top_entities else ((), ())

x_pos = np.arange(len(counts))
# The -0.4 offset assumes bars are anchored at their left edge; pin that
# explicitly because the matplotlib default alignment differs across versions.
plt.bar(x_pos - .4, counts, color='#eeefff', align='edge')
for x, y, label in zip(x_pos, counts, names):
    # Lift each label in proportion to its length so the rotated text clears
    # the top of its bar.
    plt.annotate(label, (x + 0.1, y + len(label) / 2.2),
                 ha='center', rotation=70, size='xx-large')

from itertools import product
# Create a topic co-occurrence graph.
# Nodes are topics; an edge between a and b means that a member listed both
# a and b as topics.
nodes = Counter()
edges = Counter()
for member in members['results']:
    # Treat a missing or null topic list as empty rather than failing.
    topics = member.get('topics') or []
    for topic in topics:
        nodes[topic['name']] += 1
    # The strict '>' keeps one orientation of each pair and drops self-pairs,
    # so every unordered pair of distinct topic names is counted once per member.
    edges.update(
        (a['name'], b['name'])
        for a, b in product(topics, topics)
        if a['name'] > b['name']
    )
nodes.most_common(10)

import networkx as nx

# Build the graph from the 20 most frequent topics only.
top_topics = nodes.most_common(20)
g = nx.Graph()
node_names = set()
for name, count in top_topics:
    g.add_node(name, count=count)
    node_names.add(name)

# .items() works on Python 2 and 3 (iteritems() is Python 2 only).
for (first, second), count in edges.items():
    if first in node_names and second in node_names:
        g.add_edge(first, second, weight=count)

# Put each word of a topic name on its own line so labels fit in the nodes.
labels = {name: name.replace(' ', '\n') for name, _ in top_topics}

fig = plt.figure(figsize=(17, 10))
ax = fig.add_subplot(111)
pos = nx.spring_layout(g, k=4.9, scale=1000.0)

# g.nodes(data=True) / g.edges(data=True) replace nodes_iter / edges_iter,
# which newer networkx releases no longer provide.
node_sizes = [7 * (attrs['count'] ** 2) for _, attrs in g.nodes(data=True)]
edge_widths = [(attrs['weight'] / 10.0) ** 2 for _, _, attrs in g.edges(data=True)]

nx.draw_networkx_nodes(g, pos, node_size=node_sizes, alpha=0.8,
                       node_color='#eeefff')
nx.draw_networkx_labels(g, pos, labels, font_size=18)
nx.draw_networkx_edges(g, pos, width=edge_widths, alpha=0.5, edge_color='g')
ax.set_xticks([])
ax.set_yticks([])