import pandas as pd import numpy as np import matplotlib.pyplot as plt %pylab inline source = 'https://gist.githubusercontent.com/chengsoonong/dede21b2eefa43b30d14/raw/add9312ea30c0ccc770151cb6188f964d2baa047/gsoc2014_accepted.csv' data = pd.read_table(source, sep=',') print(data.columns.values) orgs = np.unique(data['Organization']) proj_per_org = [] for org in orgs: proj_per_org.append(len(np.flatnonzero(data['Organization'] == org))) proj_per_org = np.array(proj_per_org) sort_idx = np.argsort(-proj_per_org) fig = plt.figure(figsize=(15,5)) ax = fig.add_subplot(111) top_orgs = 20 idx = np.arange(top_orgs) ax.bar(idx, proj_per_org[sort_idx][:top_orgs]) ax.set_xticks(idx) dummy = ax.set_xticklabels(orgs[sort_idx], rotation=80, ha='center') ml_orgs = [] for org in orgs: if 'learning' in org.lower(): ml_orgs.append(org) ml_titles = [] for org in ml_orgs: title_from_org = data['Title'][data['Organization']==org] for title in title_from_org: ml_titles.append(np.flatnonzero(data['Title']==title)[0]) titles = data['Title'] for idx,title in enumerate(titles): if ('learning' in title.lower()) or ('scikit-learn' in title.lower()): ml_titles.append(np.flatnonzero(data['Title']==title)[0]) ml_orgs.append(data['Organization'][idx]) ml_orgs = np.unique(np.array(ml_orgs)) print(ml_orgs) ml_titles = np.unique(np.array(ml_titles)) print(data.loc[ml_titles][['Title','Organization','Student','Mentors']]) ml_proj_count = [] for org in ml_orgs: ml_proj_count.append(len(np.flatnonzero(data.loc[ml_titles]['Organization'] == org))) ml_proj_count = np.array(ml_proj_count) ml_orgs = np.array(ml_orgs) sort_idx = np.argsort(-ml_proj_count) fig = plt.figure(figsize=(10,5)) ax = fig.add_subplot(111) idx = np.arange(len(ml_orgs)) ax.bar(idx, ml_proj_count[sort_idx]) ax.set_xticks(idx) dummy = ax.set_xticklabels(ml_orgs[sort_idx], rotation=80)