In this project, we gathered data on all MIT astrophysics alumni who have finished their PhDs since 2002 (source: the MIT Astro website) and built a few simple visualizations so the data can be taken in at a glance.
import subprocess
import pickle
import matplotlib.pyplot as plt
import mpld3
import numpy as np
import pandas as pd
from astropy.io import fits
from statsmodels.graphics.mosaicplot import mosaic
%matplotlib inline
# Load the alumni table, then rewrite three exact cell values:
#   'n/a'       -> NaN (treated as missing)
#   'Reppaport' -> 'Rappaport'
#   'Burles'    -> 'Burke'
# NOTE(review): the last two look like name corrections for the supervisor
# columns -- confirm against the source data.
df = (
    pd.read_csv('mki_data.csv')
    .replace('n/a', np.nan)
    .replace('Reppaport', 'Rappaport')
    .replace('Burles', 'Burke')
)
# Stacked, normalised histogram of 'Year', one layer per 'Career Type'.
NUM_COLORS = 7
cm = plt.get_cmap('Set2')

# Group on a scalar key (not a one-element list) so that each group name is a
# plain value rather than a 1-tuple on recent pandas releases; the names are
# used verbatim as legend labels below.
df2 = df.groupby('Career Type')

# 13 edges at 2001.5, 2002.5, ..., 2013.5 so each bin is centred on a year.
bins = np.arange(13) + 2002. - 0.5

y = [group['Year'] for name, group in df2]
names = [str(name) for name, group in df2]

fig, ax = plt.subplots(figsize=(15, 7))
# set_prop_cycle replaces the removed Axes.set_color_cycle.
ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
# density=True replaces the removed normed=True keyword of hist().
plot = ax.hist(y, bins=bins, stacked=True, density=True, label=names)

# Shrink the axes to 80% width to leave room for the legend on the right.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.suptitle("Career Type")
<matplotlib.text.Text at 0x135650f10>
# For every group in df2, print a "--<group name>--" header followed by one
# (company, title) tuple per row, then an empty line as a separator.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
for name, group in df2:
    print("--" + name + "--")
    for i in zip(group['Company Name'].values, group['Title'].values):
        print(i)
    print("")
--Academic-- ('University of Michigan', 'Associate Professor') ('Arizona State University', 'Assistant Professor') ('University of Michigan', 'Associate Professor(also CFA affiliate)') ('Sam Houston State University', 'Assistant Professor') ('Mount Holyoke College', 'Assistant Professor') ('Caltech', 'Professor') ('University of California, Santa Cruz', 'Assistant Professor') ('Shanghai Astronomical Observatory', 'Professor') ('University of Utah', 'Assistant Professor') ('Tennessee State University', 'Assistant Professor') ('Syracuse Univesity', 'Professor') ('Arizona State University', 'Assistant Professor') ('University of Tokyo', 'Professor') ('Louisiana State University', 'Assistant Professor') ('UC Davis', 'Lecturer') ('Georgia Tech', 'Assistant Professor') ('NC State University', 'Assistant Professor') --K-12-- ('Westport Board of Education, CT', 'Physics & Science Research Teacher') ('Acera School', 'Specialist') --Medical Industry-- ('MGH/Harvard Medical School', 'Instructor in Radiology') ('San Fransicso Paramedic Association', 'EMT Program Director') ('Harvard Medical School', 'Medical Physics Resident') --Nonprofit-- ('Institute for Defense Analyses', 'Research Staff') ('Institute for Defense Analyses', 'Research Staff') --Outside Industry-- ('Danske Bank', 'Senior Portfolio Manager') ('IBM Canada', 'Principal Engineer') ('The Moving Picture Company', 'R&D Software Engineer') ('Metron, Inc.', 'Analyst') ('Canadian Imperial Bank of Commerce', 'Senior Analyst') ('Origin Energy', 'Senior Analyst') ('HP Autonomy', 'Systems Performance Engineer') ('Rbs Securities', 'Advisor') ('Arete Associates', 'Staff Scientist') ('Microsoft Bing Search', 'Program Manager') ('Morgan Stanley', 'Desk Strategist, Vice President') ('J.P. 
Morgan', 'Associate') ('Goldman Sachs', 'Broker') ('HP Autonomy', 'Technology Consultant') ('WegoWise', 'Data Scientist') ('Waters Pacific Pte Ltd', 'Senior Engineering Manager(R&D)') ('Eyekon Systems LLC', 'Founder') ('Smith&Nephew, Endoscopy', 'Development Engineer') ('GE Global Research', 'Optical Scientist') ('UBS London', 'Associate Director') ('Intel', 'Senior Process Engineer') ('Intellectual Ventures', 'Research Analyst') ('Izentis LLC', 'CEO') ('Lam Research', 'Mechanical Engineer') --Postdoc-- ('Jet Propulsion Laboratory', 'Postdoctoral Associate') ('NRAO', 'Postdoctoral Fellow') ('Harvard- CfA', 'Hubble Fellow') ('Northwestern University', 'Postdoctoral Fellow') ('University of Florida', 'Postdoctoral Associate') ('University of Alabama', 'Postdoctoral Associate') ('Yale University', 'Postdoctoral Fellow') ('NASA', 'Postdoctoral Fellow') ('Astronomical Institute Anton Pannekoek', 'Postdoc') ('Purdue University', 'Postdoctoral Researcher') ('Kapteyn Astronomical Institute', 'Postdoctoral Researcher') ('UC Berkeley', 'Postdoctoral Fellow') ('Caltech', 'Hubble Fellow') ('Caltech-LIGO', 'Postdoctoral Fellow') ('Cornell University', 'Einstein Fellow') ('Jet Propulsion Laboratory', 'NASA Postdoctoral Fellow') ('UC Berkeley', 'Postdoctoral Fellow') ('LIGO Hanford Observatory', 'Postdoc') ('MIT-LIGO', 'Postdoctoral Associate') ("Institut d'Astrophysique de Paris", 'Postdoctoral Fellow') ('MIT', 'Postdoctoral Fellow') ('Stanford University', 'Postdoctoral Scholar') ('Yale University', 'Postdoctoral Associate') --Scientific Research -- ('MIT Lincoln Laboratory', 'Technical Staff') ('MIT-Lincoln Lab', 'Technical Staff') ('MIT Lincoln Laboratory', 'Technical Staff') ('NASA - Goddard', 'Astrophysicist') ('UC Berkeley', 'Research Physicist') ('Caltech', 'Research Physicist') ('Boston University', 'Research Associate') ('MIT Lincoln Laboratory', 'Technical Staff') ('NASA, Ames for 3 yrs; currently, MIT', 'MechE PhD program')
# Stacked, normalised histogram of 'Year', one layer per 'Area'.
NUM_COLORS = 7
cm = plt.get_cmap('Paired')

# Scalar key keeps group names as plain values (a one-element list key yields
# 1-tuples on recent pandas releases, which would end up in the legend).
df2 = df.groupby('Area')

# 13 edges at 2001.5, 2002.5, ..., 2013.5 so each bin is centred on a year.
bins = np.arange(13) + 2002. - 0.5

y = [group['Year'] for name, group in df2]
names = [str(name) for name, group in df2]

fig, ax = plt.subplots(figsize=(15, 7))
# set_prop_cycle replaces the removed Axes.set_color_cycle.
ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
# density=True replaces the removed normed=True keyword of hist().
ax.hist(y, bins=bins, alpha=1.0, stacked=True, density=True, label=names)

# Shrink the axes to 80% width to leave room for the legend on the right.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.suptitle("Area Study")
<matplotlib.text.Text at 0x135b5ee90>
# Stacked, normalised histogram of 'Year', one layer per 'Supervisor1'.
NUM_COLORS = 28
cm = plt.get_cmap('Paired')

# Scalar key keeps group names as plain values (a one-element list key yields
# 1-tuples on recent pandas releases, which would end up in the legend).
df2 = df.groupby('Supervisor1')

# 13 edges at 2001.5, 2002.5, ..., 2013.5 so each bin is centred on a year.
bins = np.arange(13) + 2002. - 0.5

y = [group['Year'] for name, group in df2]
names = [str(name) for name, group in df2]

fig, ax = plt.subplots(figsize=(15, 7))
# set_prop_cycle replaces the removed Axes.set_color_cycle.
ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
# density=True replaces the removed normed=True keyword of hist().
ax.hist(y, bins=bins, alpha=1.0, stacked=True, density=True, label=names)

# Shrink the axes to 80% width to leave room for the legend on the right.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.suptitle("Supervisor")
<matplotlib.text.Text at 0x133ea0a50>
# Group by (Career Type, Year); with two keys each group name is a
# (career_type, year) tuple. Both df3 and df2 are rebound to this grouping.
df3 = df2 = df.groupby(['Career Type', 'Year'])

# One Series of titles per (career type, year) group.
x = [group['Title'] for name, group in df3]

# For the 'Academic' career type only: one comma-separated string of titles
# per year group (displayed as the cell result in a notebook).
[', '.join(group['Title'].values) for name, group in df3 if name[0] == 'Academic']