# Notebook transcript: mark distributions for the TN-10-2012, Karnataka 2011,
# ICSE and CBSE result files. Bare expressions such as ``tn.head(10)`` only
# show output when run as notebook cells; as a plain script they are no-ops.
# Imports stay next to the cells that introduced them so that the original
# execution order is preserved.
import numpy  # used below as numpy.sort / numpy.nan; do not rely on pylab for it
import pandas as pd
from pylab import *  # rcParams is expected to come from this star import

pd.set_option('display.max_columns', 50)
# 'display.line_width' is no longer a pandas option; 'display.width' replaces it.
pd.set_option('display.width', 2000)

# Show the part of the README selected by this character slice.
with open('D:/site/gramener.com/viz/edu/TN-10-2012/README.TXT') as readme:
    print(readme.read()[2600:3700])

tn = pd.read_csv(
    'D:/site/gramener.com/viz/edu/TN-10-2012/net10m_s.csv',
    names=['DIST', 'REGNO', 'NAME', 'LANG', 'ENG', 'MATH', 'SCI', 'SOC',
           'MARKS', 'PASS', 'WITHHELD'],
    index_col=False)
tn.head(10)

numpy.sort(tn['LANG'].unique())

# 'AAA' is a non-numeric placeholder in the mark columns; turn it into NaN so
# the columns can be converted to float.
for column in ['LANG', 'ENG', 'MATH', 'SCI', 'SOC', 'MARKS']:
    tn[column] = tn[column].replace('AAA', numpy.nan).astype(float)
tn.head(10)

tn_marks = tn[['LANG', 'ENG', 'MATH', 'SCI', 'SOC']].dropna()
# Fraction of students per subject whose mark is NOT above 35.
# NOTE(review): this counts a mark of exactly 35, while the bar chart further
# down uses `< 35` -- confirm which threshold is intended.
1 - (tn_marks > 35).sum().astype(float) / tn_marks.count()

rcParams['figure.figsize'] = 25, 10


def distribution(series):
    """Draw a bar chart of how often each mark from 0 to 100 occurs.

    NaN entries in *series* are ignored. Marks in 0..100 that never occur
    are kept on the axis as empty bars; values outside that range are not
    drawn.
    """
    # Drop the nans, get the frequency, and sort it in ascending order
    marks = series.dropna().value_counts().sort_index()
    # Ensure that all marks from 0 to 100 are present, and draw a bar chart
    marks.reindex(range(101)).plot(kind='bar')


distribution(tn['ENG'])
distribution(tn['SOC'])
distribution(tn['LANG'])
distribution(tn['MATH'])
distribution(tn['SCI'])

rcParams['figure.figsize'] = 4, 3
marks = tn[['LANG', 'ENG', 'MATH', 'SCI', 'SOC']]
# Share of marks below 35 per subject, smallest first.
# Series.order() was removed from pandas; sort_values() is its replacement.
((marks < 35).sum().astype(float) / marks.count()).sort_values().plot(kind='bar')

rcParams['figure.figsize'] = 25, 10
ka = pd.read_csv(
    'd:/site/gramener.com/viz/autolyse/data/karnataka-marks-2011-subfields.csv')
ka

distribution(ka['English 2nd'])
distribution(ka['Kannada'])
distribution(ka['Mathematics'])
distribution(ka['Science'])
distribution(ka['Social Science'])

ka.groupby('NRC_GENDER_CODE')['TOTAL_MARKS'].mean()

import scipy.stats

boys = ka[ka['NRC_GENDER_CODE'] == 'B']['TOTAL_MARKS']
girls = ka[ka['NRC_GENDER_CODE'] == 'G']['TOTAL_MARKS']
# ttest_ind returns (test statistic, p-value); the name F is kept from the
# original notebook even though the statistic is a t value.
F, prob = scipy.stats.ttest_ind(boys.dropna(), girls.dropna())
prob

icse = pd.read_csv(
    'd:/site/gramener.com/viz/edu/CISCEResults2013/icse_scores.csv',
    names=['Student', 'Subject', 'Mark'])
icse

numpy.sort(icse['Mark'].unique())

# Blank out every non-numeric code, then turn the blanks into NaN so the
# column converts to float.
icse['Mark'] = icse['Mark'].replace({
    'A': '', 'ABS *': '', 'B': '', 'C': '', 'D': '', 'PCA *': '',
    'PCNA *': '', 'SPCA *': '', 'SPCNA*': '', 'X': '', 'XXX': '',
}).replace('', numpy.nan).astype(float)

distribution(icse['Mark'])

icse['Subject'].value_counts().head(20)

distribution(icse[icse['Subject'] == 'ENG']['Mark'])
distribution(icse[icse['Subject'] == 'HCG']['Mark'])
distribution(icse[icse['Subject'] == 'MAT']['Mark'])
distribution(icse[icse['Subject'] == 'SCI']['Mark'])
distribution(icse[icse['Subject'] == 'HIN']['Mark'])

cbse = pd.read_csv(
    'd:/site/gramener.com/viz/edu/CISCEResults2013/cbse_scores_revised.csv',
    names=['ROLLNO', 'SUB_ID', 'SUBJECT', 'MARK', 'GRADE', 'SOMETHING'])
cbse.head()

pd.Series(cbse['SUBJECT'].value_counts()).head(50).plot(kind='bar')

cbse['MARK'].unique()

# Only the first three characters of MARK are used as the score; the three
# placeholder codes below become NaN.
cbse['SCORE'] = cbse['MARK'].dropna().str.slice(0, 3).replace(
    {'---': numpy.nan, 'ABA': numpy.nan, 'AB ': numpy.nan}).astype(float)

distribution(cbse[cbse['SUBJECT'] == 'ENGLISH CORE']['SCORE'])
distribution(cbse[cbse['SUBJECT'] == 'MATHEMATICS']['SCORE'])
distribution(cbse[cbse['SUBJECT'] == 'PHYSICS']['SCORE'])
distribution(cbse[cbse['SUBJECT'] == 'CHEMISTRY']['SCORE'])
distribution(cbse[cbse['SUBJECT'] == 'ECONOMICS']['SCORE'])
distribution(cbse[cbse['SUBJECT'] == 'PHYSICAL EDUCATION']['SCORE'])
distribution(cbse[cbse['SUBJECT'] == 'BUSINESS STUDIES']['SCORE'])
distribution(cbse[cbse['SUBJECT'] == 'ACCOUNTANCY']['SCORE'])

tn_subjects = tn[['ENG', 'LANG', 'MATH', 'SCI', 'SOC']]
tn_subjects.corr()

from IPython.display import HTML


def colour(data):
    """Render the DataFrame *data* as an HTML table with hue-coded cells.

    Every value is printed with two decimals. Values below 0.85 get a hue
    on a linear scale where 0.75 maps to 0 (red) and 0.85 would map to 120
    (green); values of 0.85 and above get hue 180 (cyan).

    NOTE(review): the HTML tags in this function's string literals were lost
    from the source (only the text between the tags survived). The table
    markup and the ``hsl(...)`` style below are a reconstruction -- confirm
    against the original notebook.
    """
    html = ['<table><tr><th></th>']
    for column in data.columns:
        html.append('<th>{}</th>'.format(column))
    html.append('</tr>')
    for index, row in data.iterrows():
        html.append('<tr><th>{}</th>'.format(index))
        # Series.iteritems() was removed from pandas; items() replaces it.
        for column, value in row.items():
            # Red is 0.75, Green is 0.85, beyond that is cyan
            hue = (value - .75) / (.85 - .75) * 120 if value < .85 else 180
            html.append(
                '<td style="background-color:hsl({:.0f}, 100%, 50%)">'
                '{:.2f}</td>'.format(hue, value))
        html.append('</tr>')
    html.append('</table>')
    return HTML(''.join(html))


colour(tn_subjects.corr())

from vis import SVG

HTML(SVG('clusterplot.svg', width=300, height=300, data=tn_subjects,
         scatter=120,
         gradient=((.75, 'red'), (.80, 'yellow'), (.85, 'green')),
         regression=True))

import color

HTML(SVG(
    'subtreemap.svg',
    width=500,
    height=300,
    data=tn,
    keys=['DIST'],
    values={'MARKS': 'mean', 'REGNO': len},
    size=lambda v: v['REGNO'],
    # DataFrame.sort(column) was removed from pandas in favour of sort_values.
    # Assumes vis.SVG passes a pandas DataFrame here -- TODO confirm.
    sort=lambda v: v.sort_values('REGNO', ascending=False),
    text=lambda v: v['DIST'],
    wrap=False,
    color=lambda v: color.gradient(
        v['MARKS'] / 500,
        ((0.40, 'red'), (0.60, 'yellow'), (0.80, 'green'))),
    aspect=2,
    padding=3,
    parents=False,
))