"""Plot and tabulate the percentage of students working in each broad domain.

Reads one student per line from ``disciplines.txt`` (comma-separated
discipline names), assigns each student to every broad domain that contains at
least one of their disciplines, saves a bar chart to
``students_by_domain.svg``, shows it, and prints the same numbers as a text
table and as an HTML table.
"""
import re

import numpy as np

# One comma-separated field. A parenthesised group is consumed as a unit, so
# commas inside "(...)" do not split the field.
splitter = re.compile(r'(?:[^,(]|\([^)]*\))+')

# Discipline names as they appear in the input, grouped by broad domain.
# Matching is exact (case-sensitive), which is why several spellings of the
# same discipline are listed.
physics = {
    'Aerospace Engineering',
    'Astrophysics',
    'Atmospheric Science',
    'Earth sciences (geology, oceanography, meteorology)',
    'Engineering (civil, mechanical, chemical)',
    'Geography',
    'Geological Engineering : Environmental',
    'Geological Engineering : Geotechnical',
    'Geophysics',
    'Hydrogeology',
    'Hydrology',
    'Physical Oceanography',
    'Physics',
    'Space sciences',
    'astronomy',
    'biomedical engineering',
}

chemistry = {
    'Chemical/Geochemical Oceanography',
    'Chemistry',
    'Materials Science',
    'Materials Science and Engineering',
    'materials science',
}

biology = {
    'Biological Oceanography',
    'Biomechanics',
    'Brain and neurosciences',
    'Environmental Science',
    'Life science (biology, genetics)',
    'Life science (ecology, zoology, botany)',
    'Medicine',
    'Nursing/research',
}

mathematics_cs = {
    'Applied Mathematics',
    'Applied math',
    'Computer science and electrical engineering',
    'Data analysis',
    'Mathematics',
    'Statistics',
    'Tech support',
    'data analysis',
    'lab tech',
    # NOTE(review): presumably the tail of a free-text answer that was split
    # at a comma -- confirm against the input file.
    'or support programmer',
    'statistics',
}

human_social = {
    'Admin',
    'Business',
    'Design',
    'Economics',
    'Human factors & applied psychology',
    'Humanities and social sciences',
    'Law',
    'Legal',
    'Librarianship',
    'Library Science',
    'Library Systems',
    'Library science',
    'Non-profit',
    'editing/publishing',
    'finance',
}

# Order of ``domains`` and ``domain_labels`` must match: they are paired by
# position when plotting and tabulating.
domains = [physics, chemistry, biology, mathematics_cs, human_social]
domain_labels = [
    "Physics",
    "Chemistry",
    "Biology",
    "Mathematics & Computer Science",
    "Humanities & Social Sciences",
]


def parse_disciplines(line):
    """Return the set of whitespace-stripped discipline names on *line*."""
    return {field.strip() for field in splitter.findall(line)}


def load_students(path='disciplines.txt'):
    """Return one set of discipline names per line of the file at *path*.

    Every line counts as a student, including blank ones: a line holding only
    a newline yields ``{''}``, which matches no domain but still contributes
    to the total used as the percentage denominator.
    """
    with open(path) as f:
        return [parse_disciplines(line) for line in f]


def domain_percentages(students, domain_sets):
    """Return the percentage of *students* that fall in each domain.

    A student is counted for a domain when their discipline set shares at
    least one name with it, so a student can be counted in several domains and
    the percentages need not add up to 100.

    Args:
        students: sequence of sets of discipline names, one per student.
        domain_sets: sequence of sets of discipline names, one per domain.

    Returns:
        A float array with one percentage per entry of *domain_sets*; all
        zeros when *students* is empty (avoids a 0/0 division).
    """
    counts = np.zeros(len(domain_sets), int)
    for i, domain in enumerate(domain_sets):
        counts[i] = sum(bool(student & domain) for student in students)
    if not students:
        return np.zeros(len(domain_sets), float)
    return 100 * np.asarray(counts, float) / len(students)


def plot_percentages(labels, percentages, filename="students_by_domain.svg"):
    """Draw a bar chart of *percentages*, save it to *filename* and show it."""
    # Imported here so the counting helpers can be imported without matplotlib.
    import matplotlib.pyplot as plt

    nbars = len(labels)
    plt.figure(1)
    plt.subplot(1, 1, 1)
    plt.bar(range(nbars), percentages, align='center', fc='r')
    # Half a bar of margin on each side; y axis fixed to the full 0-100 range.
    plt.axis([-0.5, nbars - 0.5, 0, 100])
    plt.xticks(range(nbars), labels, rotation=90)
    plt.ylabel("Percentage (%)")
    plt.savefig(filename, bbox_inches="tight")
    plt.show()
    plt.close()


def build_table(labels, percentages):
    """Return a PrettyTable pairing each label with its two-decimal percentage."""
    # Imported here so the counting helpers can be imported without prettytable.
    import prettytable

    tab = prettytable.PrettyTable(["Domain", "Percentage (%)"])
    for label, value in zip(labels, percentages):
        tab.add_row([label, "{0:.2f}".format(value)])
    return tab


def main():
    """Read the student data, then plot and print the per-domain percentages."""
    students = load_students('disciplines.txt')
    dat = domain_percentages(students, domains)

    plot_percentages(domain_labels, dat)

    tab = build_table(domain_labels, dat)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(tab.get_string())
    print(tab.get_html_string())


if __name__ == "__main__":
    main()