#!/usr/bin/env python # coding: utf-8 # This tutorial was partially adapted from http://enipedia.tudelft.nl/wiki/OpenRefine_Tutorial, where you can learn more about Open Refine. It used to be called Google Refine so try that too when you are searching for information. However, in our case, we use python to do the same thing. # # Downloading Data # # The university data can be downloaed from http://enipedia.tudelft.nl/enipedia/images/f/ff/UniversityData.zip # ## What you can learn # # The data contains quite a few issues, and this tutorial shows how to do things like: # # - Cleaning up inconsistent spelling of terms (i.e. "USA", "U.S.A", "U.S.", etc). # - Converting values that are text descriptions of numeric values (i.e. $123 million) to actual numeric values (i.e. 123000000) which are usable for analysis. # - Identifying which rows of a specific column contain a search term # - Extracting and cleaning values for dates # - Removing duplicate rows # - Using a scatterplot to visualize relationships between values in different columns # - Finding geographic coordinates for a list of place names (i.e. the names of universities, etc.) # - Exporting cleaned data to Excel # # Reading Data with Pandas # In[487]: get_ipython().run_line_magic('matplotlib', 'inline') import sys import pandas as pd import statsmodels.api as sm from collections import Counter, defaultdict import numpy as np import datetime import matplotlib.pyplot as plt import matplotlib matplotlib.style.use('ggplot') # In[579]: df = pd.read_csv('universityData.csv', sep = '\t', encoding = 'utf-8') df.head() # In[580]: print df.university[2] # Apparently, there are duplicate rows. # In[489]: len(df) # # Deduplicate entries # In[490]: df = df.drop_duplicates() len(df) # We already see an issue here where there is both the full name of a country (United States) and its abbreviation (US). To fix this, we can just copy/paste "United States" as the new cell value. # # # Clean up country names # In[491]: df['country'].unique() # In[492]: country_df = df.groupby('country').size() for k in country_df.index: print(k, country_df[k]) # In[493]: us_condition = df['country'].isin(['U.S.', 'U.S.A.', 'US', 'United States', 'United States )', 'United States of America']) df['country'][us_condition] = 'USA' # In[494]: ca_condition =[] for i in df['country']: if i.__contains__('Canada'): ca_condition.append(True) else: ca_condition.append(False) df['country'][ca_condition] = 'Canada' # In[495]: ca_condition =[] for i in df['country']: if i.__contains__('China'): ca_condition.append(True) else: ca_condition.append(False) df['country'][ca_condition] = 'China' # In[496]: ca_condition =[] for i in df['country']: if i.__contains__('England'): ca_condition.append(True) else: ca_condition.append(False) df['country'][ca_condition] = 'England' # In[497]: ca_condition =[] for i in df['country']: if i.__contains__('Scotland'): ca_condition.append(True) else: ca_condition.append(False) df['country'][ca_condition] = 'England' # In[498]: uk_condition = df['country'].isin(['UK', 'United Kingdom', 'Wales']) df['country'][uk_condition] = 'England' # In[499]: ca_condition =[] for i in df['country']: if i.__contains__('Netherlands'): ca_condition.append(True) else: ca_condition.append(False) df['country'][ca_condition] = 'Netherlands' # In[500]: ca_condition =[] for i in df['country']: if i.__contains__('Russia'): ca_condition.append(True) else: ca_condition.append(False) df['country'][ca_condition] = 'Russia' # In[501]: ca_condition =[] for i in df['country']: if i.__contains__('Bahamas'): ca_condition.append(True) else: ca_condition.append(False) df['country'][ca_condition] = 'Bahamas' # In[502]: 'Russian Federation'.__contains__('Russia') # In[503]: uk_condition = df['country']=='Cura%C3%A7ao' df['country'][uk_condition] = "Curacao" # In[504]: df[df['country']==','] # # Universidad Juárez Autónoma de Tabasco is a public institution of higher learning located in Villahermosa, Tabasco, Mexico. # In[505]: uk_condition = df['country']==',' df['country'][uk_condition] = "Mexico" # In[506]: df[df['country']=='Satellite locations:'] # Nova Southeastern University (NSU) is a private nonprofit university, with a main campus located on 300 acres (120 ha) in Davie, in the US state of Florida. Formerly referred to as "Nova" and now commonly called "NSU", the university currently consists of 18 colleges and schools offering over 175 programs of study with more than 250 majors. # In[507]: uk_condition = df['country']=='Satellite locations:' df['country'][uk_condition] = "USA" # In[508]: country_df = df.groupby('country').size() for k in country_df.index: print(k, country_df[k]) # # Clean up values for the number of students # # # We need to clean the data for the number of students. Not all of the values are numeric, and many of them contain bits of text in addition to the actual number of the students. # To figure out which entries need to be fixed, we need to use a Numeric facet: # In[509]: df.describe() # In[510]: df.numStudents.unique() # In[511]: float('5.29092E7') # In[512]: df.numStudents = [str(i).replace('+', '').replace('~', '') for i in df.numStudents] df.numStudents = [str(i).replace(',', '').replace('-', '').strip() for i in df.numStudents] df.numStudents = [i.split(' ')[0] for i in df.numStudents] # In[513]: # https://stackoverflow.com/questions/4138202/using-isdigit-for-floats def isDigit(x): try: float(x) return True except ValueError: return False # In[514]: isDigit('1130.0') # In[515]: str.isdigit('1130.0') # In[516]: df['numStudents'] = df['numStudents'].apply(lambda x: np.float(x) if isDigit(x) else np.nan) # In[517]: odds = {} for i in df['numStudents']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 # In[518]: len(df) # In[519]: odds # In[520]: df.numStudents.iloc[0] # In[521]: df.numStudents.unique()[:10] # In[522]: df.numStudents.max(), df.numStudents.min() # In[523]: # df.numStudents.astype('float') # In[524]: # pd.to_numeric(df.numStudents)#, errors='ignore') # In[525]: df.describe() # In[526]: plt.hist(df.numStudents.dropna(), bins = 200, color='blue') plt.yscale('log') plt.xscale('log') plt.show() # # Clean up values for the endowment # # First remove the numeric facet for numStudents and create a new numeric facet for endowment. Select only the non-numeric values, as was done for the number of students. # Already we see issues like "US$1.3 billion" and "US $186 million" # In[527]: df['endowment'] # In[528]: np.sum(odds.values()) # In[529]: odds = {} for i in df['endowment']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 odds.items() # In[530]: np.float('1E6') # In[531]: df.endowment = [str(i).replace('US $', '').replace('US$', '') for i in df.endowment] df.endowment = [str(i).replace('USD$', '').replace('USD $', '') for i in df.endowment] df.endowment = [str(i).replace('U.S. $', '').replace(',', '').strip() for i in df.endowment] endowment = [] for i in df.endowment: if i.__contains__('$'): endowment.append(str(i).split('$')[1]) else: endowment.append(i) df.endowment = endowment # In[532]: df.endowment = [str(i).replace(' million', 'E6').replace(' billion', 'E9').strip() for i in df.endowment] df.endowment = [str(i).replace('million', 'E6').replace('billion', 'E9').strip() for i in df.endowment] df.endowment = [str(i).replace(' Million', 'E6').replace(' Billion', 'E9').strip() for i in df.endowment] df.endowment = [str(i).split(' ')[0] for i in df.endowment] df.endowment = [str(i).replace('M', 'E6').strip() for i in df.endowment] df.endowment = [str(i).replace(';', '').replace('+', '').strip() for i in df.endowment] # df.endowment = [str(i).split('xbf')[1] for i in df.endowment] # df.endowment = [str(i).split('xb')[1] for i in df.endowment] # df.endowment = [str(i).split('xa')[1] for i in df.endowment] # After most of this has been cleaned up, select the non-numeric values, and delete them, just as was done for the numStudents. # In[533]: df['endowment'] = df['endowment'].apply(lambda x: np.float(x) if isDigit(x) else np.nan) # In[534]: df.describe() # - 巴西雷亚尔的符号 R\$ # - CANADA DOLLARS C\$ # - 澳元的货币符号 A\$ # Both "million" and "Million" are in the values, so it's useful to convert all the values to lowercase instead of cleaning this up twice. # # In[535]: df.head() # # Others # # numFaculty, numDoctoral, numStaff, numPostgrad, numUndergrad, established # # ## numFaculty # In[536]: df.numFaculty = [str(i).replace('Total: ', '').replace(',', '') for i in df.numFaculty] df.numFaculty = [str(i).replace('>', '').replace('~', '') for i in df.numFaculty] df.numFaculty = [str(i).split(' ')[0] for i in df.numFaculty] # In[537]: df['numFaculty'] = df['numFaculty'].apply(lambda x: np.float(x) if isDigit(x) else np.nan) # In[538]: odds = {} for i in df['numFaculty']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 odds.items() # ## numDoctoral # In[539]: odds = {} for i in df['numDoctoral']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 odds.items() # In[540]: df.numDoctoral = [str(i).replace('~', '') for i in df.numFaculty] # In[541]: df['numDoctoral'] = df['numDoctoral'].apply(lambda x: np.float(x) if isDigit(x) else np.nan) # ## numStaff # In[542]: odds = {} for i in df['numStaff']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 odds.items() # In[543]: df.numStaff = [str(i).replace('Total: ', '').replace(',', '') for i in df.numStaff] df.numStaff = [str(i).replace('>', '').replace('~', '') for i in df.numStaff] df.numStaff = [str(i).replace('Around ', '').replace('appx. ', '') for i in df.numStaff] df.numStaff = [str(i).replace(' employees', '').replace('Approximately ', '') for i in df.numStaff] df.numStaff = [str(i).replace('Full-time: ', '').replace('Part-time: ', '') for i in df.numStaff] df.numStaff = [str(i).replace('approximately ', '') for i in df.numStaff] df.numStaff = [str(i).split(' ')[0] for i in df.numStaff] # In[544]: df['numStaff'] = df['numStaff'].apply(lambda x: np.float(x) if isDigit(x) else np.nan) # ## numPostgrad # In[545]: odds = {} for i in df['numPostgrad']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 odds.items() # In[546]: df.numPostgrad = [str(i).replace('~', '').replace(',', '') for i in df.numPostgrad] df.numPostgrad = [str(i).replace('approx. ', '').replace('Approx. ', '') for i in df.numPostgrad] df.numPostgrad = [str(i).replace('Approximately ', '').replace('Aprx. ', '') for i in df.numPostgrad] df.numPostgrad = [str(i).replace('+', '').replace('over', '') for i in df.numPostgrad] df.numPostgrad = [str(i).split(' ')[0] for i in df.numPostgrad] # In[547]: df['numPostgrad'] = df['numPostgrad'].apply(lambda x: np.float(x) if isDigit(x) else np.nan) # ## numUndergrad # In[548]: odds = {} for i in df['numUndergrad']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 odds.items() # In[549]: df.numUndergrad = [str(i).replace('~', '').replace(',', '') for i in df.numUndergrad] df.numUndergrad = [str(i).replace('approx. ', '').replace('Approx. ', '') for i in df.numUndergrad] df.numUndergrad = [str(i).replace('Approximately ', '').replace('Aprx. ', '') for i in df.numUndergrad] df.numUndergrad = [str(i).replace('approximately ', '').replace('Around ', '') for i in df.numUndergrad] df.numUndergrad = [str(i).replace('+', '').replace('over', '') for i in df.numUndergrad] df.numUndergrad = [str(i).split(' ')[0] for i in df.numUndergrad] # In[550]: df['numUndergrad'] = df['numUndergrad'].apply(lambda x: np.float(x) if isDigit(x) else np.nan) # ## established # In[551]: df.established # In[570]: odds = {} for i in df['established']: if not isDigit(i): try: odds[i] += 1 except: odds[i] = 1 odds.items() # In[571]: import re def getYear(s): try: match = re.match(r'.*([1-3][0-9]{3})', s) return np.int(match.group(1)) except: return np.nan # In[572]: df.established = [getYear(i) for i in df.established] # In[573]: df.describe() # In[ ]: # # University # In[90]: df['university'].unique(), df['university'].unique().size # In[236]: # university_df = df.groupby('university').size() # for k in university_df.index: # print(k, university_df[k]) # In[ ]: