#!/usr/bin/env python
# coding: utf-8

# # Data Cleaning and Preparation
#
# Notebook export: every `# In[n]:` marker starts one notebook cell. A bare
# expression at the end of a cell is what the notebook displayed; when the
# file runs as a plain script those expressions are evaluated and discarded.

# In[1]:

import numpy as np
import pandas as pd
from numpy import nan as NA

# ## Missing data

# In[3]:

string_data = pd.Series(["aardvark", "artichoke", np.nan, "avocado"])
string_data

# In[14]:

# Check for NAs, i.e. "not available", or NaN, i.e. "not a number".
string_data.isnull()

# In[15]:

# None is reported as missing by isnull()/dropna() just like NaN.
string_data[0] = None
string_data

# In[16]:

string_data.dropna()

# In[39]:

df = pd.DataFrame(
    [
        [1, 2, 3, 4, 5, 7],
        [4, 5, 6, None, NA, 7],
        [NA, NA, NA, NA, NA, NA],
        [4, 5, NA, NA, None, 6],
    ]
)
df

# In[28]:

# Default how="any": drop every row holding at least one missing value.
df.dropna()

# In[29]:

# Drop only the rows in which every value is missing.
df.dropna(how="all")

# In[40]:

# Add a column (label 6) that is missing in every row.
df[6] = NA
df

# In[37]:

# axis=1 applies the rule to columns, so only the all-missing column goes.
df.dropna(axis=1, how="all")

# In[41]:

df2 = df.fillna(0)
df2

# In[42]:

# Fill each gap with its column mean. Column 6 has no observed values, so
# its mean is NaN and the column stays missing.
df3 = df.fillna(df.mean())
df3

# ## Data transformation

# In[50]:

df = pd.DataFrame(
    {
        "h1": ["one", "two", "three"] * 3,
        "h2": [1, 2, 2, 1, 5, 5, 2, 2, 5],
    }
)
df

# In[51]:

# True for each row that repeats an earlier row.
df.duplicated()

# In[52]:

df2 = df.drop_duplicates()
df2

# In[58]:

# De-duplicate on the "h1" values alone.
df2 = df["h1"].drop_duplicates()
df2

# In[59]:

data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "Pastrami",
            "corned beef",
            "Bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

# In[60]:

meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

# In[65]:

# The mapping keys are lower case while some food names are capitalised,
# so normalise the case before the lookup.
lowered = data["food"].str.lower()
lowered

# In[67]:

data["animal"] = lowered.map(meat_to_animal)
data

# In[68]:

# NOTE: `data` is rebound here from the food DataFrame to a Series.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

# In[69]:

# Replace a single value with NaN.
data.replace(-999, np.nan)

# In[71]:

# Replace several values with the same substitute.
data.replace([-999, -1000], np.nan)

# In[73]:

# Give each value its own substitute.
data.replace({-999: np.nan, -1000: 0})

# In[20]:

df = pd.DataFrame(
    np.arange(1, 13).reshape(3, 4),
    index=["ohio", "colorado", "ny"],
    columns=["one", "two", "three", "four"],
)
df

# In[33]:


def to_upper(x):
    """Return *x* converted to upper case."""
    return x.upper()


# In[34]:

# Index.map returns a new Index; df itself is unchanged by this cell.
df.index.map(to_upper)

# In[36]:

df.index = df.index.map(to_upper)
df

# In[39]:

# rename returns a relabelled copy (upper-case index, title-case columns).
df.rename(index=str.upper, columns=str.title)
# In[65]:

# pandas/numpy are imported in this cell so the section does not depend on
# names bound by earlier cells.
import numpy as np
import pandas as pd

age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
age

# In[60]:

# Bin edges; pd.cut builds intervals that are open on the left and closed
# on the right, e.g. (18, 25].
bins = [18, 25, 35, 60, 100]

# In[67]:

cats = pd.cut(age, bins)
cats

# In[68]:

cats.categories

# In[69]:

# Integer position of the bin each age fell into.
cats.codes

# In[70]:

# The top-level pd.value_counts() is deprecated; count through a Series.
pd.Series(cats).value_counts()

# In[74]:

df = pd.DataFrame(np.random.randn(1000, 4))
df.head()

# In[75]:

df.describe()

# In[79]:

# Entries of column 2 whose magnitude exceeds 3. A single .loc lookup
# replaces the chained df[2][mask] indexing.
df.loc[df[2].abs() > 3, 2]

# In[81]:

# Random sample of 75% of the rows.
df.sample(frac=0.75).head()

# ## String manipulation

# In[82]:

val = "a, b, guide"
val

# In[83]:

val.split(",")

# In[95]:

# Splitting on "," leaves the space after each comma; strip it off.
pieces = [x.strip() for x in val.split(",")]
pieces

# In[96]:

"__".join(pieces)

# In[97]:

val.count("a")

# ### Regular expressions
# Flexible way to search or match string patterns

# In[98]:

import re

# In[100]:

text = "foo bar\t baz \tqux"
text

# In[103]:

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
re.split(r"\s+", text)

# In[104]:

# Compile once when the same pattern is going to be reused.
regex = re.compile(r"\s+")

# In[105]:

regex.split(text)

# In[ ]: