Data Cleaning and Preparation¶

In [1]:

import pandas as pd
import numpy as np

Missing data¶

In [3]:

string_data = pd.Series(["aardvark", "artichoke", np.nan, "avocado"])
string_data

Out[3]:

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [14]:

# check for NAs i.e "not available" or NaN i.e "not a number"
string_data.isnull()

Out[14]:

0    False
1    False
2     True
3    False
dtype: bool

In [15]:

string_data[0] = None
string_data

Out[15]:

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [16]:

string_data.dropna()

Out[16]:

1    artichoke
3      avocado
dtype: object

In [39]:

from numpy import nan as NA
df = pd.DataFrame([[1, 2, 3, 4, 5, 7], 
                  [4, 5, 6, None, NA, 7],
                   [NA, NA, NA, NA, NA, NA],
                  [4, 5, NA, NA, None, 6]])
df

Out[39]:

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

In [28]:

df.dropna()

Out[28]:

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0

In [29]:

df.dropna(how = "all")

Out[29]:

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
3	4.0	5.0	NaN	NaN	NaN	6.0

In [40]:

df[6] = NA
df

Out[40]:

	0	1	2	3	4	5	6
0	1.0	2.0	3.0	4.0	5.0	7.0	NaN
1	4.0	5.0	6.0	NaN	NaN	7.0	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0	NaN

In [37]:

df.dropna(axis = 1, how = "all")

Out[37]:

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

In [41]:

df2 = df.fillna(0)
df2

Out[41]:

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	0.0	0.0	7.0
2	0.0	0.0	0.0	0.0	0.0	0.0
3	4.0	5.0	0.0	0.0	0.0	6.0

In [42]:

df3 = df.fillna(df.mean())
df3

Out[42]:

	0	1	2	3	4	5	6
0	1.0	2.0	3.0	4.0	5.0	7.000000	NaN
1	4.0	5.0	6.0	4.0	5.0	7.000000	NaN
2	3.0	4.0	4.5	4.0	5.0	6.666667	NaN
3	4.0	5.0	4.5	4.0	5.0	6.000000	NaN

Data transformation¶

In [50]:

df = pd.DataFrame({"h1" : ["one", "two", "three"] * 3, 
                  "h2" : [1, 2, 2, 1, 5, 5, 2, 2, 5]})
df

Out[50]:

	h1	h2
0	one	1
1	two	2
2	three	2
3	one	1
4	two	5
5	three	5
6	one	2
7	two	2
8	three	5

In [51]:

df.duplicated()

Out[51]:

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7     True
8     True
dtype: bool

In [52]:

df2 = df.drop_duplicates()
df2

Out[52]:

	h1	h2
0	one	1
1	two	2
2	three	2
4	two	5
5	three	5
6	one	2

In [58]:

df2 = df["h1"].drop_duplicates()
df2

Out[58]:

0      one
1      two
2    three
Name: h1, dtype: object

In [59]:

data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon','Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Out[59]:

	food	ounces
0	bacon	4.0
1	pulled pork	3.0
2	bacon	12.0
3	Pastrami	6.0
4	corned beef	7.5
5	Bacon	8.0
6	pastrami	3.0
7	honey ham	5.0
8	nova lox	6.0

In [60]:

meat_to_animal = {  
    'bacon': 'pig',  
    'pulled pork': 'pig',  
    'pastrami': 'cow',  
    'corned beef': 'cow',  
    'honey ham': 'pig',  
    'nova lox': 'salmon'}

In [65]:

lowered = data.food.str.lower()
lowered

Out[65]:

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [67]:

data["animal"] = lowered.map(meat_to_animal)
data

Out[67]:

	food	ounces	animal
0	bacon	4.0	pig
1	pulled pork	3.0	pig
2	bacon	12.0	pig
3	Pastrami	6.0	cow
4	corned beef	7.5	cow
5	Bacon	8.0	pig
6	pastrami	3.0	cow
7	honey ham	5.0	pig
8	nova lox	6.0	salmon

In [68]:

data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

Out[68]:

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [69]:

data.replace(-999, np.nan)

Out[69]:

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [71]:

data.replace([-999, -1000], np.nan)

Out[71]:

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [73]:

data.replace({-999 : np.nan, 
             -1000: 0})

Out[73]:

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [20]:

df = pd.DataFrame(np.arange(start = 1, stop = 13, step = 1).reshape(3, 4))
df.index = ["ohio", "colorado", "ny"]
df.columns = ["one", "two", "three", "four"]
df

Out[20]:

	one	two	three	four
ohio	1	2	3	4
colorado	5	6	7	8
ny	9	10	11	12

In [33]:

to_upper = lambda x: x.upper()

In [34]:

df.index.map(to_upper)

Out[34]:

Index(['OHIO', 'COLORADO', 'NY'], dtype='object')

In [36]:

df.index = df.index.map(to_upper)
df

Out[36]:

	one	two	three	four
OHIO	1	2	3	4
COLORADO	5	6	7	8
NY	9	10	11	12

In [39]:

df.rename(index = str.upper, columns = str.title)

Out[39]:

	One	Two	Three	Four
OHIO	1	2	3	4
COLORADO	5	6	7	8
NY	9	10	11	12

In [65]:

age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
age

Out[65]:

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [60]:

bins =[18, 25, 35, 60, 100]

In [67]:

cats = pd.cut(age, bins)
cats

Out[67]:

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [68]:

cats.categories

Out[68]:

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [69]:

cats.codes

Out[69]:

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [70]:

pd.value_counts(cats)

Out[70]:

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [74]:

df = pd.DataFrame(np.random.randn(1000, 4))
df.head()

Out[74]:

	0	1	2	3
0	-1.060885	0.257677	-0.439057	-0.585580
1	0.992414	0.848633	0.279456	1.804831
2	-0.465408	-1.573807	0.313331	0.939957
3	0.247694	-0.232562	-0.763437	3.137660
4	-0.455083	1.724922	-0.467447	0.595734

In [75]:

df.describe()

Out[75]:

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.024221	-0.005284	-0.007271	0.058116
std	0.999047	0.981500	1.027020	0.993744
min	-3.060966	-3.082180	-3.907713	-3.536450
25%	-0.651549	-0.667142	-0.709838	-0.641517
50%	0.028659	0.023977	-0.028669	0.048662
75%	0.683996	0.725258	0.675920	0.739192
max	3.759436	2.994700	3.263231	3.197917

In [79]:

df[2][np.abs(df[2]) > 3]

Out[79]:

145   -3.907713
971    3.263231
Name: 2, dtype: float64

In [81]:

df.sample(frac = 0.75).head()

Out[81]:

	0	1	2	3
555	-0.055178	-0.704377	1.067797	-0.264149
295	-0.338811	-0.589736	-0.562751	-0.357064
329	0.634184	0.472945	-0.562836	-0.461018
727	-0.735358	-2.652772	-2.111481	0.254709
484	1.133932	0.770841	1.053564	-1.719157

String manipulation¶

In [82]:

val = "a, b,  guide"
val

Out[82]:

'a, b,  guide'

In [83]:

val.split(",")

Out[83]:

['a', ' b', '  guide']

In [95]:

pieces = [x.strip() for x in val.split(",")]
pieces

Out[95]:

['a', 'b', 'guide']

In [96]:

"__".join(pieces)

Out[96]:

'a__b__guide'

In [97]:

val.count("a")

Out[97]:

Regular expressions¶

Flexible way to search or match string patterns

In [98]:

import re

In [100]:

text = "foo     bar\t baz   \tqux"
text

Out[100]:

'foo     bar\t baz   \tqux'

In [103]:

re.split("\s+", text)

Out[103]:

['foo', 'bar', 'baz', 'qux']

In [104]:

regex = re.compile("\s+")

In [105]:

regex.split(text)

Out[105]:

['foo', 'bar', 'baz', 'qux']

In [ ]:

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

	0	1	2	3	4	5	6
0	1.0	2.0	3.0	4.0	5.0	7.0	NaN
1	4.0	5.0	6.0	NaN	NaN	7.0	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0	NaN

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	0.0	0.0	7.0
2	0.0	0.0	0.0	0.0	0.0	0.0
3	4.0	5.0	0.0	0.0	0.0	6.0

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

	0	1	2	3	4	5	6
0	1.0	2.0	3.0	4.0	5.0	7.0	NaN
1	4.0	5.0	6.0	NaN	NaN	7.0	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0	NaN

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	0.0	0.0	7.0
2	0.0	0.0	0.0	0.0	0.0	0.0
3	4.0	5.0	0.0	0.0	0.0	6.0

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

	0	1	2	3	4	5	6
0	1.0	2.0	3.0	4.0	5.0	7.0	NaN
1	4.0	5.0	6.0	NaN	NaN	7.0	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0	NaN

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	NaN	NaN	7.0
2	NaN	NaN	NaN	NaN	NaN	NaN
3	4.0	5.0	NaN	NaN	NaN	6.0

	0	1	2	3	4	5
0	1.0	2.0	3.0	4.0	5.0	7.0
1	4.0	5.0	6.0	0.0	0.0	7.0
2	0.0	0.0	0.0	0.0	0.0	0.0
3	4.0	5.0	0.0	0.0	0.0	6.0