import pandas as pd
import numpy as np
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [1, 1, 2, 3, 3, 4, 4]})
data
 | k1 | k2 |
---|---|---|
0 | one | 1 |
1 | one | 1 |
2 | one | 2 |
3 | two | 3 |
4 | two | 3 |
5 | two | 4 |
6 | two | 4 |
# The DataFrame method duplicated returns a boolean Series indicating whether each
# row is a duplicate or not
data.duplicated()
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
# drop_duplicates returns a DataFrame with the duplicated rows dropped:
data.drop_duplicates()
 | k1 | k2 |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
# Both of these methods consider all of the columns by default; alternatively, you
# can specify any subset of them to detect duplicates:
data.drop_duplicates(['k1'])
 | k1 | k2 |
---|---|---|
0 | one | 1 |
3 | two | 3 |
data.drop_duplicates(['k2'])
 | k1 | k2 |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
# duplicated and drop_duplicates keep the first observed value combination by default.
# Passing take_last=True returns the last one (deprecated in newer pandas; use keep='last'):
data.drop_duplicates(['k1', 'k2'], take_last=True)
FutureWarning: the take_last=True keyword is deprecated, use keep='last' instead
 | k1 | k2 |
---|---|---|
1 | one | 1 |
2 | one | 2 |
4 | two | 3 |
6 | two | 4 |
data.drop_duplicates(['k1', 'k2'], keep='last')
 | k1 | k2 |
---|---|---|
1 | one | 1 |
2 | one | 2 |
4 | two | 3 |
6 | two | 4 |
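# As a side note, both methods also accept keep=False, which flags every member of a
# duplicate group; this is handy for inspecting all of the offending rows at once
# (a small sketch, reusing the data frame above):
data[data.duplicated(['k1', 'k2'], keep=False)]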
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
'corned beef', 'Bacon', 'pastrami', 'honey ham',
'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
 | food | ounces |
---|---|---|
0 | bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
meat_to_animal
{'bacon': 'pig', 'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon', 'pastrami': 'cow', 'pulled pork': 'pig'}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
 | food | ounces | animal |
---|---|---|---|
0 | bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
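# An equivalent, more idiomatic spelling uses the vectorized .str accessor for the
# lowercasing step (a sketch; the result should match the map(str.lower) version):
data['food'].str.lower().map(meat_to_animal)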
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
# The -999 values might be sentinel values for missing data. To replace these with NA
# values that pandas understands, we can use replace
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# To replace multiple values at once, pass a list:
data.replace([-999,-1000], np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
# To use a different replacement for each value, pass a list of substitutes:
data.replace([-999,-1000], [np.nan, 0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
# The argument passed can also be a dict:
data.replace({-999: np.nan, -1000: 0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
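# replace works on DataFrames too; a nested dict limits each replacement to a given
# column. A minimal sketch with a hypothetical frame:
frame = pd.DataFrame({'a': [1., -999., 3.], 'b': [-999., 5., -1000.]})
frame.replace({'a': {-999: np.nan}, 'b': {-999: np.nan, -1000: 0}})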
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
 | one | two | three | four |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
data.index.map(str.upper)
array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)
data.index = data.index.map(str.upper)
data
 | one | two | three | four |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
# To create a transformed version of a data set without modifying the original,
# a useful method is rename:
data.rename(index=str.title, columns=str.upper)
 | ONE | TWO | THREE | FOUR |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
# rename can be used in conjunction with a dict-like object providing new values
# for a subset of the axis labels
data.rename(index={'OHIO': 'INDIANA'},
columns={'three': 'peekaboo'})
 | one | two | peekaboo | four |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
# rename saves having to copy the DataFrame manually and assign to its index and columns
# attributes. Should you wish to modify a data set in place, pass inplace=True
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
 | one | two | three | four |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
# Continuous data is often discretized or otherwise separated into “bins” for analysis.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
ages
[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
bins
[18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.labels
FutureWarning: 'labels' is deprecated. Use 'codes' instead
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
[bins[i] for i in cats.codes]
[18, 18, 18, 25, 18, 18, 35, 25, 60, 35, 35, 25]
cats.levels
FutureWarning: Accessing 'levels' is deprecated, use 'categories'
Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')
cats.categories
Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')
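# The codes are integer positions into categories, so the full bin label for each
# observation (not just the left edge) can be recovered directly (a sketch using
# the objects above):
cats.categories.take(cats.codes)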
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
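# value_counts sorts by frequency by default; pass sort=False to keep the bins in
# their natural interval order (a small sketch):
pd.value_counts(cats, sort=False)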
# Consistent with mathematical notation for intervals, a parenthesis means that the side
# is open while the square bracket means it is closed (inclusive). Which side is closed can
# be changed by passing right=False:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# You can also pass your own bin names by passing a list or array to the labels option:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
# If you pass cut an integer number of bins instead of explicit bin edges, it will
# compute equal-length bins based on the minimum and maximum values in the data.
data = np.random.rand(20)
data
array([ 0.30040941, 0.42335852, 0.73095696, 0.74586086, 0.99197458, 0.10334217, 0.53164815, 0.63212415, 0.29426088, 0.6753901 , 0.32015222, 0.26670971, 0.32528884, 0.51031056, 0.33031443, 0.59880679, 0.69966579, 0.32159514, 0.71322471, 0.82226222])
cat=pd.cut(data, 4, precision=2)
cat
[(0.1, 0.33], (0.33, 0.55], (0.55, 0.77], (0.55, 0.77], (0.77, 0.99], ..., (0.55, 0.77], (0.55, 0.77], (0.1, 0.33], (0.55, 0.77], (0.77, 0.99]]
Length: 20
Categories (4, object): [(0.1, 0.33] < (0.33, 0.55] < (0.55, 0.77] < (0.77, 0.99]]
cat.categories
Index(['(0.1, 0.33]', '(0.33, 0.55]', '(0.55, 0.77]', '(0.77, 0.99]'], dtype='object')
cat.codes
array([0, 1, 2, 2, 3, 0, 1, 2, 0, 2, 0, 0, 0, 1, 1, 2, 2, 0, 2, 3], dtype=int8)
pd.value_counts(cat)
(0.55, 0.77]    7
(0.1, 0.33]     7
(0.33, 0.55]    4
(0.77, 0.99]    2
dtype: int64
cat=pd.cut(data, 7, precision=2)
cat
[(0.23, 0.36], (0.36, 0.48], (0.61, 0.74], (0.74, 0.87], (0.87, 0.99], ..., (0.48, 0.61], (0.61, 0.74], (0.23, 0.36], (0.61, 0.74], (0.74, 0.87]]
Length: 20
Categories (7, object): [(0.1, 0.23] < (0.23, 0.36] < (0.36, 0.48] < (0.48, 0.61] < (0.61, 0.74] < (0.74, 0.87] < (0.87, 0.99]]
pd.value_counts(cat)
(0.23, 0.36]    7
(0.61, 0.74]    5
(0.48, 0.61]    3
(0.74, 0.87]    2
(0.87, 0.99]    1
(0.36, 0.48]    1
(0.1, 0.23]     1
dtype: int64
# A closely related function, qcut, bins the data based on sample quantiles.
data = np.random.randn(1000)
cats = pd.qcut(data, 4) # Cut into quartiles
cats
[(-0.605, 0.0514], (-0.605, 0.0514], [-3.296, -0.605], [-3.296, -0.605], [-3.296, -0.605], ..., (0.0514, 0.735], (0.735, 3.246], (0.0514, 0.735], (-0.605, 0.0514], (0.735, 3.246]]
Length: 1000
Categories (4, object): [[-3.296, -0.605] < (-0.605, 0.0514] < (0.0514, 0.735] < (0.735, 3.246]]
pd.value_counts(cats)
(0.735, 3.246]      250
(0.0514, 0.735]     250
(-0.605, 0.0514]    250
[-3.296, -0.605]    250
dtype: int64
# Similar to cut, you can pass your own quantiles (numbers between 0 and 1, inclusive):
cat=pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
cat
[(-1.251, 0.0514], (-1.251, 0.0514], [-3.296, -1.251], [-3.296, -1.251], [-3.296, -1.251], ..., (0.0514, 1.281], (1.281, 3.246], (0.0514, 1.281], (-1.251, 0.0514], (1.281, 3.246]]
Length: 1000
Categories (4, object): [[-3.296, -1.251] < (-1.251, 0.0514] < (0.0514, 1.281] < (1.281, 3.246]]
pd.value_counts(cat)
(0.0514, 1.281]     400
(-1.251, 0.0514]    400
(1.281, 3.246]      100
[-3.296, -1.251]    100
dtype: int64
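# Like cut, qcut accepts a labels option, so the quartiles can carry readable names
# (a sketch with hypothetical labels):
pd.qcut(data, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])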
# Filtering or transforming outliers is largely a matter of applying array operations.
np.random.seed(12345)
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.067684 | 0.067924 | 0.025598 | -0.002298 |
std | 0.998035 | 0.992106 | 1.006835 | 0.996794 |
min | -3.428254 | -3.548824 | -3.184377 | -3.745356 |
25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
max | 3.366626 | 2.653656 | 3.260383 | 3.927528 |
# Suppose you wanted to find values in one of the columns exceeding three in magnitude:
col = data[3]
col[np.abs(col) > 3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
# To select all rows having a value exceeding 3 or -3, you can use the any method on a
# boolean DataFrame:
data[(np.abs(data) > 3).any(1)]
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
5 | -0.539741 | 0.476985 | 3.248944 | -1.021228 |
97 | -0.774363 | 0.552936 | 0.106061 | 3.927528 |
102 | -0.655054 | -0.565230 | 3.176873 | 0.959533 |
305 | -2.315555 | 0.457246 | -0.025907 | -3.399312 |
324 | 0.050188 | 1.951312 | 3.260383 | 0.963301 |
400 | 0.146326 | 0.508391 | -0.196713 | -3.745356 |
499 | -0.293333 | -0.242459 | -3.056990 | 1.918403 |
523 | -3.428254 | -0.296336 | -0.439938 | -0.867165 |
586 | 0.275144 | 1.179227 | -3.184377 | 1.369891 |
808 | -0.362528 | -3.548824 | 1.553205 | -2.186301 |
900 | 3.366626 | -2.372214 | 0.851010 | 1.332846 |
# Here is code to cap values outside the interval -3 to 3:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.067623 | 0.068473 | 0.025153 | -0.002081 |
std | 0.995485 | 0.990253 | 1.003977 | 0.989736 |
min | -3.000000 | -3.000000 | -3.000000 | -3.000000 |
25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
max | 3.000000 | 2.653656 | 3.000000 | 3.000000 |
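# The same capping can be written more compactly with the built-in clip method,
# which bounds all values to an interval in one call (a sketch equivalent to the
# sign-based code above):
data.clip(-3, 3).describe()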
# Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using
# the numpy.random.permutation function. Calling permutation with the length of the axis
# you want to permute produces an array of integers indicating the new ordering:
df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
df
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
sampler = np.random.permutation(5)
sampler
array([1, 3, 4, 0, 2])
df.take(sampler)
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
1 | 4 | 5 | 6 | 7 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
0 | 0 | 1 | 2 | 3 |
2 | 8 | 9 | 10 | 11 |
# To select a random subset without replacement, one way is to slice off the first k
# elements of the array returned by permutation, where k is the desired subset size.
# There are much more efficient sampling-without-replacement algorithms, but this is
# an easy strategy that uses readily available tools:
df.take(np.random.permutation(len(df))[:3])
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
1 | 4 | 5 | 6 | 7 |
3 | 12 | 13 | 14 | 15 |
0 | 0 | 1 | 2 | 3 |
# To generate a sample with replacement, the fastest way is to use np.random.randint to
# draw random integers:
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler
array([2, 2, 0, 3, 0, 4, 1, 1, 2, 3])
draws = bag.take(sampler)
draws
array([-1, -1, 5, 6, 5, 4, 7, 7, -1, 6])
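# Newer versions bundle these patterns up: np.random.choice samples with or without
# replacement, and DataFrame.sample (pandas 0.16.1+) does the same for rows
# (a sketch of both, reusing bag and df from above):
np.random.choice(bag, size=10, replace=True)   # with replacement
df.sample(n=3)                                 # 3 rows without replacement
df.sample(n=10, replace=True)                  # 10 rows with replacement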
Another type of transformation for statistical modeling or machine learning applications is converting a categorical variable into a “dummy” or “indicator” matrix. If a column in a DataFrame has k distinct values, you would derive a matrix or DataFrame with k columns containing all 1s and 0s. pandas has a get_dummies function for doing this, though devising one yourself is not difficult. Let’s return to an earlier example DataFrame:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df
 | data1 | key |
---|---|---|
0 | 0 | b |
1 | 1 | b |
2 | 2 | a |
3 | 3 | c |
4 | 4 | a |
5 | 5 | b |
pd.get_dummies(df['key'])
 | a | b | c |
---|---|---|---|
0 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | 1.0 | 0.0 |
2 | 1.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 |
4 | 1.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 |
dummies = pd.get_dummies(df['key'], prefix='key')
dummies
 | key_a | key_b | key_c |
---|---|---|---|
0 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | 1.0 | 0.0 |
2 | 1.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 |
4 | 1.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 |
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
 | data1 | key_a | key_b | key_c |
---|---|---|---|---|
0 | 0 | 0.0 | 1.0 | 0.0 |
1 | 1 | 0.0 | 1.0 | 0.0 |
2 | 2 | 1.0 | 0.0 | 0.0 |
3 | 3 | 0.0 | 0.0 | 1.0 |
4 | 4 | 1.0 | 0.0 | 0.0 |
5 | 5 | 0.0 | 1.0 | 0.0 |
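# A useful recipe for statistical applications is to combine get_dummies with a
# discretization function like cut, producing one indicator column per bin
# (a sketch on hypothetical uniform draws):
values = np.random.rand(10)
pd.get_dummies(pd.cut(values, [0, 0.2, 0.4, 0.6, 0.8, 1.]))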
# Read in the movie data:
movies = pd.read_csv("dataset/movies.csv")
movies.head()
 | movieId | title | genres |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
# Adding indicator variables for each genre requires a little bit of wrangling. First, we
# extract the list of unique genres in the dataset (using a nice set.union trick):
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
genres
['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1  # .ix is deprecated; .loc is the label-based equivalent
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]  # .ix is deprecated; .iloc selects by position
movieId                                                                 1
title                                                    Toy Story (1995)
genres                        Adventure|Animation|Children|Comedy|Fantasy
Genre_(no genres listed)                                                0
Genre_Action                                                            0
Genre_Adventure                                                         1
Genre_Animation                                                         1
Genre_Children                                                          1
Genre_Comedy                                                            1
Genre_Crime                                                             0
Genre_Documentary                                                       0
Genre_Drama                                                             0
Genre_Fantasy                                                           1
Genre_Film-Noir                                                         0
Genre_Horror                                                            0
Genre_IMAX                                                              0
Genre_Musical                                                           0
Genre_Mystery                                                           0
Genre_Romance                                                           0
Genre_Sci-Fi                                                            0
Genre_Thriller                                                          0
Genre_War                                                               0
Genre_Western                                                           0
Name: 0, dtype: object
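# For delimiter-separated memberships like the genres column, pandas also offers a
# one-liner: Series.str.get_dummies splits on the separator and builds the whole
# indicator frame in one pass (a sketch equivalent to the loop above):
movies.genres.str.get_dummies('|').add_prefix('Genre_')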