import pandas as pd
import numpy as np
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [1, 1, 2, 3, 3, 4, 4]})
data
 | k1 | k2 |
---|---|---|
0 | one | 1 |
1 | one | 1 |
2 | one | 2 |
3 | two | 3 |
4 | two | 3 |
5 | two | 4 |
6 | two | 4 |
# The DataFrame method duplicated returns a boolean Series indicating whether each
# row is a duplicate or not
data.duplicated()
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
# drop_duplicates returns a DataFrame with the duplicated rows dropped:
data.drop_duplicates()
 | k1 | k2 |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
# Both of these methods consider all of the columns by default; alternatively, you
# can specify any subset of them to detect duplicates:
data.drop_duplicates(['k1'])
 | k1 | k2 |
---|---|---|
0 | one | 1 |
3 | two | 3 |
data.drop_duplicates(['k2'])
 | k1 | k2 |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
# duplicated and drop_duplicates keep the first observed value combination by default.
# Passing take_last=True returns the last one (deprecated in newer pandas; use keep='last'):
data.drop_duplicates(['k1', 'k2'], take_last=True)
FutureWarning: the take_last=True keyword is deprecated, use keep='last' instead
 | k1 | k2 |
---|---|---|
1 | one | 1 |
2 | one | 2 |
4 | two | 3 |
6 | two | 4 |
data.drop_duplicates(['k1', 'k2'], keep='last')
 | k1 | k2 |
---|---|---|
1 | one | 1 |
2 | one | 2 |
4 | two | 3 |
6 | two | 4 |
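# As a side note, both methods also accept keep=False, which flags every member of a
# duplicate group; this is handy for inspecting all of the offending rows at once
# (a small sketch, reusing the data frame above):
data[data.duplicated(['k1', 'k2'], keep=False)]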
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
'corned beef', 'Bacon', 'pastrami', 'honey ham',
'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
 | food | ounces |
---|---|---|
0 | bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
meat_to_animal
{'bacon': 'pig', 'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon', 'pastrami': 'cow', 'pulled pork': 'pig'}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
 | food | ounces | animal |
---|---|---|---|
0 | bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
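# An equivalent, more idiomatic spelling uses the vectorized .str accessor for the
# lowercasing step (a sketch; the result should match the map(str.lower) version):
data['food'].str.lower().map(meat_to_animal)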
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
# The -999 values might be sentinel values for missing data. To replace these with NA
# values that pandas understands, we can use replace
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
# To replace multiple values at once, pass a list:
data.replace([-999,-1000], np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
# To use a different replacement for each value, pass a list of substitutes:
data.replace([-999,-1000], [np.nan, 0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
# The argument passed can also be a dict:
data.replace({-999: np.nan, -1000: 0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
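# replace works on DataFrames too; a nested dict limits each replacement to a given
# column. A minimal sketch with a hypothetical frame:
frame = pd.DataFrame({'a': [1., -999., 3.], 'b': [-999., 5., -1000.]})
frame.replace({'a': {-999: np.nan}, 'b': {-999: np.nan, -1000: 0}})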
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
 | one | two | three | four |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
data.index.map(str.upper)
array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)
data.index = data.index.map(str.upper)
data
 | one | two | three | four |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
# To create a transformed version of a data set without modifying the original,
# a useful method is rename:
data.rename(index=str.title, columns=str.upper)
 | ONE | TWO | THREE | FOUR |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
# rename can be used in conjunction with a dict-like object providing new values
# for a subset of the axis labels
data.rename(index={'OHIO': 'INDIANA'},
columns={'three': 'peekaboo'})
 | one | two | peekaboo | four |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
# rename saves having to copy the DataFrame manually and assign to its index and columns
# attributes. Should you wish to modify a data set in place, pass inplace=True
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
 | one | two | three | four |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
# Continuous data is often discretized or otherwise separated into “bins” for analysis.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
ages
[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
bins
[18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.labels
FutureWarning: 'labels' is deprecated. Use 'codes' instead
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
[bins[i] for i in cats.codes]
[18, 18, 18, 25, 18, 18, 35, 25, 60, 35, 35, 25]
cats.levels
FutureWarning: Accessing 'levels' is deprecated, use 'categories'
Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')
cats.categories
Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')
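# The codes are integer positions into categories, so the full bin label for each
# observation (not just the left edge) can be recovered directly (a sketch using
# the objects above):
cats.categories.take(cats.codes)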
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
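# value_counts sorts by frequency by default; pass sort=False to keep the bins in
# their natural interval order (a small sketch):
pd.value_counts(cats, sort=False)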
# Consistent with mathematical notation for intervals, a parenthesis means that the side
# is open while the square bracket means it is closed (inclusive). Which side is closed can
# be changed by passing right=False:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# You can also pass your own bin names by passing a list or array to the labels option:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
# If you pass cut an integer number of bins instead of explicit bin edges, it will
# compute equal-length bins based on the minimum and maximum values in the data.
data = np.random.rand(20)
data
array([ 0.30040941, 0.42335852, 0.73095696, 0.74586086, 0.99197458, 0.10334217, 0.53164815, 0.63212415, 0.29426088, 0.6753901 , 0.32015222, 0.26670971, 0.32528884, 0.51031056, 0.33031443, 0.59880679, 0.69966579, 0.32159514, 0.71322471, 0.82226222])
cat=pd.cut(data, 4, precision=2)
cat
[(0.1, 0.33], (0.33, 0.55], (0.55, 0.77], (0.55, 0.77], (0.77, 0.99], ..., (0.55, 0.77], (0.55, 0.77], (0.1, 0.33], (0.55, 0.77], (0.77, 0.99]]
Length: 20
Categories (4, object): [(0.1, 0.33] < (0.33, 0.55] < (0.55, 0.77] < (0.77, 0.99]]
cat.categories
Index(['(0.1, 0.33]', '(0.33, 0.55]', '(0.55, 0.77]', '(0.77, 0.99]'], dtype='object')
cat.codes
array([0, 1, 2, 2, 3, 0, 1, 2, 0, 2, 0, 0, 0, 1, 1, 2, 2, 0, 2, 3], dtype=int8)
pd.value_counts(cat)
(0.55, 0.77]    7
(0.1, 0.33]     7
(0.33, 0.55]    4
(0.77, 0.99]    2
dtype: int64
cat=pd.cut(data, 7, precision=2)
cat
[(0.23, 0.36], (0.36, 0.48], (0.61, 0.74], (0.74, 0.87], (0.87, 0.99], ..., (0.48, 0.61], (0.61, 0.74], (0.23, 0.36], (0.61, 0.74], (0.74, 0.87]]
Length: 20
Categories (7, object): [(0.1, 0.23] < (0.23, 0.36] < (0.36, 0.48] < (0.48, 0.61] < (0.61, 0.74] < (0.74, 0.87] < (0.87, 0.99]]
pd.value_counts(cat)
(0.23, 0.36]    7
(0.61, 0.74]    5
(0.48, 0.61]    3
(0.74, 0.87]    2
(0.87, 0.99]    1
(0.36, 0.48]    1
(0.1, 0.23]     1
dtype: int64
# A closely related function, qcut, bins the data based on sample quantiles.
data = np.random.randn(1000)
cats = pd.qcut(data, 4) # Cut into quartiles
cats
[(-0.605, 0.0514], (-0.605, 0.0514], [-3.296, -0.605], [-3.296, -0.605], [-3.296, -0.605], ..., (0.0514, 0.735], (0.735, 3.246], (0.0514, 0.735], (-0.605, 0.0514], (0.735, 3.246]]
Length: 1000
Categories (4, object): [[-3.296, -0.605] < (-0.605, 0.0514] < (0.0514, 0.735] < (0.735, 3.246]]
pd.value_counts(cats)
(0.735, 3.246]      250
(0.0514, 0.735]     250
(-0.605, 0.0514]    250
[-3.296, -0.605]    250
dtype: int64
# Similar to cut, you can pass your own quantiles (numbers between 0 and 1, inclusive):
cat=pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
cat
[(-1.251, 0.0514], (-1.251, 0.0514], [-3.296, -1.251], [-3.296, -1.251], [-3.296, -1.251], ..., (0.0514, 1.281], (1.281, 3.246], (0.0514, 1.281], (-1.251, 0.0514], (1.281, 3.246]]
Length: 1000
Categories (4, object): [[-3.296, -1.251] < (-1.251, 0.0514] < (0.0514, 1.281] < (1.281, 3.246]]
pd.value_counts(cat)
(0.0514, 1.281]     400
(-1.251, 0.0514]    400
(1.281, 3.246]      100
[-3.296, -1.251]    100
dtype: int64
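# Like cut, qcut accepts a labels option, so the quartiles can carry readable names
# (a sketch with hypothetical labels):
pd.qcut(data, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])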
# Filtering or transforming outliers is largely a matter of applying array operations.
np.random.seed(12345)
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.067684 | 0.067924 | 0.025598 | -0.002298 |
std | 0.998035 | 0.992106 | 1.006835 | 0.996794 |
min | -3.428254 | -3.548824 | -3.184377 | -3.745356 |
25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
max | 3.366626 | 2.653656 | 3.260383 | 3.927528 |
# Suppose you wanted to find values in one of the columns exceeding three in magnitude:
col = data[3]
col[np.abs(col) > 3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
# To select all rows having a value exceeding 3 or -3, you can use the any method on a
# boolean DataFrame:
data[(np.abs(data) > 3).any(1)]
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
5 | -0.539741 | 0.476985 | 3.248944 | -1.021228 |
97 | -0.774363 | 0.552936 | 0.106061 | 3.927528 |
102 | -0.655054 | -0.565230 | 3.176873 | 0.959533 |
305 | -2.315555 | 0.457246 | -0.025907 | -3.399312 |
324 | 0.050188 | 1.951312 | 3.260383 | 0.963301 |
400 | 0.146326 | 0.508391 | -0.196713 | -3.745356 |
499 | -0.293333 | -0.242459 | -3.056990 | 1.918403 |
523 | -3.428254 | -0.296336 | -0.439938 | -0.867165 |
586 | 0.275144 | 1.179227 | -3.184377 | 1.369891 |
808 | -0.362528 | -3.548824 | 1.553205 | -2.186301 |
900 | 3.366626 | -2.372214 | 0.851010 | 1.332846 |
# Here is code to cap values outside the interval -3 to 3:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.067623 | 0.068473 | 0.025153 | -0.002081 |
std | 0.995485 | 0.990253 | 1.003977 | 0.989736 |
min | -3.000000 | -3.000000 | -3.000000 | -3.000000 |
25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
max | 3.000000 | 2.653656 | 3.000000 | 3.000000 |
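# The same capping can be written more compactly with the built-in clip method,
# which bounds all values to an interval in one call (a sketch equivalent to the
# sign-based code above):
data.clip(-3, 3).describe()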
# Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using
# the numpy.random.permutation function. Calling permutation with the length of the axis
# you want to permute produces an array of integers indicating the new ordering:
df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
df
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
sampler = np.random.permutation(5)
sampler
array([1, 3, 4, 0, 2])
df.take(sampler)
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
1 | 4 | 5 | 6 | 7 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
0 | 0 | 1 | 2 | 3 |
2 | 8 | 9 | 10 | 11 |
# To select a random subset without replacement, one way is to slice off the first k
# elements of the array returned by permutation, where k is the desired subset size.
# There are much more efficient sampling-without-replacement algorithms, but this is
# an easy strategy that uses readily available tools:
df.take(np.random.permutation(len(df))[:3])
 | 0 | 1 | 2 | 3 |
---|---|---|---|---|
1 | 4 | 5 | 6 | 7 |
3 | 12 | 13 | 14 | 15 |
0 | 0 | 1 | 2 | 3 |
# To generate a sample with replacement, the fastest way is to use np.random.randint to
# draw random integers:
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler
array([2, 2, 0, 3, 0, 4, 1, 1, 2, 3])
draws = bag.take(sampler)
draws
array([-1, -1, 5, 6, 5, 4, 7, 7, -1, 6])
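# Newer versions bundle these patterns up: np.random.choice samples with or without
# replacement, and DataFrame.sample (pandas 0.16.1+) does the same for rows
# (a sketch of both, reusing bag and df from above):
np.random.choice(bag, size=10, replace=True)   # with replacement
df.sample(n=3)                                 # 3 rows without replacement
df.sample(n=10, replace=True)                  # 10 rows with replacement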
Another type of transformation for statistical modeling or machine learning applications is converting a categorical variable into a “dummy” or “indicator” matrix. If a column in a DataFrame has k distinct values, you would derive a matrix or DataFrame with k columns containing all 1s and 0s. pandas has a get_dummies function for doing this, though devising one yourself is not difficult. Let’s return to an earlier example DataFrame:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df
 | data1 | key |
---|---|---|
0 | 0 | b |
1 | 1 | b |
2 | 2 | a |
3 | 3 | c |
4 | 4 | a |
5 | 5 | b |
pd.get_dummies(df['key'])
 | a | b | c |
---|---|---|---|
0 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | 1.0 | 0.0 |
2 | 1.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 |
4 | 1.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 |
dummies = pd.get_dummies(df['key'], prefix='key')
dummies
 | key_a | key_b | key_c |
---|---|---|---|
0 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | 1.0 | 0.0 |
2 | 1.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 |
4 | 1.0 | 0.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 |
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
 | data1 | key_a | key_b | key_c |
---|---|---|---|---|
0 | 0 | 0.0 | 1.0 | 0.0 |
1 | 1 | 0.0 | 1.0 | 0.0 |
2 | 2 | 1.0 | 0.0 | 0.0 |
3 | 3 | 0.0 | 0.0 | 1.0 |
4 | 4 | 1.0 | 0.0 | 0.0 |
5 | 5 | 0.0 | 1.0 | 0.0 |
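# A useful recipe for statistical applications is to combine get_dummies with a
# discretization function like cut, producing one indicator column per bin
# (a sketch on hypothetical uniform draws):
values = np.random.rand(10)
pd.get_dummies(pd.cut(values, [0, 0.2, 0.4, 0.6, 0.8, 1.]))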
# Read in the movie data:
movies = pd.read_csv("dataset/movies.csv")
movies.head()
 | movieId | title | genres |
---|---|---|---|
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
# Adding indicator variables for each genre requires a little bit of wrangling. First, we
# extract the list of unique genres in the dataset (using a nice set.union trick):
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
genres
['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1  # .ix is deprecated; .loc is the label-based equivalent
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]  # .ix is deprecated; .iloc selects by position
movieId                                                                 1
title                                                    Toy Story (1995)
genres                        Adventure|Animation|Children|Comedy|Fantasy
Genre_(no genres listed)                                                0
Genre_Action                                                            0
Genre_Adventure                                                         1
Genre_Animation                                                         1
Genre_Children                                                          1
Genre_Comedy                                                            1
Genre_Crime                                                             0
Genre_Documentary                                                       0
Genre_Drama                                                             0
Genre_Fantasy                                                           1
Genre_Film-Noir                                                         0
Genre_Horror                                                            0
Genre_IMAX                                                              0
Genre_Musical                                                           0
Genre_Mystery                                                           0
Genre_Romance                                                           0
Genre_Sci-Fi                                                            0
Genre_Thriller                                                          0
Genre_War                                                               0
Genre_Western                                                           0
Name: 0, dtype: object
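# For delimiter-separated memberships like the genres column, pandas also offers a
# one-liner: Series.str.get_dummies splits on the separator and builds the whole
# indicator frame in one pass (a sketch equivalent to the loop above):
movies.genres.str.get_dummies('|').add_prefix('Genre_')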