Import necessary dependencies and settings

In [1]:
import pandas as pd
import numpy as np

Transforming Nominal Features

In [2]:
vg_df = pd.read_csv('datasets/vgsales.csv', encoding='utf-8')
vg_df[['Name', 'Platform', 'Year', 'Genre', 'Publisher']].iloc[1:7]
Out[2]:
Name Platform Year Genre Publisher
1 Super Mario Bros. NES 1985.0 Platform Nintendo
2 Mario Kart Wii Wii 2008.0 Racing Nintendo
3 Wii Sports Resort Wii 2009.0 Sports Nintendo
4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo
5 Tetris GB 1989.0 Puzzle Nintendo
6 New Super Mario Bros. DS 2006.0 Platform Nintendo
In [3]:
genres = np.unique(vg_df['Genre'])
genres
Out[3]:
array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)
In [4]:
from sklearn.preprocessing import LabelEncoder

gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])
genre_mappings = {index: label for index, label in enumerate(gle.classes_)}
genre_mappings
Out[4]:
{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}
In [5]:
vg_df['GenreLabel'] = genre_labels
vg_df[['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']].iloc[1:7]
Out[5]:
Name Platform Year Genre GenreLabel
1 Super Mario Bros. NES 1985.0 Platform 4
2 Mario Kart Wii Wii 2008.0 Racing 6
3 Wii Sports Resort Wii 2009.0 Sports 10
4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing 7
5 Tetris GB 1989.0 Puzzle 5
6 New Super Mario Bros. DS 2006.0 Platform 4

Transforming Ordinal Features

In [6]:
poke_df = pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
poke_df = poke_df.sample(random_state=1, frac=1).reset_index(drop=True)

np.unique(poke_df['Generation'])
Out[6]:
array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)
In [7]:
gen_ord_map = {'Gen 1': 1, 'Gen 2': 2, 'Gen 3': 3, 
               'Gen 4': 4, 'Gen 5': 5, 'Gen 6': 6}

poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
poke_df[['Name', 'Generation', 'GenerationLabel']].iloc[4:10]
Out[7]:
Name Generation GenerationLabel
4 Octillery Gen 2 2
5 Helioptile Gen 6 6
6 Dialga Gen 4 4
7 DeoxysDefense Forme Gen 3 3
8 Rapidash Gen 1 1
9 Swanna Gen 5 5

Encoding Categorical Features

One-hot Encoding Scheme

In [8]:
poke_df[['Name', 'Generation', 'Legendary']].iloc[4:10]
Out[8]:
Name Generation Legendary
4 Octillery Gen 2 False
5 Helioptile Gen 6 False
6 Dialga Gen 4 True
7 DeoxysDefense Forme Gen 3 True
8 Rapidash Gen 1 False
9 Swanna Gen 5 False
In [9]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# transform and map pokemon generations
gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(poke_df['Generation'])
poke_df['Gen_Label'] = gen_labels

# transform and map pokemon legendary status
leg_le = LabelEncoder()
leg_labels = leg_le.fit_transform(poke_df['Legendary'])
poke_df['Lgnd_Label'] = leg_labels

poke_df_sub = poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]
poke_df_sub.iloc[4:10]
Out[9]:
Name Generation Gen_Label Legendary Lgnd_Label
4 Octillery Gen 2 1 False 0
5 Helioptile Gen 6 5 False 0
6 Dialga Gen 4 3 True 1
7 DeoxysDefense Forme Gen 3 2 True 1
8 Rapidash Gen 1 0 False 0
9 Swanna Gen 5 4 False 0
In [10]:
# encode generation labels using one-hot encoding scheme
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
gen_feature_labels = list(gen_le.classes_)
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

# encode legendary status labels using one-hot encoding scheme
leg_ohe = OneHotEncoder()
leg_feature_arr = leg_ohe.fit_transform(poke_df[['Lgnd_Label']]).toarray()
leg_feature_labels = ['Legendary_'+str(cls_label) for cls_label in leg_le.classes_]
leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)
In [11]:
poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)
columns = sum([['Name', 'Generation', 'Gen_Label'],gen_feature_labels,
              ['Legendary', 'Lgnd_Label'],leg_feature_labels], [])
poke_df_ohe[columns].iloc[4:10]
Out[11]:
Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True
4 Octillery Gen 2 1 0.0 1.0 0.0 0.0 0.0 0.0 False 0 1.0 0.0
5 Helioptile Gen 6 5 0.0 0.0 0.0 0.0 0.0 1.0 False 0 1.0 0.0
6 Dialga Gen 4 3 0.0 0.0 0.0 1.0 0.0 0.0 True 1 0.0 1.0
7 DeoxysDefense Forme Gen 3 2 0.0 0.0 1.0 0.0 0.0 0.0 True 1 0.0 1.0
8 Rapidash Gen 1 0 1.0 0.0 0.0 0.0 0.0 0.0 False 0 1.0 0.0
9 Swanna Gen 5 4 0.0 0.0 0.0 0.0 1.0 0.0 False 0 1.0 0.0
In [12]:
new_poke_df = pd.DataFrame([['PikaZoom', 'Gen 3', True], 
                           ['CharMyToast', 'Gen 4', False]],
                           columns=['Name', 'Generation', 'Legendary'])
new_poke_df
Out[12]:
Name Generation Legendary
0 PikaZoom Gen 3 True
1 CharMyToast Gen 4 False
In [13]:
new_gen_labels = gen_le.transform(new_poke_df['Generation'])
new_poke_df['Gen_Label'] = new_gen_labels

new_leg_labels = leg_le.transform(new_poke_df['Legendary'])
new_poke_df['Lgnd_Label'] = new_leg_labels

new_poke_df[['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']]
Out[13]:
Name Generation Gen_Label Legendary Lgnd_Label
0 PikaZoom Gen 3 2 True 1
1 CharMyToast Gen 4 3 False 0
In [14]:
new_gen_feature_arr = gen_ohe.transform(new_poke_df[['Gen_Label']]).toarray()
new_gen_features = pd.DataFrame(new_gen_feature_arr, columns=gen_feature_labels)

new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Lgnd_Label']]).toarray()
new_leg_features = pd.DataFrame(new_leg_feature_arr, columns=leg_feature_labels)

new_poke_ohe = pd.concat([new_poke_df, new_gen_features, new_leg_features], axis=1)
columns = sum([['Name', 'Generation', 'Gen_Label'], gen_feature_labels,
               ['Legendary', 'Lgnd_Label'], leg_feature_labels], [])
new_poke_ohe[columns]
Out[14]:
Name Generation Gen_Label Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6 Legendary Lgnd_Label Legendary_False Legendary_True
0 PikaZoom Gen 3 2 0.0 0.0 1.0 0.0 0.0 0.0 True 1 0.0 1.0
1 CharMyToast Gen 4 3 0.0 0.0 0.0 1.0 0.0 0.0 False 0 1.0 0.0
In [15]:
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).iloc[4:10]
Out[15]:
Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6
4 Octillery Gen 2 0 1 0 0 0 0
5 Helioptile Gen 6 0 0 0 0 0 1
6 Dialga Gen 4 0 0 0 1 0 0
7 DeoxysDefense Forme Gen 3 0 0 1 0 0 0
8 Rapidash Gen 1 1 0 0 0 0 0
9 Swanna Gen 5 0 0 0 0 1 0

Dummy Coding Scheme

In [16]:
gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)
pd.concat([poke_df[['Name', 'Generation']], gen_dummy_features], axis=1).iloc[4:10]
Out[16]:
Name Generation Gen 2 Gen 3 Gen 4 Gen 5 Gen 6
4 Octillery Gen 2 1 0 0 0 0
5 Helioptile Gen 6 0 0 0 0 1
6 Dialga Gen 4 0 0 1 0 0
7 DeoxysDefense Forme Gen 3 0 1 0 0 0
8 Rapidash Gen 1 0 0 0 0 0
9 Swanna Gen 5 0 0 0 1 0
In [17]:
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_dummy_features = gen_onehot_features.iloc[:,:-1]
pd.concat([poke_df[['Name', 'Generation']], gen_dummy_features], axis=1).iloc[4:10]
Out[17]:
Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5
4 Octillery Gen 2 0 1 0 0 0
5 Helioptile Gen 6 0 0 0 0 0
6 Dialga Gen 4 0 0 0 1 0
7 DeoxysDefense Forme Gen 3 0 0 1 0 0
8 Rapidash Gen 1 1 0 0 0 0
9 Swanna Gen 5 0 0 0 0 1

Effect Coding Scheme

In [18]:
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_effect_features = gen_onehot_features.iloc[:,:-1]
gen_effect_features.loc[np.all(gen_effect_features == 0, axis=1)] = -1.
pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]
C:\Program Files\Anaconda3\lib\site-packages\pandas\core\indexing.py:517: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
Out[18]:
Name Generation Gen 1 Gen 2 Gen 3 Gen 4 Gen 5
4 Octillery Gen 2 0.0 1.0 0.0 0.0 0.0
5 Helioptile Gen 6 -1.0 -1.0 -1.0 -1.0 -1.0
6 Dialga Gen 4 0.0 0.0 0.0 1.0 0.0
7 DeoxysDefense Forme Gen 3 0.0 0.0 1.0 0.0 0.0
8 Rapidash Gen 1 1.0 0.0 0.0 0.0 0.0
9 Swanna Gen 5 0.0 0.0 0.0 0.0 1.0

Feature Hashing scheme

In [19]:
unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", len(unique_genres))
print(unique_genres)
Total game genres: 12
['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'
 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']
In [20]:
from sklearn.feature_extraction import FeatureHasher

fh = FeatureHasher(n_features=6, input_type='string')
hashed_features = fh.fit_transform(vg_df['Genre'])
hashed_features = hashed_features.toarray()
pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]
Out[20]:
Name Genre 0 1 2 3 4 5
1 Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0
2 Mario Kart Wii Racing -1.0 0.0 0.0 0.0 0.0 -1.0
3 Wii Sports Resort Sports -2.0 2.0 0.0 -2.0 0.0 0.0
4 Pokemon Red/Pokemon Blue Role-Playing -1.0 1.0 2.0 0.0 1.0 -1.0
5 Tetris Puzzle 0.0 1.0 1.0 -2.0 1.0 -1.0
6 New Super Mario Bros. Platform 0.0 2.0 2.0 -1.0 1.0 0.0
In [21]:
fh.get_params()
Out[21]:
{'dtype': numpy.float64,
 'input_type': 'string',
 'n_features': 6,
 'non_negative': False}