import pandas as pd
import numpy as np
# Load the video-game sales dataset and preview rows 1-6 of its
# descriptive (mostly categorical) columns.
vg_df = pd.read_csv('datasets/vgsales.csv', encoding='utf-8')
vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'Publisher']]
| | Name | Platform | Year | Genre | Publisher |
---|---|---|---|---|---|
1 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo |
2 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo |
3 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo |
4 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo |
5 | Tetris | GB | 1989.0 | Puzzle | Nintendo |
6 | New Super Mario Bros. | DS | 2006.0 | Platform | Nintendo |
# Distinct genre names present in the dataset (np.unique returns them sorted).
genres = np.unique(vg_df['Genre'].values)
genres
array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle', 'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports', 'Strategy'], dtype=object)
from sklearn.preprocessing import LabelEncoder
# Fit a label encoder on the genre column and encode every row as an integer.
gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])

# Position i in classes_ is the genre encoded as i, so enumerating the
# classes gives the integer -> genre lookup table.
genre_mappings = dict(enumerate(gle.classes_))
genre_mappings
{0: 'Action', 1: 'Adventure', 2: 'Fighting', 3: 'Misc', 4: 'Platform', 5: 'Puzzle', 6: 'Racing', 7: 'Role-Playing', 8: 'Shooter', 9: 'Simulation', 10: 'Sports', 11: 'Strategy'}
# Attach the encoded genre ids to the frame and show them next to the
# original genre strings for rows 1-6.
vg_df['GenreLabel'] = genre_labels
vg_df.iloc[1:7][['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]
| | Name | Platform | Year | Genre | GenreLabel |
---|---|---|---|---|---|
1 | Super Mario Bros. | NES | 1985.0 | Platform | 4 |
2 | Mario Kart Wii | Wii | 2008.0 | Racing | 6 |
3 | Wii Sports Resort | Wii | 2009.0 | Sports | 10 |
4 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | 7 |
5 | Tetris | GB | 1989.0 | Puzzle | 5 |
6 | New Super Mario Bros. | DS | 2006.0 | Platform | 4 |
# Load the Pokemon dataset, shuffle all rows reproducibly (fixed seed), and
# renumber the index so positional slices below are stable.
poke_df = (
    pd.read_csv('datasets/Pokemon.csv', encoding='utf-8')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Distinct generation values present in the data.
np.unique(poke_df['Generation'])
array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)
# Hand-written ordinal mapping: generation name -> its numeric rank.
gen_ord_map = {
    'Gen 1': 1,
    'Gen 2': 2,
    'Gen 3': 3,
    'Gen 4': 4,
    'Gen 5': 5,
    'Gen 6': 6,
}

# Translate every generation string through the mapping and preview rows 4-9.
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
poke_df.iloc[4:10][['Name', 'Generation', 'GenerationLabel']]
| | Name | Generation | GenerationLabel |
---|---|---|---|
4 | Octillery | Gen 2 | 2 |
5 | Helioptile | Gen 6 | 6 |
6 | Dialga | Gen 4 | 4 |
7 | DeoxysDefense Forme | Gen 3 | 3 |
8 | Rapidash | Gen 1 | 1 |
9 | Swanna | Gen 5 | 5 |
# Preview the two categorical columns that are encoded next (rows 4-9).
poke_df.iloc[4:10][['Name', 'Generation', 'Legendary']]
| | Name | Generation | Legendary |
---|---|---|---|
4 | Octillery | Gen 2 | False |
5 | Helioptile | Gen 6 | False |
6 | Dialga | Gen 4 | True |
7 | DeoxysDefense Forme | Gen 3 | True |
8 | Rapidash | Gen 1 | False |
9 | Swanna | Gen 5 | False |
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# Integer-encode the two categorical columns, keeping one fitted encoder per
# column so the same mappings can be applied to other rows afterwards.
gen_le = LabelEncoder()
leg_le = LabelEncoder()

# pokemon generations -> integer ids
gen_labels = gen_le.fit_transform(poke_df['Generation'])
poke_df['Gen_Label'] = gen_labels

# pokemon legendary status -> integer ids
leg_labels = leg_le.fit_transform(poke_df['Legendary'])
poke_df['Lgnd_Label'] = leg_labels

# Side-by-side view of the raw values and their encoded labels (rows 4-9).
poke_df_sub = poke_df[
    ['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']
]
poke_df_sub.iloc[4:10]
| | Name | Generation | Gen_Label | Legendary | Lgnd_Label |
---|---|---|---|---|---|
4 | Octillery | Gen 2 | 1 | False | 0 |
5 | Helioptile | Gen 6 | 5 | False | 0 |
6 | Dialga | Gen 4 | 3 | True | 1 |
7 | DeoxysDefense Forme | Gen 3 | 2 | True | 1 |
8 | Rapidash | Gen 1 | 0 | False | 0 |
9 | Swanna | Gen 5 | 4 | False | 0 |
# One-hot encode the integer generation labels. The indicator columns are
# named after the label encoder's classes, i.e. the generation strings.
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke_df[['Gen_Label']]).toarray()
gen_feature_labels = list(gen_le.classes_)
gen_features = pd.DataFrame(data=gen_feature_arr, columns=gen_feature_labels)

# One-hot encode the integer legendary labels; columns are named
# 'Legendary_<class>' for each class the label encoder saw.
leg_ohe = OneHotEncoder()
leg_feature_arr = leg_ohe.fit_transform(poke_df[['Lgnd_Label']]).toarray()
leg_feature_labels = ['Legendary_' + str(cls) for cls in leg_le.classes_]
leg_features = pd.DataFrame(data=leg_feature_arr, columns=leg_feature_labels)

# Glue the indicator columns onto the label sub-frame, then order the columns
# so each source column sits directly before its one-hot expansion.
poke_df_ohe = pd.concat([poke_df_sub, gen_features, leg_features], axis=1)
columns = (
    ['Name', 'Generation', 'Gen_Label'] + gen_feature_labels
    + ['Legendary', 'Lgnd_Label'] + leg_feature_labels
)
poke_df_ohe[columns].iloc[4:10]
| | Name | Generation | Gen_Label | Gen 1 | Gen 2 | Gen 3 | Gen 4 | Gen 5 | Gen 6 | Legendary | Lgnd_Label | Legendary_False | Legendary_True |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | Octillery | Gen 2 | 1 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | False | 0 | 1.0 | 0.0 |
5 | Helioptile | Gen 6 | 5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | False | 0 | 1.0 | 0.0 |
6 | Dialga | Gen 4 | 3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | True | 1 | 0.0 | 1.0 |
7 | DeoxysDefense Forme | Gen 3 | 2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | True | 1 | 0.0 | 1.0 |
8 | Rapidash | Gen 1 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | False | 0 | 1.0 | 0.0 |
9 | Swanna | Gen 5 | 4 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | False | 0 | 1.0 | 0.0 |
# Two made-up Pokemon used to show how fitted encoders are applied to rows
# that were not part of the original data.
new_poke_df = pd.DataFrame(
    {
        'Name': ['PikaZoom', 'CharMyToast'],
        'Generation': ['Gen 3', 'Gen 4'],
        'Legendary': [True, False],
    },
    columns=['Name', 'Generation', 'Legendary'],
)
new_poke_df
| | Name | Generation | Legendary |
---|---|---|---|
0 | PikaZoom | Gen 3 | True |
1 | CharMyToast | Gen 4 | False |
# Apply the already-fitted label encoders (transform only, no re-fitting) so
# the new rows receive the same integer ids as the original data.
new_gen_labels = gen_le.transform(new_poke_df['Generation'])
new_leg_labels = leg_le.transform(new_poke_df['Legendary'])

new_poke_df['Gen_Label'] = new_gen_labels
new_poke_df['Lgnd_Label'] = new_leg_labels

new_poke_df[
    ['Name', 'Generation', 'Gen_Label', 'Legendary', 'Lgnd_Label']
]
| | Name | Generation | Gen_Label | Legendary | Lgnd_Label |
---|---|---|---|---|---|
0 | PikaZoom | Gen 3 | 2 | True | 1 |
1 | CharMyToast | Gen 4 | 3 | False | 0 |
# One-hot encode the new rows with the previously fitted encoders, reusing
# the indicator column names built for the original data.
new_gen_feature_arr = gen_ohe.transform(new_poke_df[['Gen_Label']]).toarray()
new_gen_features = pd.DataFrame(
    data=new_gen_feature_arr, columns=gen_feature_labels
)

new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Lgnd_Label']]).toarray()
new_leg_features = pd.DataFrame(
    data=new_leg_feature_arr, columns=leg_feature_labels
)

# Combine and order the columns so each source column precedes its expansion.
new_poke_ohe = pd.concat(
    [new_poke_df, new_gen_features, new_leg_features], axis=1
)
columns = (
    ['Name', 'Generation', 'Gen_Label'] + gen_feature_labels
    + ['Legendary', 'Lgnd_Label'] + leg_feature_labels
)
new_poke_ohe[columns]
| | Name | Generation | Gen_Label | Gen 1 | Gen 2 | Gen 3 | Gen 4 | Gen 5 | Gen 6 | Legendary | Lgnd_Label | Legendary_False | Legendary_True |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | PikaZoom | Gen 3 | 2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | True | 1 | 0.0 | 1.0 |
1 | CharMyToast | Gen 4 | 3 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | False | 0 | 1.0 | 0.0 |
# One-hot encode the generation strings directly with pandas: one indicator
# column per distinct generation value.
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
(
    pd.concat(
        [poke_df[['Name', 'Generation']], gen_onehot_features],
        axis=1,
    )
    .iloc[4:10]
)
| | Name | Generation | Gen 1 | Gen 2 | Gen 3 | Gen 4 | Gen 5 | Gen 6 |
---|---|---|---|---|---|---|---|---|
4 | Octillery | Gen 2 | 0 | 1 | 0 | 0 | 0 | 0 |
5 | Helioptile | Gen 6 | 0 | 0 | 0 | 0 | 0 | 1 |
6 | Dialga | Gen 4 | 0 | 0 | 0 | 1 | 0 | 0 |
7 | DeoxysDefense Forme | Gen 3 | 0 | 0 | 1 | 0 | 0 | 0 |
8 | Rapidash | Gen 1 | 1 | 0 | 0 | 0 | 0 | 0 |
9 | Swanna | Gen 5 | 0 | 0 | 0 | 0 | 1 | 0 |
# Dummy coding: drop the first level's indicator column, so that level is
# represented by a row of all zeros.
gen_dummy_features = pd.get_dummies(poke_df['Generation'], drop_first=True)
(
    pd.concat(
        [poke_df[['Name', 'Generation']], gen_dummy_features],
        axis=1,
    )
    .iloc[4:10]
)
| | Name | Generation | Gen 2 | Gen 3 | Gen 4 | Gen 5 | Gen 6 |
---|---|---|---|---|---|---|---|
4 | Octillery | Gen 2 | 1 | 0 | 0 | 0 | 0 |
5 | Helioptile | Gen 6 | 0 | 0 | 0 | 0 | 1 |
6 | Dialga | Gen 4 | 0 | 0 | 1 | 0 | 0 |
7 | DeoxysDefense Forme | Gen 3 | 0 | 1 | 0 | 0 | 0 |
8 | Rapidash | Gen 1 | 0 | 0 | 0 | 0 | 0 |
9 | Swanna | Gen 5 | 0 | 0 | 0 | 1 | 0 |
# Dummy coding again, this time dropping the last level's indicator column
# instead of the first, so the last level becomes the all-zeros row.
gen_onehot_features = pd.get_dummies(poke_df['Generation'])
gen_dummy_features = gen_onehot_features[gen_onehot_features.columns[:-1]]
(
    pd.concat(
        [poke_df[['Name', 'Generation']], gen_dummy_features],
        axis=1,
    )
    .iloc[4:10]
)
| | Name | Generation | Gen 1 | Gen 2 | Gen 3 | Gen 4 | Gen 5 |
---|---|---|---|---|---|---|---|
4 | Octillery | Gen 2 | 0 | 1 | 0 | 0 | 0 |
5 | Helioptile | Gen 6 | 0 | 0 | 0 | 0 | 0 |
6 | Dialga | Gen 4 | 0 | 0 | 0 | 1 | 0 |
7 | DeoxysDefense Forme | Gen 3 | 0 | 0 | 1 | 0 | 0 |
8 | Rapidash | Gen 1 | 1 | 0 | 0 | 0 | 0 |
9 | Swanna | Gen 5 | 0 | 0 | 0 | 0 | 1 |
# Effect coding: like dummy coding with the last level dropped, except that
# rows belonging to the dropped level are marked with -1 in every remaining
# column instead of 0.
gen_onehot_features = pd.get_dummies(poke_df['Generation'])

# Work on an independent float copy of the kept columns. Assigning into a
# bare .iloc slice of gen_onehot_features triggers pandas'
# SettingWithCopyWarning, and the indicator columns are integer/bool typed,
# so they cannot hold -1.0 without a dtype change. astype() returns a new
# frame, which addresses both points.
gen_effect_features = gen_onehot_features.iloc[:, :-1].astype(float)

# Rows that are zero in every kept column are exactly the dropped level.
dropped_level_rows = (gen_effect_features == 0).all(axis=1)
gen_effect_features.loc[dropped_level_rows] = -1.

pd.concat([poke_df[['Name', 'Generation']], gen_effect_features], axis=1).iloc[4:10]
C:\Program Files\Anaconda3\lib\site-packages\pandas\core\indexing.py:517: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self.obj[item] = s
| | Name | Generation | Gen 1 | Gen 2 | Gen 3 | Gen 4 | Gen 5 |
---|---|---|---|---|---|---|---|
4 | Octillery | Gen 2 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
5 | Helioptile | Gen 6 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 |
6 | Dialga | Gen 4 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
7 | DeoxysDefense Forme | Gen 3 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
8 | Rapidash | Gen 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
9 | Swanna | Gen 5 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
# Count and list the distinct genres; np.unique flattens the single-column
# frame into a sorted 1-D array of genre names.
unique_genres = np.unique(vg_df[['Genre']])
print("Total game genres:", unique_genres.size)
print(unique_genres)
Total game genres: 12
['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing' 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']
from sklearn.feature_extraction import FeatureHasher
# Feature hashing: project the genre names into a fixed 6-column space
# without building an explicit vocabulary.
fh = FeatureHasher(n_features=6, input_type='string')

# With input_type='string' every sample must be an iterable of string
# tokens. A bare str is itself an iterable of characters, so passing the
# Series directly hashes individual letters rather than genres (and recent
# scikit-learn releases reject bare-string samples outright). Wrapping each
# genre in a one-element list hashes the whole genre name as a single token,
# giving each row exactly one non-zero entry.
# NOTE(review): any previously captured output for this cell came from the
# per-character behaviour and will differ after re-running.
genre_tokens = [[genre] for genre in vg_df['Genre']]
hashed_features = fh.fit_transform(genre_tokens)
hashed_features = hashed_features.toarray()

pd.concat([vg_df[['Name', 'Genre']], pd.DataFrame(hashed_features)], axis=1).iloc[1:7]
| | Name | Genre | 0 | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|---|---|---|
1 | Super Mario Bros. | Platform | 0.0 | 2.0 | 2.0 | -1.0 | 1.0 | 0.0 |
2 | Mario Kart Wii | Racing | -1.0 | 0.0 | 0.0 | 0.0 | 0.0 | -1.0 |
3 | Wii Sports Resort | Sports | -2.0 | 2.0 | 0.0 | -2.0 | 0.0 | 0.0 |
4 | Pokemon Red/Pokemon Blue | Role-Playing | -1.0 | 1.0 | 2.0 | 0.0 | 1.0 | -1.0 |
5 | Tetris | Puzzle | 0.0 | 1.0 | 1.0 | -2.0 | 1.0 | -1.0 |
6 | New Super Mario Bros. | Platform | 0.0 | 2.0 | 2.0 | -1.0 | 1.0 | 0.0 |
# Show the hasher's configuration as a dict of constructor parameters.
fh.get_params(deep=True)
{'dtype': numpy.float64, 'input_type': 'string', 'n_features': 6, 'non_negative': False}