In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import numpy as np
import pandas as pd
from operator import xor
from pprint import pprint
In [3]:
# Pokedex lookup table (veekun): keep only the numeric id and the text identifier.
pokemons = pd.read_csv(
    'https://raw.githubusercontent.com/veekun/pokedex/master/pokedex/data/csv/pokemon.csv'
)[['id', 'identifier']]
# id -> name, restricted to ids up to and including 250.
id2name = dict(
    (pokemon_id, pokemon_name)
    for pokemon_id, pokemon_name in zip(pokemons['id'], pokemons['identifier'])
    if pokemon_id <= 250
)
# Reverse mapping: name -> id.
name2id = dict(zip(id2name.values(), id2name.keys()))
In [4]:
# Show the id -> name entries whose id is at most 10
first_ten = {pokemon_id: name for pokemon_id, name in id2name.items() if pokemon_id <= 10}
pprint(first_ten)
{1: 'bulbasaur',
 2: 'ivysaur',
 3: 'venusaur',
 4: 'charmander',
 5: 'charmeleon',
 6: 'charizard',
 7: 'squirtle',
 8: 'wartortle',
 9: 'blastoise',
 10: 'caterpie'}

(Original) Pokemon DataFrame

In [5]:
# Dataset source: https://www.kaggle.com/semioniy/predictemall
# Read the local CSV in one pass (low_memory=False) and index the rows by '_id'.
original_poke = (
    pd.read_csv('pokemon.csv', low_memory=False)
    .set_index('_id')
)
In [6]:
# Preview the first rows of the table exactly as loaded from pokemon.csv.
original_poke.head()
Out[6]:
pokemonId latitude longitude appearedLocalTime cellId_90m cellId_180m cellId_370m cellId_730m cellId_1460m cellId_2920m ... cooc_143 cooc_144 cooc_145 cooc_146 cooc_147 cooc_148 cooc_149 cooc_150 cooc_151 class
_id
NTgxMDkzOTk4MTM5MjUwMjIzNw== 16 20.525745 -97.460829 2016-09-08T03:57:45 9645139108510564000 9645139108711890000 9645139108443455000 9645139109517197000 9645139113812165000 9645139130992034000 ... False False False False False False False False False 16
OTQ1NDgzODc1MjM3NDEzMTI2MQ== 133 20.523695 -97.461167 2016-09-08T03:57:37 9645139109852742000 9645139109785633000 9645139110590940000 9645139109517197000 9645139113812165000 9645139130992034000 ... False False False False False False False False False 133
NTQ0OTQ0NDA1Nzg2ODg3OTg2OQ== 16 38.903590 -77.199780 2016-09-08T03:57:25 9923201472785285000 9923201472986612000 9923201473791918000 9923201477013144000 9923201481308110000 9923201498487980000 ... False False False False False False False False False 16
NTU2MTU1NDM4NzA2MDk1MDcxNw== 13 47.665903 -122.312561 2016-09-08T03:56:22 6093392705025474600 6093392705092583400 6093392705897889800 6093392702676664300 6093392715561566200 6093392767101173800 ... False False False False False False False False False 13
MTY2ODg4MTAzMTczMDE0MTUwNTM= 133 47.666454 -122.311628 2016-09-08T03:56:08 6093392707709829100 6093392707776938000 6093392708045373400 6093392711266598900 6093392715561566200 6093392767101173800 ... False False False False False False False False False 133

5 rows × 207 columns

Pokemon DataFrame

In [7]:
# Work on a copy: the following cells re-bind and filter `poke`,
# while `original_poke` stays as loaded.
poke = original_poke.copy()

Filter by Country/Continent

In [8]:
# How many rows carry continent == 'Asia'?
asia_count = poke['continent'].value_counts()['Asia']
print('No. of samples from Asia:', asia_count)
# How many rows carry city == 'Singapore'?
singapore_count = poke['city'].value_counts()['Singapore']
print('No. of samples from Singapore:', singapore_count)
No. of samples from Asia: 15617
No. of samples from Singapore: 1421
In [9]:
# Keep only the samples whose continent is 'Asia'
is_asia = poke['continent'] == 'Asia'
poke = poke[is_asia]
poke.shape
Out[9]:
(15617, 207)

Filtered Pokemon DataFrame - Only allow columns starting with cooc (i.e. the co-occurrence with other pokemons) or pokemonId

In [10]:
# Keep a column when its name begins with either of these prefixes.
keep_prefixes = ('cooc', 'pokemonId')
filtered_cols = [name for name in poke.columns if name.startswith(keep_prefixes)]
filtered_poke = poke[filtered_cols]
In [11]:
# (rows, columns) after keeping only pokemonId and the cooc_* columns.
filtered_poke.shape
Out[11]:
(15617, 152)
In [12]:
# Preview the filtered frame; the cooc_* columns are still booleans at this point.
filtered_poke.head()
Out[12]:
pokemonId cooc_1 cooc_2 cooc_3 cooc_4 cooc_5 cooc_6 cooc_7 cooc_8 cooc_9 ... cooc_142 cooc_143 cooc_144 cooc_145 cooc_146 cooc_147 cooc_148 cooc_149 cooc_150 cooc_151
_id
MTMxODgwNTY0ODM1OTMzNTAzOTc= 79 False False False False False False False False False ... False False False False False False False False False False
NjIxNjQ2OTI3OTI4NDEyNDA2MQ== 19 False False False False False False False False False ... False False False False False False False False False False
MTMwMTg2OTUyMDQ3MTcwNTk0MDU= 127 False False False False False False False False False ... False False False False False False False False False False
MzEwMzUyMTk5ODQyNjQ4NjIyMQ== 109 False False False False False False False False False ... False False False False False False False False False False
MjcwNzMzODYxMTY3NTkzNTQ1Mw== 11 False False False False False False False False False ... False False False False False False False False False False

5 rows × 152 columns

Convert Booleans to Integers

In [13]:
# Cast every column to int so the True/False co-occurrence flags become 1/0.
filtered_poke_numeric = filtered_poke.astype(dtype=int)
filtered_poke_numeric.head()
Out[13]:
pokemonId cooc_1 cooc_2 cooc_3 cooc_4 cooc_5 cooc_6 cooc_7 cooc_8 cooc_9 ... cooc_142 cooc_143 cooc_144 cooc_145 cooc_146 cooc_147 cooc_148 cooc_149 cooc_150 cooc_151
_id
MTMxODgwNTY0ODM1OTMzNTAzOTc= 79 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
NjIxNjQ2OTI3OTI4NDEyNDA2MQ== 19 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
MTMwMTg2OTUyMDQ3MTcwNTk0MDU= 127 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
MzEwMzUyMTk5ODQyNjQ4NjIyMQ== 109 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
MjcwNzMzODYxMTY3NTkzNTQ1Mw== 11 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 152 columns

Group by pokemonId

In [14]:
# Average each cooc_* column within every pokemonId group:
# one row per pokemonId, one column per cooc_* flag.
corr_poke = (
    filtered_poke_numeric
    .groupby(by='pokemonId')
    .mean()
)
corr_poke.head()
Out[14]:
cooc_1 cooc_2 cooc_3 cooc_4 cooc_5 cooc_6 cooc_7 cooc_8 cooc_9 cooc_10 ... cooc_142 cooc_143 cooc_144 cooc_145 cooc_146 cooc_147 cooc_148 cooc_149 cooc_150 cooc_151
pokemonId
1 0.000000 0.000000 0.0 0.005587 0.0 0.0 0.005587 0.005587 0.0 0.039106 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0
2 0.000000 0.000000 0.0 0.090909 0.0 0.0 0.000000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.090909 0.0 0.0 0.0 0.0
4 0.021739 0.021739 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.021739 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0
6 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0
7 0.023810 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.047619 ... 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0

5 rows × 151 columns

In [15]:
# (distinct pokemonId values in the filtered data, number of cooc_* columns)
corr_poke.shape
Out[15]:
(120, 151)
In [16]:
# Turn each 'cooc_<n>' column label into the integer Pokemon id <n>.
# Going through str(col) keeps the cell re-runnable: once the labels are
# already ints, calling col.replace(...) directly raises AttributeError.
corr_poke = corr_poke.rename(columns=lambda col: int(str(col).replace('cooc_', '')))
# Drop Pokemons 150 and 151 (i.e. Mewtwo and Mew).
# A new frame is bound instead of mutating in place, and errors='ignore'
# lets a second run pass after the columns are already gone.
corr_poke = corr_poke.drop([150, 151], axis=1, errors='ignore')
corr_poke.head()
Out[16]:
1 2 3 4 5 6 7 8 9 10 ... 140 141 142 143 144 145 146 147 148 149
pokemonId
1 0.000000 0.000000 0.0 0.005587 0.0 0.0 0.005587 0.005587 0.0 0.039106 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0
2 0.000000 0.000000 0.0 0.090909 0.0 0.0 0.000000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.090909 0.0 0.0
4 0.021739 0.021739 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.021739 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0
6 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0
7 0.023810 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.047619 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0

5 rows × 149 columns

In [17]:
# Shape after relabelling the columns and dropping ids 150 and 151.
corr_poke.shape
Out[17]:
(120, 149)
In [18]:
def _to_name(label):
    """Map a numeric Pokemon id to its name; pass through labels that are already names.

    Leaving strings untouched makes this cell safe to re-run: the plain
    id2name[label] lookup raises KeyError once the labels have been converted.
    Unknown numeric ids still raise KeyError.
    """
    return label if isinstance(label, str) else id2name[label]


# Replace the numeric ids on both axes with Pokemon names.
corr_poke.index = [_to_name(label) for label in corr_poke.index]
corr_poke.columns = [_to_name(label) for label in corr_poke.columns]
corr_poke.head()
Out[18]:
bulbasaur ivysaur venusaur charmander charmeleon charizard squirtle wartortle blastoise caterpie ... kabuto kabutops aerodactyl snorlax articuno zapdos moltres dratini dragonair dragonite
bulbasaur 0.000000 0.000000 0.0 0.005587 0.0 0.0 0.005587 0.005587 0.0 0.039106 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0
ivysaur 0.000000 0.000000 0.0 0.090909 0.0 0.0 0.000000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.090909 0.0 0.0
charmander 0.021739 0.021739 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.021739 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0
charizard 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0
squirtle 0.023810 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.000000 0.0 0.047619 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0

5 rows × 149 columns

In [19]:
from sklearn.preprocessing import StandardScaler, normalize

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# .values yields the same ndarray and exists on both old and new pandas.
corr_poke_values = corr_poke.values

# Scaled with sklearn's normalize using the L2 norm.
corr_poke_matrix_norm = normalize(corr_poke_values, norm='l2')  # Alternatively, try: "norm='l1'"
# Scaled with sklearn's StandardScaler (fit and transform on the same matrix).
corr_poke_matrix_std = StandardScaler().fit_transform(corr_poke_values)
In [20]:
# Using Normalize: the array returned by normalize(..., norm='l2')
corr_poke_matrix_norm
Out[20]:
array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.25      ,
         0.        ,  0.        ],
       [ 0.06537205,  0.06537205,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.04275691,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])
In [21]:
# Using StandardScaler: the array returned by StandardScaler().fit_transform(...)
corr_poke_matrix_std
Out[21]:
array([[-0.18895474, -0.15889737,  0.        , ..., -0.26526348,
         0.        , -0.11953245],
       [-0.18895474, -0.15889737,  0.        , ...,  6.68155228,
         0.        , -0.11953245],
       [ 0.04510047,  7.58108456,  0.        , ..., -0.26526348,
         0.        , -0.11953245],
       ..., 
       [-0.18895474,  7.58108456,  0.        , ..., -0.26526348,
         0.        , -0.11953245],
       [-0.18895474, -0.15889737,  0.        , ..., -0.26526348,
         0.        , -0.11953245],
       [-0.18895474, -0.15889737,  0.        , ..., -0.26526348,
         0.        , -0.11953245]])

Plot using Seaborn's heatmap (on the L1/L2-normalized matrix)

In [22]:
# Wrap the normalized array in a DataFrame that reuses corr_poke's row and column labels.
X = pd.DataFrame(
    data=corr_poke_matrix_norm,
    index=corr_poke.index,
    columns=corr_poke.columns,
)
X.head(n=3)
Out[22]:
bulbasaur ivysaur venusaur charmander charmeleon charizard squirtle wartortle blastoise caterpie ... kabuto kabutops aerodactyl snorlax articuno zapdos moltres dratini dragonair dragonite
bulbasaur 0.000000 0.000000 0.0 0.01827 0.0 0.0 0.01827 0.01827 0.0 0.127887 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.0 0.0
ivysaur 0.000000 0.000000 0.0 0.25000 0.0 0.0 0.00000 0.00000 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.25 0.0 0.0
charmander 0.065372 0.065372 0.0 0.00000 0.0 0.0 0.00000 0.00000 0.0 0.065372 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.0 0.0

3 rows × 149 columns

In [23]:
# colormaps (cmap): http://matplotlib.org/examples/color/colormaps_reference.html
plt.figure(figsize=(40, 25))
sns_plot = sns.heatmap(
    X,
    cmap='afmhot',  # colormap used for this plot
    annot=False,
    square=True,
    linewidths=0.05,
)

Plot using Seaborn's heatmap (on StandardScaler) and save the figure

In [24]:
# Wrap the standardized array in a DataFrame that reuses corr_poke's row and column labels.
X = pd.DataFrame(
    data=corr_poke_matrix_std,
    index=corr_poke.index,
    columns=corr_poke.columns,
)
X.head(n=3)
Out[24]:
bulbasaur ivysaur venusaur charmander charmeleon charizard squirtle wartortle blastoise caterpie ... kabuto kabutops aerodactyl snorlax articuno zapdos moltres dratini dragonair dragonite
bulbasaur -0.188955 -0.158897 0.0 -0.015907 0.0 0.0 0.073788 6.819762 0.0 -0.075668 ... -0.171889 0.0 0.0 -0.122228 0.0 0.0 0.0 -0.265263 0.0 -0.119532
ivysaur -0.188955 -0.158897 0.0 1.828829 0.0 0.0 -0.260031 -0.162927 0.0 -0.432027 ... -0.171889 0.0 0.0 -0.122228 0.0 0.0 0.0 6.681552 0.0 -0.119532
charmander 0.045100 7.581085 0.0 -0.136693 0.0 0.0 -0.260031 -0.162927 0.0 -0.233927 ... -0.171889 0.0 0.0 -0.122228 0.0 0.0 0.0 -0.265263 0.0 -0.119532

3 rows × 149 columns

In [25]:
# Draw the heatmap of X with the 'RdBu_r' colormap
plt.figure(figsize=(40, 25))
sns_plot = sns.heatmap(
    X,
    cmap='RdBu_r',
    annot=False,
    square=True,
    linewidths=0.05,
)

# Write the current figure to disk.
plt.savefig("pokemon_output.png")

By: Jovian Lin (jovianlin.com)