# coding: utf-8

# NOTE: this file is a script export of a Jupyter notebook. Bare expressions
# such as `df.head()` or `df.shape` only display something when run as
# notebook cells; as a plain script they are evaluated and discarded.

# In[1]:


# Render matplotlib figures inline (requires an IPython/Jupyter session).
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns


# In[2]:


import numpy as np
import pandas as pd
from operator import xor
from pprint import pprint


# In[3]:


# Build id <-> name lookup tables from the veekun pokedex CSV.
# Only ids up to and including 250 are kept.
pokemons = pd.read_csv(
    'https://raw.githubusercontent.com/veekun/pokedex/master/pokedex/data/csv/pokemon.csv'
).loc[:, ['id', 'identifier']]
id2name = {
    pokemon_id: pokemon_name
    for pokemon_id, pokemon_name in zip(pokemons['id'], pokemons['identifier'])
    if pokemon_id <= 250
}
name2id = {pokemon_name: pokemon_id for pokemon_id, pokemon_name in id2name.items()}


# In[4]:


# Print first 10 id-name pairs
pprint({k: v for k, v in id2name.items() if k <= 10})


# ### (Original) Pokemon DataFrame

# In[5]:


# Dataset source: https://www.kaggle.com/semioniy/predictemall
# The local file 'pokemon.csv' is indexed by its `_id` column.
original_poke = pd.read_csv('pokemon.csv', low_memory=False).set_index('_id')


# In[6]:


original_poke.head()


# ### Pokemon DataFrame

# In[7]:


# Work on a copy so `original_poke` stays untouched by later filtering.
poke = original_poke.copy()


# ### Filter by Country/Continent

# In[8]:


print('No. of samples from Asia:', poke.continent.value_counts()['Asia'])
print('No. of samples from Singapore:', poke.city.value_counts()['Singapore'])


# In[9]:


# We select Pokemon samples from Asia
poke = poke[poke.continent == 'Asia']
poke.shape


# ### Filtered Pokemon DataFrame - Only allow columns starting with `cooc` (i.e.
# the co-occurrence with other pokemons) or `pokemonId`

# In[10]:


# `str.startswith` accepts a tuple of prefixes, so one call covers both cases.
filtered_cols = [col for col in poke.columns if col.startswith(('cooc', 'pokemonId'))]
filtered_poke = poke[filtered_cols]


# In[11]:


filtered_poke.shape


# In[12]:


filtered_poke.head()


# ### Convert Booleans to Integers

# In[13]:


filtered_poke_numeric = filtered_poke.astype(int)
filtered_poke_numeric.head()


# ### Groupby by `pokemonId`

# In[14]:


# One row per `pokemonId`; each remaining column becomes the mean of that
# column over all samples of that pokemon.
corr_poke = filtered_poke_numeric.groupby('pokemonId').mean()
corr_poke.head()


# In[15]:


corr_poke.shape


# In[16]:


# Turn column labels of the form 'cooc_<n>' into the integer <n>.
corr_poke.columns = [int(col.replace('cooc_', '')) for col in corr_poke.columns]
corr_poke.drop([150, 151], axis=1, inplace=True)  # Drop Pokemons 150 and 151 (i.e. Mewtwo and Mew)
corr_poke.head()


# In[17]:


corr_poke.shape


# In[18]:


# Replace numeric ids with names on both axes (raises KeyError for any id
# missing from `id2name`).
corr_poke.index = [id2name[pokemon_id] for pokemon_id in corr_poke.index]
corr_poke.columns = [id2name[pokemon_id] for pokemon_id in corr_poke.columns]
corr_poke.head()


# In[19]:


from sklearn.preprocessing import StandardScaler, normalize

# `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` is the
# supported way to obtain the underlying ndarray.
corr_poke_matrix = corr_poke.to_numpy()
corr_poke_matrix_norm = normalize(corr_poke_matrix, norm='l2')  # Alternatively, try: "norm='l1'"
corr_poke_matrix_std = StandardScaler().fit_transform(corr_poke_matrix)


# In[20]:


# Using Normalize
corr_poke_matrix_norm


# In[21]:


# Using StandardScaler
corr_poke_matrix_std


# # Plot using Seaborn's `heatmap` (on L1/L2 normalize)

# In[22]:


# Re-attach the row/column labels that were lost when converting to ndarray.
X = pd.DataFrame(corr_poke_matrix_norm, index=corr_poke.index, columns=corr_poke.columns)
X.head(3)


# [All about colormaps](http://matplotlib.org/examples/color/colormaps_reference.html)

# In[23]:


plt.figure(figsize=(40, 25))
sns_plot = sns.heatmap(X, annot=False, cmap='afmhot', linewidths=.05, square=True)
# Note that I used "cmap='afmhot'"
# colormaps (cmap): http://matplotlib.org/examples/color/colormaps_reference.html


# # Plot using Seaborn's `heatmap` (on StandardScaler) and save the figure

# In[24]:


# NOTE: `X` is rebound here; from this point on it holds the standardized matrix.
X = pd.DataFrame(corr_poke_matrix_std, index=corr_poke.index, columns=corr_poke.columns)
X.head(3)


# [All about
# colormaps](http://matplotlib.org/examples/color/colormaps_reference.html)

# In[25]:


# Draw the heatmap of `X` with a diverging colormap and write the current
# figure to 'pokemon_output.png' in the working directory.
plt.figure(figsize=(40, 25))
sns_plot = sns.heatmap(X, annot=False, cmap='RdBu_r', linewidths=.05, square=True)
plt.savefig("pokemon_output.png")


# # Link to `pokemon_output.png`
# https://www.dropbox.com/s/0qlyj9xc9v45gne/pokemon_output.png?dl=0

# By: Jovian Lin ([jovianlin.com](http://jovianlin.com))