#!/usr/bin/env python
# coding: utf-8

# # Emotions Dataset
# 
# This notebook walks you through a slightly more detailed example showing how a Planetoid can be used to visualise the results of an ML pipeline.
# 
# We are using the [emotions dataset from OpenML](https://www.openml.org/d/41545), a collection of audio features derived from songs that have been labelled according to their _mood_.

# In[1]:


import sys
sys.path.append("..")


# In[2]:


# temp, will probably have to modify the demo to use an installed version from PyPI
from planetoids import planetoids as pt
import pandas as pd
import numpy as np


# ## Pull the data from OpenML
# 
# The targets are stored one-hot encoded across the last six columns, so we recover a single label per row with `idxmax`.

# In[3]:


from sklearn.datasets import fetch_openml

data = fetch_openml(name='emotions', version=5, target_column=None)
df = pd.DataFrame(data.data, columns=data.feature_names)
# reverse the current one-hot nature of the targets
target = df[data.feature_names[-6:]].idxmax(axis=1).values
# remove the target columns from the data
df = df.iloc[:, :-6]
# get an int representation of the target labels
label_names, target_int = np.unique(target, return_inverse=True)


# In[4]:


df.shape


# ## Preprocessing
# 
# Use cross-validated recursive feature elimination (RFECV) to identify the most important features, scored by a forest of extremely randomised trees with `max_depth=3`.

# In[5]:


from sklearn.feature_selection import RFECV
from sklearn.ensemble import ExtraTreesClassifier

estimator = ExtraTreesClassifier(n_estimators=50, max_depth=3, random_state=42)
selector = RFECV(estimator, step=1, cv=5, verbose=0)
selector = selector.fit(df.values, target)
feature_reduction = round((len(df.columns) - selector.n_features_) / len(df.columns), 2)
print('Feature Reduction: {}%'.format(100 * feature_reduction))


# In[6]:


# print all the features with rank 1, i.e. the selected features
df.columns[selector.ranking_ == 1]


# Plot the 10 most important features.

# In[7]:


# selector.estimator_ was refit on the selected features only, so align the
# importances with the selected column names rather than with all columns
importances = selector.estimator_.feature_importances_
(pd.DataFrame(list(zip(df.columns[selector.support_], importances)),
              columns=['feature', 'importance'])
   .set_index('feature')
   .sort_values('importance')
   .tail(10)
   .plot(kind='barh'))


# Using the rank 1 features, reduce the dataset down to 2 dimensions.

# In[8]:


from umap import UMAP

# see demo notebook 2 to learn more about UMAP
embedding = UMAP(n_components=2,
                 metric='cosine',
                 n_neighbors=250,  # focus on more global structure
                 angular_rp_forest=True,
                 target_weight=0.7,
                 spread=2.2,
                 min_dist=1.6,
                 random_state=42).fit_transform(df[df.columns[selector.ranking_ == 1]].values, target_int)

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

reduced = pd.DataFrame(embedding, columns=['Component1', 'Component2'])
reduced['Emotion'] = target

x = reduced['Component2']
y = reduced['Component1']

# get the unique emotion labels
uniq = list(set(target))

# set up a colour map to match the number of classes
cmap = plt.get_cmap('tab10')
cNorm = colors.Normalize(vmin=0, vmax=len(uniq))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap)

# plot each class
for i in range(len(uniq)):
    indx = reduced['Emotion'] == uniq[i]
    plt.scatter(x[indx], y[indx], s=15, color=scalarMap.to_rgba(i), label=uniq[i])

plt.legend()
plt.tight_layout()
plt.show()


# As you can see in the scatter plot above, this process has produced some nice clusters we can use to seed a new Planetoid.
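
# Before terraforming, as a quick sanity check (an addition to the original demo, not part of the pipeline), we can put a number on that separation with scikit-learn's `silhouette_score`, which ranges from -1 to 1; higher values indicate tighter, better-separated clusters.

# In[ ]:


# Optional sanity check, not in the original demo: quantify cluster
# separation in the 2D embedding with a silhouette score.
from sklearn.metrics import silhouette_score

score = silhouette_score(reduced[['Component1', 'Component2']].values, target)
print('Silhouette score of the 2D embedding: {:.3f}'.format(score))
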
# In[9]:


# seed a Planetoid with the 2D embedding and the emotion labels, then
# terraform it into a world map using the Kavrayskiy VII projection
emotions = pt.Planetoid(reduced, 'Component1', 'Component2', 'Emotion')
emotions.fit_terraform(projection="kavrayskiy7", planet_name='Emotions Demo')
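
# Since planetoids renders via Plotly and "kavrayskiy7" is itself a Plotly geo projection name, a reasonable assumption (check the planetoids documentation for the supported values) is that other Plotly projections such as "orthographic" work too; the sketch below re-terraforms the same planetoid as a globe.

# In[ ]:


# Hypothetical variation, assuming `projection` accepts other Plotly geo
# projection names; consult the planetoids docs to confirm.
emotions.fit_terraform(projection="orthographic", planet_name='Emotions Demo')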