# Imports necesarios
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
# Load the Billboard songs dataset and show its (rows, columns) shape.
csv_path = r"artists_billboard_fix3.csv"
artists_billboard = pd.read_csv(csv_path)
artists_billboard.shape
(635, 11)
# Preview the first rows of the raw dataset.
artists_billboard.head()
id | title | artist | mood | tempo | genre | artist_type | chart_date | durationSeg | top | anioNacimiento | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Small Town Throwdown | BRANTLEY GILBERT featuring JUSTIN MOORE & THOM... | Brooding | Medium Tempo | Traditional | Male | 20140628 | 191.0 | 0 | 1975.0 |
1 | 1 | Bang Bang | JESSIE J, ARIANA GRANDE & NICKI MINAJ | Energizing | Medium Tempo | Pop | Female | 20140816 | 368.0 | 0 | 1989.0 |
2 | 2 | Timber | PITBULL featuring KE$HA | Excited | Medium Tempo | Urban | Mixed | 20140118 | 223.0 | 1 | 1993.0 |
3 | 3 | Sweater Weather | THE NEIGHBOURHOOD | Brooding | Medium Tempo | Alternative & Punk | Male | 20140104 | 206.0 | 0 | 1989.0 |
4 | 4 | Automatic | MIRANDA LAMBERT | Yearning | Medium Tempo | Traditional | Female | 20140301 | 232.0 | 0 | 0.0 |
# Class balance: number of songs for each value of ``top``.
artists_billboard.groupby('top').size()
top 0 494 1 141 dtype: int64
# Count of songs per ``top`` value (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='top', data=artists_billboard, kind="count")
<seaborn.axisgrid.FacetGrid at 0x117c23490>
# Count of songs per artist type (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='artist_type', data=artists_billboard, kind="count")
<seaborn.axisgrid.FacetGrid at 0x117e768d0>
# Count of songs per ``top`` value, split by artist type
# (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='top', data=artists_billboard, hue='artist_type', kind="count")
<seaborn.axisgrid.FacetGrid at 0x117be4e50>
# Count of songs per mood (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='mood', data=artists_billboard, kind="count", aspect=3)
<seaborn.axisgrid.FacetGrid at 0x118e8d050>
# Count of songs per tempo, split by ``top``
# (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='tempo', data=artists_billboard, hue='top', kind="count")
<seaborn.axisgrid.FacetGrid at 0x118e97950>
# Count of songs per genre (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='genre', data=artists_billboard, kind="count", aspect=3)
<seaborn.axisgrid.FacetGrid at 0x1198d4e90>
# Count of songs per mood, split by ``top``
# (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='mood', data=artists_billboard, hue='top', kind="count", aspect=3)
<seaborn.axisgrid.FacetGrid at 0x119849ad0>
# Count of songs per artist birth year (``catplot`` replaces the removed ``factorplot``).
sb.catplot(x='anioNacimiento', data=artists_billboard, kind="count", aspect=3)
<seaborn.axisgrid.FacetGrid at 0x11952c150>
# Count the songs whose artist birth year is recorded as 0 or less.
nacimientosPorAnio = artists_billboard['anioNacimiento']
sin_anio = nacimientosPorAnio <= 0
len(nacimientosPorAnio[sin_anio])
139
Notamos que tenemos 139 registros de canciones de las que desconocemos el año de nacimiento del artista. Deberemos tratar estos datos para poder utilizar el árbol.
# Scatter plot of artist birth year (x) against song duration in seconds (y).
# ``colores`` and ``tamanios`` are indexed by the value of ``top`` (0 or 1).
colores = ['orange', 'blue']
tamanios = [60, 40]

f1 = artists_billboard['anioNacimiento'].values
f2 = artists_billboard['durationSeg'].values

# One colour per song, chosen by its ``top`` flag.
asignar = [colores[top] for top in artists_billboard['top']]

plt.scatter(f1, f2, c=asignar, s=30)
plt.axis([1960, 2005, 0, 600])
plt.show()
No parece haber ningún patrón a la vista: los top y los no-top están bastante mezclados.
# Scatter plot of chart date (x, stored as a YYYYMMDD number) against song
# duration in seconds (y). Both colour and marker size encode ``top``.
f1 = artists_billboard['chart_date'].values
f2 = artists_billboard['durationSeg'].values

asignar = [colores[top] for top in artists_billboard['top']]
asignar2 = [tamanios[top] for top in artists_billboard['top']]

# Pass the per-point size list: the raw 2-element ``tamanios`` does not match
# the number of points being plotted.
plt.scatter(f1, f2, c=asignar, s=asignar2)
plt.axis([20030101, 20160101, 0, 600])
plt.show()
Esto es porque inicialmente tomé información de 2014 y 2015 y había casi 500 no-top contra sólo 11 registros top. Entonces tomé a los artistas que alcanzaron el top entre 2004 y 2013 para sumar casos positivos y equilibrar un poco nuestros datos de entrada. Así y todo, sigue estando desbalanceado.
def edad_fix(anio):
    """Return the birth year, or None when it is unknown.

    A year that is None, zero or negative is treated as unknown, which matches
    the ``<= 0`` test used elsewhere in this notebook to count the songs
    without a birth year.

    Args:
        anio: Birth year as a number (or None).

    Returns:
        ``anio`` unchanged when it is positive (NaN is also passed through),
        otherwise None.
    """
    if anio is None or anio <= 0:
        return None
    return anio
# Replace unknown birth years with missing values, row by row via ``edad_fix``.
anios_corregidos = artists_billboard.apply(
    lambda fila: edad_fix(fila['anioNacimiento']), axis=1)
artists_billboard['anioNacimiento'] = anios_corregidos
def calcula_edad(anio, cuando):
    """Return the artist's age in the year of ``cuando``, or None if unknown.

    Args:
        anio: Birth year. None, zero or a negative number means "unknown".
        cuando: Date whose first four characters (after ``str()``) are the
            year, e.g. ``20140628``.

    Returns:
        ``year(cuando) - anio``, or None when the birth year is unknown.
        A NaN birth year propagates and yields NaN.
    """
    if anio is None or anio <= 0:
        return None
    momento = str(cuando)[:4]
    return int(momento) - anio
# Age of the artist in the year the song charted, computed row by row.
edades = artists_billboard.apply(
    lambda fila: calcula_edad(fila['anioNacimiento'], fila['chart_date']),
    axis=1)
artists_billboard['edad_en_billboard'] = edades
artists_billboard.head()
id | title | artist | mood | tempo | genre | artist_type | chart_date | durationSeg | top | anioNacimiento | edad_en_billboard | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Small Town Throwdown | BRANTLEY GILBERT featuring JUSTIN MOORE & THOM... | Brooding | Medium Tempo | Traditional | Male | 20140628 | 191.0 | 0 | 1975.0 | 39.0 |
1 | 1 | Bang Bang | JESSIE J, ARIANA GRANDE & NICKI MINAJ | Energizing | Medium Tempo | Pop | Female | 20140816 | 368.0 | 0 | 1989.0 | 25.0 |
2 | 2 | Timber | PITBULL featuring KE$HA | Excited | Medium Tempo | Urban | Mixed | 20140118 | 223.0 | 1 | 1993.0 | 21.0 |
3 | 3 | Sweater Weather | THE NEIGHBOURHOOD | Brooding | Medium Tempo | Alternative & Punk | Male | 20140104 | 206.0 | 0 | 1989.0 | 25.0 |
4 | 4 | Automatic | MIRANDA LAMBERT | Yearning | Medium Tempo | Traditional | Female | 20140301 | 232.0 | 0 | NaN | NaN |
# Fill the missing ages with random integers drawn around the mean of the
# known ages: from int(mean - std) up to, but not including, int(mean + std).
edad_col = artists_billboard['edad_en_billboard']
age_avg = edad_col.mean()
age_std = edad_col.std()

# Mask of the rows that had no age; kept so the imputed rows can be told apart
# after the column has been filled.
conValoresNulos = np.isnan(edad_col)
age_null_count = edad_col.isnull().sum()

# Truncate the bounds explicitly instead of handing floats to ``randint``.
age_low = int(age_avg - age_std)
age_high = int(age_avg + age_std)
age_null_random_list = np.random.randint(age_low, age_high, size=age_null_count)

artists_billboard.loc[conValoresNulos, 'edad_en_billboard'] = age_null_random_list
artists_billboard['edad_en_billboard'] = artists_billboard['edad_en_billboard'].astype(int)

print("Edad Promedio: " + str(age_avg))
print("Desvió Std Edad: " + str(age_std))
print("Intervalo para asignar edad aleatoria: " + str(age_low) + " a " + str(age_high))
Edad Promedio: 30.1028225806 Desvió Std Edad: 8.40078832862 Intervalo para asignar edad aleatoria: 21 a 38
# Scatter plot of age (x) against row index (y). Rows whose age was imputed
# are drawn in green; the rest are coloured by their ``top`` flag.
f1 = artists_billboard['edad_en_billboard'].values
f2 = artists_billboard.index

colores = ['orange', 'blue', 'green']
asignar = [
    colores[2] if conValoresNulos[idx] else colores[top]
    for idx, top in zip(artists_billboard.index, artists_billboard['top'])
]

plt.scatter(f1, f2, c=asignar, s=30)
plt.axis([15, 50, 0, 650])
plt.show()
Realizaremos un mapeo de los atributos de entrada para transformarlos en categorías que podamos utilizar en nuestro árbol de decisión.
# Print the distinct values of each categorical input, separated by a marker
# line: mood counts, tempo values, artist types, then genre counts.
separador = "### ### ###"

# Songs per mood, most frequent first.
grouped11 = artists_billboard.groupby('mood').size()
neworder11 = grouped11.sort_values(ascending=False)
print(neworder11)
print(separador)

# Distinct values of the two small categorical columns.
for etiqueta, columna in (("Tempos de Canción: ", 'tempo'),
                          ("Tipos de Artista: ", 'artist_type')):
    print(etiqueta + str(artists_billboard[columna].unique()))
    print(separador)

# Songs per genre, most frequent first.
grouped11 = artists_billboard.groupby('genre').size()
neworder11 = grouped11.sort_values(ascending=False)
print(neworder11)
mood Empowering 88 Cool 78 Energizing 71 Yearning 68 Defiant 64 Excited 62 Sensual 55 Urgent 30 Sophisticated 15 Rowdy 15 Brooding 14 Upbeat 12 Aggressive 11 Melancholy 10 Gritty 9 Romantic 7 Sentimental 7 Fiery 6 Peaceful 4 Easygoing 4 Lively 2 Stirring 2 Other 1 dtype: int64 ### ### ### Tempos de Canción: ['Medium Tempo' 'Slow Tempo' 'Fast Tempo'] ### ### ### Tipos de Artista: ['Male' 'Female' 'Mixed'] ### ### ### genre Urban 215 Pop 188 Traditional 113 Alternative & Punk 60 Electronica 27 Rock 13 Other 12 Soundtrack 5 Jazz 2 dtype: int64
# Encode the categorical inputs as small integers.
# NOTE: a value that is absent from its mapping becomes NaN after ``map`` and
# ``astype(int)`` then fails, so every category present in the data must be
# listed here.

# Mood -> code; moods that share a code are listed together.
mood_codes = {
    'Energizing': 6, 'Empowering': 6,
    'Cool': 5, 'Excited': 5, 'Upbeat': 5, 'Stirring': 5, 'Lively': 5,
    'Yearning': 4, 'Sophisticated': 4, 'Aggressive': 4, 'Fiery': 4,
    'Rowdy': 4, 'Sentimental': 4, 'Melancholy': 4, 'Brooding': 4,
    'Defiant': 3, 'Gritty': 3, 'Urgent': 3,
    'Sensual': 2, 'Romantic': 2,
    'Easygoing': 1, 'Peaceful': 1,
    'Other': 0, '': 0,
}

# Tempo -> code.
tempo_codes = {'Fast Tempo': 0, 'Medium Tempo': 2, 'Slow Tempo': 1, '': 0}

# Genre -> code.
genre_codes = {
    'Urban': 4,
    'Pop': 3,
    'Traditional': 2,
    'Alternative & Punk': 1, 'Electronica': 1, 'Rock': 1,
    'Soundtrack': 0, 'Jazz': 0, 'Other': 0, '': 0,
}

# Artist type -> code.
artist_type_codes = {'Female': 2, 'Male': 3, 'Mixed': 1, '': 0}

artists_billboard['moodEncoded'] = artists_billboard['mood'].map(mood_codes).astype(int)
artists_billboard['tempoEncoded'] = artists_billboard['tempo'].map(tempo_codes).astype(int)
artists_billboard['genreEncoded'] = artists_billboard['genre'].map(genre_codes).astype(int)
artists_billboard['artist_typeEncoded'] = artists_billboard['artist_type'].map(artist_type_codes).astype(int)
# Bucket the two numeric inputs into ordinal codes.
# For edges [e0, e1, ..., ek]: code 0 is value <= e0, code i is
# e(i-1) < value <= e(i), and code k+1 is value > ek.
#   - age when charting -> edadEncoded     (edges 21, 26, 30, 40)
#   - song duration (s) -> durationEncoded (edges 150, 180, ..., 300)
for origen, destino, cortes in (
        ('edad_en_billboard', 'edadEncoded', [21, 26, 30, 40]),
        ('durationSeg', 'durationEncoded', [150, 180, 210, 240, 270, 300])):
    valores = artists_billboard[origen]
    artists_billboard.loc[valores <= cortes[0], destino] = 0
    for codigo, (desde, hasta) in enumerate(zip(cortes, cortes[1:]), 1):
        artists_billboard.loc[(valores > desde) & (valores <= hasta), destino] = codigo
    artists_billboard.loc[valores > cortes[-1], destino] = len(cortes)
# Keep only the target (``top``) and the encoded features: drop identifiers,
# the raw categorical columns and the raw numeric columns they were built from.
drop_elements = [
    'id', 'title', 'artist',
    'mood', 'tempo', 'genre', 'artist_type',
    'chart_date', 'anioNacimiento', 'durationSeg', 'edad_en_billboard',
]
artists_encoded = artists_billboard.drop(drop_elements, axis=1)
artists_encoded.head()
top | moodEncoded | tempoEncoded | genreEncoded | artist_typeEncoded | edadEncoded | durationEncoded | |
---|---|---|---|---|---|---|---|
0 | 0 | 4 | 2 | 2 | 3 | 3.0 | 2.0 |
1 | 0 | 6 | 2 | 3 | 2 | 1.0 | 6.0 |
2 | 1 | 5 | 2 | 4 | 1 | 0.0 | 3.0 |
3 | 0 | 4 | 2 | 1 | 3 | 1.0 | 2.0 |
4 | 0 | 4 | 2 | 2 | 2 | 3.0 | 3.0 |
# Summary statistics of the target and the encoded features.
artists_encoded.describe()
top | moodEncoded | tempoEncoded | genreEncoded | artist_typeEncoded | edadEncoded | durationEncoded | |
---|---|---|---|---|---|---|---|
count | 635.000000 | 635.000000 | 635.000000 | 635.000000 | 635.000000 | 635.000000 | 635.000000 |
mean | 0.222047 | 4.344882 | 1.730709 | 2.755906 | 2.459843 | 2.025197 | 3.179528 |
std | 0.415950 | 1.350003 | 0.603553 | 1.165463 | 0.740583 | 1.145280 | 1.775017 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 3.000000 | 2.000000 | 2.000000 | 2.000000 | 1.000000 | 2.000000 |
50% | 0.000000 | 4.000000 | 2.000000 | 3.000000 | 3.000000 | 2.000000 | 3.000000 |
75% | 0.000000 | 5.500000 | 2.000000 | 4.000000 | 3.000000 | 3.000000 | 4.000000 |
max | 1.000000 | 6.000000 | 2.000000 | 4.000000 | 3.000000 | 4.000000 | 6.000000 |
# Heatmap of the pairwise correlations between the target and encoded features.
colormap = plt.cm.viridis
correlaciones = artists_encoded.astype(float).corr()

plt.figure(figsize=(12, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sb.heatmap(
    correlaciones,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
)
<matplotlib.axes._subplots.AxesSubplot at 0x11a47a4d0>
# Per mood code: mean of ``top``, number of rows, and sum of ``top``.
artists_encoded[['moodEncoded', 'top']].groupby(['moodEncoded'], as_index=False).agg(['mean', 'count', 'sum'])
top | |||
---|---|---|---|
mean | count | sum | |
moodEncoded | |||
0 | 0.000000 | 1 | 0 |
1 | 0.000000 | 8 | 0 |
2 | 0.274194 | 62 | 17 |
3 | 0.145631 | 103 | 15 |
4 | 0.136986 | 146 | 20 |
5 | 0.294872 | 156 | 46 |
6 | 0.270440 | 159 | 43 |
# Per artist-type code: mean of ``top``, number of rows, and sum of ``top``.
artists_encoded[['artist_typeEncoded', 'top']].groupby(['artist_typeEncoded'], as_index=False).agg(['mean', 'count', 'sum'])
top | |||
---|---|---|---|
mean | count | sum | |
artist_typeEncoded | |||
1 | 0.305263 | 95 | 29 |
2 | 0.320261 | 153 | 49 |
3 | 0.162791 | 387 | 63 |
# Per genre code: mean of ``top``, number of rows, and sum of ``top``.
artists_encoded[['genreEncoded', 'top']].groupby(['genreEncoded'], as_index=False).agg(['mean', 'count', 'sum'])
top | |||
---|---|---|---|
mean | count | sum | |
genreEncoded | |||
0 | 0.105263 | 19 | 2 |
1 | 0.070000 | 100 | 7 |
2 | 0.008850 | 113 | 1 |
3 | 0.319149 | 188 | 60 |
4 | 0.330233 | 215 | 71 |
# Per tempo code: mean of ``top``, number of rows, and sum of ``top``.
artists_encoded[['tempoEncoded', 'top']].groupby(['tempoEncoded'], as_index=False).agg(['mean', 'count', 'sum'])
top | |||
---|---|---|---|
mean | count | sum | |
tempoEncoded | |||
0 | 0.226415 | 53 | 12 |
1 | 0.246154 | 65 | 16 |
2 | 0.218569 | 517 | 113 |
# Per duration code: mean of ``top``, number of rows, and sum of ``top``.
artists_encoded[['durationEncoded', 'top']].groupby(['durationEncoded'], as_index=False).agg(['mean', 'count', 'sum'])
top | |||
---|---|---|---|
mean | count | sum | |
durationEncoded | |||
0.0 | 0.295775 | 71 | 21 |
1.0 | 0.333333 | 30 | 10 |
2.0 | 0.212963 | 108 | 23 |
3.0 | 0.202381 | 168 | 34 |
4.0 | 0.232143 | 112 | 26 |
5.0 | 0.145455 | 55 | 8 |
6.0 | 0.208791 | 91 | 19 |
# Per age code: mean of ``top``, number of rows, and sum of ``top``.
artists_encoded[['edadEncoded', 'top']].groupby(['edadEncoded'], as_index=False).agg(['mean', 'count', 'sum'])
top | |||
---|---|---|---|
mean | count | sum | |
edadEncoded | |||
0.0 | 0.257576 | 66 | 17 |
1.0 | 0.300613 | 163 | 49 |
2.0 | 0.260563 | 142 | 37 |
3.0 | 0.165899 | 217 | 36 |
4.0 | 0.042553 | 47 | 2 |
# Choose the tree depth by 10-fold cross-validation: for each candidate depth,
# fit on nine folds, score on the held-out fold and average the accuracies.
cv = KFold(n_splits=10)  # desired number of folds
accuracies = list()

# NOTE: this counts every column of ``artists_encoded``, including the target
# ``top``, so the depths tried run from 1 to the total number of columns.
max_attributes = len(list(artists_encoded))
depth_range = range(1, max_attributes + 1)

for depth in depth_range:
    fold_accuracy = []
    tree_model = tree.DecisionTreeClassifier(criterion='entropy',
                                             min_samples_split=20,
                                             min_samples_leaf=5,
                                             max_depth=depth,
                                             class_weight={1: 3.5})
    for train_fold, valid_fold in cv.split(artists_encoded):
        # ``cv.split`` yields positional indices, so select rows with
        # ``iloc``; ``loc`` is only correct while the index is 0..n-1.
        f_train = artists_encoded.iloc[train_fold]
        f_valid = artists_encoded.iloc[valid_fold]

        model = tree_model.fit(X=f_train.drop(['top'], axis=1),
                               y=f_train["top"])
        # Accuracy on the validation fold.
        valid_acc = model.score(X=f_valid.drop(['top'], axis=1),
                                y=f_valid["top"])
        fold_accuracy.append(valid_acc)

    avg = sum(fold_accuracy) / len(fold_accuracy)
    accuracies.append(avg)

# Show the average accuracy obtained for each depth.
df = pd.DataFrame({"Max Depth": list(depth_range), "Average Accuracy": accuracies})
df = df[["Max Depth", "Average Accuracy"]]
print(df.to_string(index=False))
Max Depth Average Accuracy 1 0.556101 2 0.556126 3 0.564038 4 0.648859 5 0.617386 6 0.614236 7 0.625124
# Training data: the label says whether the song reached the top; the
# features are every other column, passed as a plain array.
y_train = artists_encoded['top']
x_train = artists_encoded.drop(['top'], axis=1).values

# Fit the final decision tree with depth 4 on the whole dataset.
tree_params = dict(
    criterion='entropy',
    min_samples_split=20,
    min_samples_leaf=5,
    max_depth=4,
    class_weight={1: 3.5},
)
decision_tree = tree.DecisionTreeClassifier(**tree_params)
decision_tree.fit(x_train, y_train)
# Export the fitted tree to a Graphviz .dot file.
# The return value of ``export_graphviz`` is deliberately not assigned to
# ``f``: doing so would shadow the open file handle inside the ``with`` block.
with open(r"tree1.dot", 'w') as f:
    tree.export_graphviz(decision_tree,
                         out_file=f,
                         max_depth=7,
                         impurity=True,
                         feature_names=list(artists_encoded.drop(['top'], axis=1)),
                         class_names=['No', 'N1 Billboard'],
                         rounded=True,
                         filled=True)

# Convert the .dot file to PNG so it can be displayed; this runs the external
# ``dot`` command and raises if it exits with an error.
check_call(['dot', '-Tpng', r'tree1.dot', '-o', r'tree1.png'])
PImage("tree1.png")
# Accuracy in percent, measured on the same data the tree was trained on.
train_score = decision_tree.score(x_train, y_train)
acc_decision_tree = round(train_score * 100, 2)
print(acc_decision_tree)
64.88
# Predict for CAMILA CABELLO featuring YOUNG THUG: their song "Havana"
# reached number 1 on the US Billboard in 2017.
x_test = pd.DataFrame(columns=('top','moodEncoded', 'tempoEncoded', 'genreEncoded','artist_typeEncoded','edadEncoded','durationEncoded'))
x_test.loc[0] = (1,5,2,4,1,0,3)

# The tree was fitted on a bare numeric array, so predict from one too
# (same column order, minus the label). Built once and reused below.
x_features = x_test.drop(['top'], axis=1).values.astype(float)

y_pred = decision_tree.predict(x_features)
print("Prediccion: " + str(y_pred))
y_proba = decision_tree.predict_proba(x_features)
# Probability the model assigns to the class it predicted.
print("Probabilidad de Acierto: " + str(round(y_proba[0][y_pred][0]* 100, 2))+"%")
Prediccion: [1] Probabilidad de Acierto: 83.73%
# Predict for Imagine Dragons: their song "Believer" reached position 42
# on the US Billboard in 2017.
x_test = pd.DataFrame(columns=('top','moodEncoded', 'tempoEncoded', 'genreEncoded','artist_typeEncoded','edadEncoded','durationEncoded'))
x_test.loc[0] = (0,4,2,1,3,2,3)

# The tree was fitted on a bare numeric array, so predict from one too
# (same column order, minus the label). Built once and reused below.
x_features = x_test.drop(['top'], axis=1).values.astype(float)

y_pred = decision_tree.predict(x_features)
print("Prediccion: " + str(y_pred))
y_proba = decision_tree.predict_proba(x_features)
# Probability the model assigns to the class it predicted.
print("Probabilidad de Acierto: " + str(round(y_proba[0][y_pred][0]* 100, 2))+"%")
Prediccion: [0] Probabilidad de Acierto: 88.89%