import pandas as pd

DATA_PATH = "/home/saasbook/datascience-sp14/hw2"  # Make this the /path/to/the/data

def parse_artists_tags(filename):
    df = pd.read_csv(filename, sep="|", names=["ArtistID", "ArtistName", "Tag", "Count"])
    return df

def parse_user_artists_matrix(filename):
    df = pd.read_csv(filename)
    return df

artists_tags = parse_artists_tags(DATA_PATH + "/artists-tags.txt")
user_art_mat = parse_user_artists_matrix(DATA_PATH + "/userart-mat-training.csv")

print "Number of tags %d" % 0  # Change this line. Should be 952803
print "Number of artists %d" % 0  # Change this line. Should be 17119

# TODO: Implement this. You can change the function arguments if necessary.
# Return a data structure that contains (artist id, artist name, top tag) for every artist.
def calculate_top_tag(all_tags):
    pass

top_tags = calculate_top_tag(artists_tags)

# Print the top tag for Nirvana.
# Artist ID for Nirvana is 5b11f4ce-a62d-471e-81fc-a69a8278c7da
# Should be 'Grunge'
print "Top tag for Nirvana is %s" % "TODO: Tag goes here"  # Complete this line

def create_user_matrix(input_data):
    pass

user_np_matrix = create_user_matrix(user_art_mat)
print user_np_matrix.shape  # Should be (17119, 846)

%%time
from sklearn.cluster import KMeans
# Run K-means using 5 cluster centers on user_np_matrix.
kmeans_5 = None

%%time
kmeans_25 = None

%%time
kmeans_50 = None

print "Inertia for KMeans with 5 clusters = %lf" % 0.0
print "Inertia for KMeans with 25 clusters = %lf" % 0.0
print "Inertia for KMeans with 50 clusters = %lf" % 0.0

from sklearn.metrics import silhouette_score

# NOTE: Use 500 sample points to calculate the silhouette score.
def get_silhouette_score(data, model):
    pass

print "Silhouette Score for KMeans with 5 clusters = %lf" % 0.0
print "Silhouette Score for KMeans with 25 clusters = %lf" % 0.0
print "Silhouette Score for KMeans with 50 clusters = %lf" % 0.0

# Return a data structure that contains (artist_id, artist_name, top tag, cluster_label) for every artist.
def join_tags_labels(artists_data, user_data, kmeans_model):
    pass

# Run the function for all the models.
kmeans_5_joined = None
kmeans_25_joined = None
kmeans_50_joined = None

# Return a data structure that contains (cluster_id, list of top 5 tags) for every cluster.
def assign_cluster_tags(joined_data):
    pass

kmeans_5_genres = None
kmeans_25_genres = None
kmeans_50_genres = None

def get_cluster_purity(joined_data):
    pass

print "Purity for KMeans with 5 centers %lf" % 0.0
print "Purity for KMeans with 25 centers %lf" % 0.0
print "Purity for KMeans with 50 centers %lf" % 0.0

def get_accuracy(joined_data):
    pass

print "Accuracy of KMeans with 5 centers %lf" % 0.0
print "Accuracy of KMeans with 25 centers %lf" % 0.0
print "Accuracy of KMeans with 50 centers %lf" % 0.0

user_art_mat_test = parse_user_artists_matrix(DATA_PATH + "/userart-mat-test.csv")
# NOTE: astype(float) converts integers to floats here.
user_np_matrix_test = create_user_matrix(user_art_mat_test).astype(float)
user_np_matrix_test.shape  # Should be (1902, 846)

# For every artist, return a list of labels.
def predict_cluster(test_data, test_np_matrix, kmeans_model):
    pass

# Call the function for every model from before.
kmeans_5_predicted = None
kmeans_25_predicted = None
kmeans_50_predicted = None

# Calculate recall for our predictions.
def verify_predictions(predicted_artist_labels, cluster_genres, top_tag_data):
    pass

# Use verify_predictions for every model.
print "Recall of KMeans with 5 centers %lf" % 0.0
print "Recall of KMeans with 25 centers %lf" % 0.0
print "Recall of KMeans with 50 centers %lf" % 0.0
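# --- A minimal sketch of how a few of the stubs above could be filled in. ---
# This is one possible approach under stated assumptions, not the assignment's
# required solution: it assumes artists_tags has the columns parsed above, and
# that user_art_mat's first column is the artist key with the remaining columns
# holding per-user play counts. The *_sketch names are illustrative only.

def calculate_top_tag_sketch(all_tags):
    # For each artist, keep the row whose tag has the highest Count.
    idx = all_tags.groupby("ArtistID")["Count"].idxmax()
    return all_tags.loc[idx, ["ArtistID", "ArtistName", "Tag"]]

def create_user_matrix_sketch(input_data):
    # Drop the (assumed) artist-key column and hand the rest to numpy.
    return input_data.drop(input_data.columns[0], axis=1).values

def get_silhouette_score_sketch(data, model):
    # silhouette_score can subsample internally via sample_size.
    return silhouette_score(data, model.labels_, sample_size=500)

# Fitting a model would then look like KMeans(n_clusters=5).fit(user_np_matrix);
# the fitted object exposes .inertia_, .labels_, and .predict().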
from sklearn.decomposition import RandomizedPCA
import numpy as np

sample_percent = 0.20
rows_to_sample = int(np.ceil(sample_percent * user_np_matrix.shape[0]))
sampled_data = user_np_matrix[np.random.choice(user_np_matrix.shape[0], rows_to_sample, replace=False), :]

# Return the data reduced to 2 principal components.
def get_reduced_data(input_data):
    pass

user_np_2d = get_reduced_data(sampled_data)

# TODO: Write code to fit and plot reduced_data.
import pylab as pl
%pylab inline

import pandas as pd
train = pd.read_csv(DATA_PATH + "/train_model_data.csv")
validation = pd.read_csv(DATA_PATH + "/validation_model_data.csv")

# TODO: Your commands for data exploration here.

# TODO: Your commands to generate a scatter plot here.

def basic_prep(data, col):
    # TODO - Make a copy of the original dataset with the categorical variables
    # removed. "cluster" should be treated as a categorical variable and removed
    # as well; make use of pandas' ".drop" function.
    # TODO - Impute missing values with the means of those columns; use pandas'
    # ".fillna" function to accomplish this.
    pass

# This will create two new data frames: one that contains training data (in this
# case, all the numeric columns) and one that contains response data (in this
# case, the "plays" column).
train_basic_features, train_basic_response = basic_prep(train, 'plays')
validation_basic_features, validation_basic_response = basic_prep(validation, 'plays')

from sklearn import linear_model

def fit_model(X, y):
    # TODO - Write a function that fits a linear model to a dataset given a
    # column of values to predict.
    pass

def score_model(model, X, y, Xv, yv):
    # TODO - Write a function that returns the scores of a model given its
    # training features and response and its validation features and response.
    # The output should be a tuple of two model scores.
    pass

def fit_model_and_score(data, response, validation, val_response):
    # TODO - Given a training dataset, a validation dataset, and the name of a
    # column to predict, return the model's score on the training data *and*
    # the validation data, via the model's ".score()" method, as a tuple of
    # two doubles.
    pass
# END TODO

print fit_model_and_score(train_basic_features, train_basic_response, validation_basic_features, validation_basic_response)

model = fit_model(train_basic_features, train_basic_response)

def residuals(features, y, model):
    # TODO - Write a function that calculates model residuals given input
    # features, ground truth, and the model.
    pass

# TODO - Plot the histogram of the residuals of your current model.

from sklearn import linear_model
# TODO - Using what you built above, build a model using the log of the number
# of plays as the response variable.

# TODO - Plot the residuals of your log model. Note: we want to see these on a
# "plays" scale, not a "log(plays)" scale!

from sklearn import feature_extraction

def one_hot_dataframe(data, cols, vec=None):
    """
    Takes a dataframe and a list of columns that need to be encoded.
    Returns a tuple comprising the data and the fitted vectorizer.
    Based on https://gist.github.com/kljensen/5452382
    """
    if vec is None:
        vec = feature_extraction.DictVectorizer()
        vec.fit(data[cols].to_dict(outtype='records'))
    vecData = pd.DataFrame(vec.transform(data[cols].to_dict(outtype='records')).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    data = data.drop(cols, axis=1)
    data = data.join(vecData)
    return (data, vec)
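# --- A minimal sketch of the helpers stubbed out above. ---
# One plausible completion assuming the standard sklearn APIs of the time
# (RandomizedPCA, LinearRegression); not the assignment's required solution.
# The *_sketch names are illustrative only.

def get_reduced_data_sketch(input_data):
    # Project the sampled rows onto the first two principal components.
    return RandomizedPCA(n_components=2).fit_transform(input_data)

def fit_model_sketch(X, y):
    model = linear_model.LinearRegression()
    model.fit(X, y)
    return model

def score_model_sketch(model, X, y, Xv, yv):
    # .score() returns R^2; report it for both the training and validation sets.
    return (model.score(X, y), model.score(Xv, yv))

def residuals_sketch(features, y, model):
    # Ground truth minus prediction, flattened so the shapes line up.
    return np.ravel(y) - np.ravel(model.predict(features))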
def prep_dset(data, col, vec=None):
    # Convert the clusters to strings.
    new_data = data
    new_data['cluster'] = new_data['cluster'].apply(str)
    # Encode the data with OneHot Encoding.
    new_data, vec = one_hot_dataframe(new_data, ['country1', 'cluster'], vec)
    # Eliminate features we don't want to use in the model.
    badcols = ['country2', 'country3', 'artid', 'key', 'age']
    new_data = new_data.drop(badcols, axis=1)
    new_data = new_data.fillna(new_data.mean())
    return (new_data.drop([col], axis=1), pd.DataFrame(new_data[col]), vec)

train_cats_features, train_cats_response, vec = prep_dset(train, 'plays')
validation_cats_features, validation_cats_response, _ = prep_dset(validation, 'plays', vec)

print fit_model_and_score(train_cats_features, train_cats_response, validation_cats_features, validation_cats_response)

from sklearn import tree

def fit_tree(X, y, depth=10):
    ## TODO: Using the DecisionTreeRegressor, train a model to depth 10.
    pass

def fit_model_and_score_tree(train_features, train_response, val_features, val_response):
    ## TODO: Fit a tree model and report the score on both the training set and the test set.
    pass

print fit_model_and_score_tree(train_basic_features, train_basic_response, validation_basic_features, validation_basic_response)
print fit_model_and_score_tree(train_cats_features, train_cats_response, validation_cats_features, validation_cats_response)

import StringIO
import pydot
from IPython.display import Image

tmodel = fit_tree(train_basic_features, train_basic_response, 3)

def display_tree(tmodel):
    dot_data = StringIO.StringIO()
    tree.export_graphviz(tmodel, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png())

display_tree(tmodel)

tmodel = fit_tree(train_basic_features, train_basic_response, 10)
pd.DataFrame(tmodel.feature_importances_, train_basic_features.columns)
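# --- A minimal sketch of the tree helpers above. ---
# Assumes sklearn's DecisionTreeRegressor; one plausible completion, not the
# assignment's required solution. The *_sketch names are illustrative only.

def fit_tree_sketch(X, y, depth=10):
    model = tree.DecisionTreeRegressor(max_depth=depth)
    model.fit(X, y)
    return model

def fit_model_and_score_tree_sketch(train_features, train_response, val_features, val_response):
    # R^2 on the training set and the validation set, as a tuple.
    model = fit_tree_sketch(train_features, train_response)
    return (model.score(train_features, train_response), model.score(val_features, val_response))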