import pandas as pd

DATA_PATH = "/home/saasbook/datascience-sp14/hw2"  # Make this the /path/to/the/data

def parse_artists_tags(filename):
    df = pd.read_csv(filename, sep="|", names=["ArtistID", "ArtistName", "Tag", "Count"])
    return df

def parse_user_artists_matrix(filename):
    df = pd.read_csv(filename)
    return df

artists_tags = parse_artists_tags(DATA_PATH + "/artists-tags.txt")
user_art_mat = parse_user_artists_matrix(DATA_PATH + "/userart-mat-training.csv")

print "Number of tags %d" % 0  # Change this line. Should be 952803
print "Number of artists %d" % 0  # Change this line. Should be 17119

# TODO: Implement this. You can change the function arguments if necessary.
# Return a data structure that contains (artist id, artist name, top tag) for every artist.
def calculate_top_tag(all_tags):
    pass

top_tags = calculate_top_tag(artists_tags)

# Print the top tag for Nirvana.
# Artist ID for Nirvana is 5b11f4ce-a62d-471e-81fc-a69a8278c7da
# Should be 'Grunge'
print "Top tag for Nirvana is %s" % "TODO: Tag goes here"  # Complete this line

def create_user_matrix(input_data):
    pass

user_np_matrix = create_user_matrix(user_art_mat)
print user_np_matrix.shape  # Should be (17119, 846)

%%time
from sklearn.cluster import KMeans
# Run K-means using 5 cluster centers on user_np_matrix.
kmeans_5 = None

%%time
kmeans_25 = None

%%time
kmeans_50 = None

print "Inertia for KMeans with 5 clusters = %lf" % 0.0
print "Inertia for KMeans with 25 clusters = %lf" % 0.0
print "Inertia for KMeans with 50 clusters = %lf" % 0.0

from sklearn.metrics import silhouette_score

# NOTE: Use 500 sample points to calculate the silhouette score.
def get_silhouette_score(data, model):
    pass

print "Silhouette Score for KMeans with 5 clusters = %lf" % 0.0
print "Silhouette Score for KMeans with 25 clusters = %lf" % 0.0
print "Silhouette Score for KMeans with 50 clusters = %lf" % 0.0

# Return a data structure that contains (artist_id, artist_name, top tag, cluster_label) for every artist.
def join_tags_labels(artists_data, user_data, kmeans_model):
    pass

# Run the function for all the models.
kmeans_5_joined = None
kmeans_25_joined = None
kmeans_50_joined = None

# Return a data structure that contains (cluster_id, list of top 5 tags) for every cluster.
def assign_cluster_tags(joined_data):
    pass

kmeans_5_genres = None
kmeans_25_genres = None
kmeans_50_genres = None

def get_cluster_purity(joined_data):
    pass

print "Purity for KMeans with 5 centers %lf" % 0.0
print "Purity for KMeans with 25 centers %lf" % 0.0
print "Purity for KMeans with 50 centers %lf" % 0.0

def get_accuracy(joined_data):
    pass

print "Accuracy of KMeans with 5 centers %lf" % 0.0
print "Accuracy of KMeans with 25 centers %lf" % 0.0
print "Accuracy of KMeans with 50 centers %lf" % 0.0

user_art_mat_test = parse_user_artists_matrix(DATA_PATH + "/userart-mat-test.csv")
# NOTE: astype(float) converts integers to floats here.
user_np_matrix_test = create_user_matrix(user_art_mat_test).astype(float)
user_np_matrix_test.shape  # Should be (1902, 846)

# For every artist, return a list of labels.
def predict_cluster(test_data, test_np_matrix, kmeans_model):
    pass

# Call the function for every model from before.
kmeans_5_predicted = None
kmeans_25_predicted = None
kmeans_50_predicted = None

# Calculate recall for our predictions.
def verify_predictions(predicted_artist_labels, cluster_genres, top_tag_data):
    pass

# Use verify_predictions for every model.
print "Recall of KMeans with 5 centers %lf" % 0.0
print "Recall of KMeans with 25 centers %lf" % 0.0
print "Recall of KMeans with 50 centers %lf" % 0.0
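# --- A minimal sketch of how a few of the stubs above could be filled in. ---
# This is one possible approach under stated assumptions, not the assignment's
# required solution: it assumes artists_tags has the columns parsed above, and
# that user_art_mat's first column is the artist key with the remaining columns
# holding per-user play counts. The *_sketch names are illustrative only.

def calculate_top_tag_sketch(all_tags):
    # For each artist, keep the row whose tag has the highest Count.
    idx = all_tags.groupby("ArtistID")["Count"].idxmax()
    return all_tags.loc[idx, ["ArtistID", "ArtistName", "Tag"]]

def create_user_matrix_sketch(input_data):
    # Drop the (assumed) artist-key column and hand the rest to numpy.
    return input_data.drop(input_data.columns[0], axis=1).values

def get_silhouette_score_sketch(data, model):
    # silhouette_score can subsample internally via sample_size.
    return silhouette_score(data, model.labels_, sample_size=500)

# Fitting a model would then look like KMeans(n_clusters=5).fit(user_np_matrix);
# the fitted object exposes .inertia_, .labels_, and .predict().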
from sklearn.decomposition import RandomizedPCA
import numpy as np

sample_percent = 0.20
rows_to_sample = int(np.ceil(sample_percent * user_np_matrix.shape[0]))
sampled_data = user_np_matrix[np.random.choice(user_np_matrix.shape[0], rows_to_sample, replace=False), :]

# Return the data reduced to 2 principal components.
def get_reduced_data(input_data):
    pass

user_np_2d = get_reduced_data(sampled_data)

# TODO: Write code to fit and plot reduced_data.
import pylab as pl
%pylab inline

import pandas as pd
train = pd.read_csv(DATA_PATH + "/train_model_data.csv")
validation = pd.read_csv(DATA_PATH + "/validation_model_data.csv")

# TODO: Your commands for data exploration here.

# TODO: Your commands to generate a scatter plot here.

def basic_prep(data, col):
    # TODO - Make a copy of the original dataset with the categorical variables
    # removed. "cluster" should be treated as a categorical variable and removed
    # as well; make use of pandas' ".drop" function.
    # TODO - Impute missing values with the means of those columns; use pandas'
    # ".fillna" function to accomplish this.
    pass

# This will create two new data frames: one that contains training data (in this
# case, all the numeric columns) and one that contains response data (in this
# case, the "plays" column).
train_basic_features, train_basic_response = basic_prep(train, 'plays')
validation_basic_features, validation_basic_response = basic_prep(validation, 'plays')

from sklearn import linear_model

def fit_model(X, y):
    # TODO - Write a function that fits a linear model to a dataset given a
    # column of values to predict.
    pass

def score_model(model, X, y, Xv, yv):
    # TODO - Write a function that returns the scores of a model given its
    # training features and response and its validation features and response.
    # The output should be a tuple of two model scores.
    pass

def fit_model_and_score(data, response, validation, val_response):
    # TODO - Given a training dataset, a validation dataset, and the name of a
    # column to predict, return the model's score on the training data *and*
    # the validation data, via the model's ".score()" method, as a tuple of
    # two doubles.
    pass
# END TODO

print fit_model_and_score(train_basic_features, train_basic_response, validation_basic_features, validation_basic_response)

model = fit_model(train_basic_features, train_basic_response)

def residuals(features, y, model):
    # TODO - Write a function that calculates model residuals given input
    # features, ground truth, and the model.
    pass

# TODO - Plot the histogram of the residuals of your current model.

from sklearn import linear_model
# TODO - Using what you built above, build a model using the log of the number
# of plays as the response variable.

# TODO - Plot the residuals of your log model. Note: we want to see these on a
# "plays" scale, not a "log(plays)" scale!

from sklearn import feature_extraction

def one_hot_dataframe(data, cols, vec=None):
    """
    Takes a dataframe and a list of columns that need to be encoded.
    Returns a tuple comprising the data and the fitted vectorizer.
    Based on https://gist.github.com/kljensen/5452382
    """
    if vec is None:
        vec = feature_extraction.DictVectorizer()
        vec.fit(data[cols].to_dict(outtype='records'))
    vecData = pd.DataFrame(vec.transform(data[cols].to_dict(outtype='records')).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    data = data.drop(cols, axis=1)
    data = data.join(vecData)
    return (data, vec)
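# --- A minimal sketch of the helpers stubbed out above. ---
# One plausible completion assuming the standard sklearn APIs of the time
# (RandomizedPCA, LinearRegression); not the assignment's required solution.
# The *_sketch names are illustrative only.

def get_reduced_data_sketch(input_data):
    # Project the sampled rows onto the first two principal components.
    return RandomizedPCA(n_components=2).fit_transform(input_data)

def fit_model_sketch(X, y):
    model = linear_model.LinearRegression()
    model.fit(X, y)
    return model

def score_model_sketch(model, X, y, Xv, yv):
    # .score() returns R^2; report it for both the training and validation sets.
    return (model.score(X, y), model.score(Xv, yv))

def residuals_sketch(features, y, model):
    # Ground truth minus prediction, flattened so the shapes line up.
    return np.ravel(y) - np.ravel(model.predict(features))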
def prep_dset(data, col, vec=None):
    # Convert the clusters to strings.
    new_data = data
    new_data['cluster'] = new_data['cluster'].apply(str)
    # Encode the data with OneHot Encoding.
    new_data, vec = one_hot_dataframe(new_data, ['country1', 'cluster'], vec)
    # Eliminate features we don't want to use in the model.
    badcols = ['country2', 'country3', 'artid', 'key', 'age']
    new_data = new_data.drop(badcols, axis=1)
    new_data = new_data.fillna(new_data.mean())
    return (new_data.drop([col], axis=1), pd.DataFrame(new_data[col]), vec)

train_cats_features, train_cats_response, vec = prep_dset(train, 'plays')
validation_cats_features, validation_cats_response, _ = prep_dset(validation, 'plays', vec)

print fit_model_and_score(train_cats_features, train_cats_response, validation_cats_features, validation_cats_response)

from sklearn import tree

def fit_tree(X, y, depth=10):
    ## TODO: Using the DecisionTreeRegressor, train a model to depth 10.
    pass

def fit_model_and_score_tree(train_features, train_response, val_features, val_response):
    ## TODO: Fit a tree model and report the score on both the training set and the test set.
    pass

print fit_model_and_score_tree(train_basic_features, train_basic_response, validation_basic_features, validation_basic_response)
print fit_model_and_score_tree(train_cats_features, train_cats_response, validation_cats_features, validation_cats_response)

import StringIO
import pydot
from IPython.display import Image

tmodel = fit_tree(train_basic_features, train_basic_response, 3)

def display_tree(tmodel):
    dot_data = StringIO.StringIO()
    tree.export_graphviz(tmodel, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png())

display_tree(tmodel)

tmodel = fit_tree(train_basic_features, train_basic_response, 10)
pd.DataFrame(tmodel.feature_importances_, train_basic_features.columns)
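# --- A minimal sketch of the tree helpers above. ---
# Assumes sklearn's DecisionTreeRegressor; one plausible completion, not the
# assignment's required solution. The *_sketch names are illustrative only.

def fit_tree_sketch(X, y, depth=10):
    model = tree.DecisionTreeRegressor(max_depth=depth)
    model.fit(X, y)
    return model

def fit_model_and_score_tree_sketch(train_features, train_response, val_features, val_response):
    # R^2 on the training set and the validation set, as a tuple.
    model = fit_tree_sketch(train_features, train_response)
    return (model.score(train_features, train_response), model.score(val_features, val_response))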