%matplotlib inline
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# set some nicer defaults for matplotlib
from matplotlib import rcParams

# these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable the visible ones
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()


api_key = 'PUT YOUR KEY HERE'
movie_id = '770672122'  # Toy Story 3
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id

# these are "GET parameters"
options = {'review_type': 'top_critic', 'page_limit': 20, 'page': 1, 'apikey': api_key}

data = requests.get(url, params=options).text
data = json.loads(data)  # load a JSON string into a collection of lists and dicts

print json.dumps(data['reviews'][0], indent=2)  # dump an object into a JSON string

from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt)  # treat a string like a file
movies = pd.read_csv(movie_file, delimiter='\t')

# print the first row
movies[['id', 'title', 'imdbID', 'year']].irow(0)

"""
Function
--------
fetch_reviews(movies, row)

Use the Rotten Tomatoes web API to fetch reviews for a particular movie

Parameters
----------
movies : DataFrame
    The movies data above
row : int
    The row of the movies DataFrame to use

Returns
-------
If you can match the IMDB id to a Rotten Tomatoes ID:
    A DataFrame, containing the first 20 Top Critic reviews for the movie.
    If a movie has fewer than 20 total reviews, return them all.
    This should have the following columns:
        critic      : Name of the critic
        fresh       : 'fresh' or 'rotten'
        imdb        : IMDB id for the movie
        publication : Publication that the critic writes for
        quote       : string containing the movie review quote
        review_date : Date of review
        rtid        : Rotten Tomatoes ID for the movie
        title       : Name of the movie

If you cannot match the IMDB id to a Rotten Tomatoes ID, return None

Examples
--------
>>> reviews = fetch_reviews(movies, 0)
>>> print len(reviews)
20
>>> print reviews.irow(1)
critic                                             Derek Adams
fresh                                                    fresh
imdb                                                    114709
publication                                           Time Out
quote           So ingenious in concept, design and execution...
review_date                                         2009-10-04
rtid                                                      9559
title                                                Toy story
Name: 1, dtype: object
"""
#your code here

"""
Function
--------
build_table

Parameters
----------
movies : DataFrame
    The movies data above
rows : int
    The number of rows to extract reviews for

Returns
-------
A DataFrame
    The data obtained by repeatedly calling `fetch_reviews` on the first
    `rows` of `movies`, discarding the `None`s, and concatenating the
    results into a single DataFrame
"""
#your code here
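# A possible sketch for the two stubs above. The movie_alias.json endpoint,
# the zero-padded id format, and the review field names (critic, freshness,
# publication, quote, date) are assumptions based on the retired Rotten
# Tomatoes API, not part of the assignment spec; rt_id_by_imdb is a
# hypothetical helper.

def rt_id_by_imdb(imdb):
    """Map an IMDB id to a Rotten Tomatoes id, or return None if no match"""
    url = 'http://api.rottentomatoes.com/api/public/v1.0/movie_alias.json'
    options = {'id': '%07i' % imdb, 'type': 'imdb', 'apikey': api_key}
    data = json.loads(requests.get(url, params=options).text)
    return data.get('id')  # missing on an error response


def fetch_reviews(movies, row):
    m = movies.irow(row)
    rtid = rt_id_by_imdb(m.imdbID)
    if rtid is None:
        return None

    url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % rtid
    options = {'review_type': 'top_critic', 'page_limit': 20,
               'page': 1, 'apikey': api_key}
    data = json.loads(requests.get(url, params=options).text)
    reviews = data.get('reviews')
    if reviews is None:
        return None

    # one row per review, with the movie-level fields repeated
    return pd.DataFrame([{'critic': r['critic'],
                          'fresh': r['freshness'],
                          'imdb': m.imdbID,
                          'publication': r['publication'],
                          'quote': r['quote'],
                          'review_date': r['date'],
                          'rtid': rtid,
                          'title': m.title} for r in reviews])


def build_table(movies, rows):
    # fetch each movie's reviews, drop the failed lookups, and stack the rest
    dfs = [fetch_reviews(movies, r) for r in range(rows)]
    dfs = [d for d in dfs if d is not None]
    return pd.concat(dfs, ignore_index=True)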
#you can toggle which lines are commented, if you want to re-load your
#results and avoid repeatedly calling this function

#critics = build_table(movies, 3000)
#critics.to_csv('critics.csv', index=False)
critics = pd.read_csv('critics.csv')

#for this assignment, let's drop rows with missing data
critics = critics[~critics.quote.isnull()]
critics = critics[critics.fresh != 'none']
critics = critics[critics.quote.str.len() > 0]

assert set(critics.columns) == set('critic fresh imdb publication '
                                   'quote review_date rtid title'.split())
assert len(critics) > 10000

#your code here

#Your code here

#Your code here

#Your code here

#Your code here

from sklearn.feature_extraction.text import CountVectorizer

text = ['Hop on pop', 'Hop off pop', 'Hop Hop hop']
print "Original text is\n", '\n'.join(text)

vectorizer = CountVectorizer(min_df=0)

# call `fit` to build the vocabulary
vectorizer.fit(text)

# call `transform` to convert text to a bag of words
x = vectorizer.transform(text)

# CountVectorizer uses a sparse array to save memory, but it's easier in this
# assignment to convert back to a "normal" numpy array
x = x.toarray()

print
print "Transformed text vector is \n", x

# `get_feature_names` tracks which word is associated with each column of the
# transformed x
print
print "Words for each feature:"
print vectorizer.get_feature_names()

# Notice that the bag-of-words treatment doesn't preserve information about
# the *order* of words, just their frequency

#hint: consult the scikit-learn documentation to learn what these classes do
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB

"""
Function
--------
make_xy

Build a bag-of-words training set for the review data

Parameters
----------
critics : Pandas DataFrame
    The review data from above
vectorizer : CountVectorizer object (optional)
    A CountVectorizer object to use. If None, then create and fit a new
    CountVectorizer. Otherwise, re-fit the provided CountVectorizer using
    the critics data

Returns
-------
X : numpy array (dims: nreview, nwords)
    Bag-of-words representation for each review.
Y : numpy array (dims: nreview)
    1/0 array. 1 = fresh review, 0 = rotten review

Examples
--------
X, Y = make_xy(critics)
"""
def make_xy(critics, vectorizer=None):
    #Your code here

X, Y = make_xy(critics)

#Your code here

# Your code here. Print the accuracy on the test and training datasets
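# A minimal sketch of make_xy and the accuracy check above. It assumes fresh
# reviews are marked by the string 'fresh' in the fresh column, and uses
# train_test_split's default 75/25 split, a choice rather than part of
# the spec.

def make_xy(critics, vectorizer=None):
    if vectorizer is None:
        vectorizer = CountVectorizer(min_df=0)
    # (re-)fit the vocabulary on the quotes, then densify as above
    X = vectorizer.fit_transform(critics.quote)
    X = X.toarray()
    Y = (critics.fresh == 'fresh').values.astype(int)
    return X, Y

X, Y = make_xy(critics)
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

clf = MultinomialNB().fit(xtrain, ytrain)
print "Accuracy on training data: %0.2f" % clf.score(xtrain, ytrain)
print "Accuracy on test data:     %0.2f" % clf.score(xtest, ytest)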
"""
Function
--------
calibration_plot

Builds a plot like the one above, from a classifier and review data

Inputs
------
clf : Classifier object
    A MultinomialNB classifier
X : (Nexample, Nfeature) array
    The bag-of-words data
Y : (Nexample) integer array
    1 if a review is Fresh
"""
#your code here

calibration_plot(clf, xtest, ytest)

"""
Function
--------
log_likelihood

Compute the log likelihood of a dataset according to a Bayesian classifier.
The log likelihood is defined by

    L = Sum_fresh(log P(fresh)) + Sum_rotten(log P(rotten))

where Sum_fresh indicates a sum over all fresh reviews, and Sum_rotten
indicates a sum over rotten reviews

Parameters
----------
clf : Bayesian classifier
x : (nexample, nfeature) array
    The input data
y : (nexample) integer array
    Whether each review is Fresh
"""
#your code here

from sklearn.cross_validation import KFold

def cv_score(clf, x, y, score_func):
    """
    Uses 5-fold cross-validation to estimate a score of a classifier

    Inputs
    ------
    clf : Classifier object
    x : Input feature vector
    y : Input class labels
    score_func : Function like log_likelihood, that takes (clf, x, y)
                 as input and returns a score

    Returns
    -------
    The average score obtained by randomly splitting (x, y) into training
    and test sets, fitting on the training set, and evaluating score_func
    on the test set

    Examples
    --------
    cv_score(clf, x, y, log_likelihood)
    """
    result = 0
    nfold = 5
    for train, test in KFold(y.size, nfold):  # split data into train/test groups, 5 times
        clf.fit(x[train], y[train])  # fit on the training set
        result += score_func(clf, x[test], y[test])  # evaluate score_func on the held-out data
    return result / nfold  # average

# As a side note, this function is built in to the newest version of sklearn.
# We could just write
# sklearn.cross_validation.cross_val_score(clf, x, y, scorer=log_likelihood)

#the grid of parameters to search over
alphas = [0, .1, 1, 5, 10, 50]
min_dfs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

#Find the best value for alpha and min_df, and the best classifier
best_alpha = None
best_min_df = None
max_loglike = -np.inf

for alpha in alphas:
    for min_df in min_dfs:
        vectorizer = CountVectorizer(min_df=min_df)
        X, Y = make_xy(critics, vectorizer)
        #your code here

print "alpha: %f" % best_alpha
print "min_df: %f" % best_min_df

#Your code here

#Your code here

#Your code here
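# A hedged sketch of the log_likelihood stub and the grid-search loop above.
# It relies on make_xy's 0/1 labels, so that column 0 of predict_log_proba
# is log P(rotten) and column 1 is log P(fresh); the loop restates the
# skeleton above with the search body filled in.

def log_likelihood(clf, x, y):
    """L = Sum_fresh(log P(fresh)) + Sum_rotten(log P(rotten))"""
    prob = clf.predict_log_proba(x)
    rotten = (y == 0)
    fresh = ~rotten
    return prob[rotten, 0].sum() + prob[fresh, 1].sum()

best_alpha, best_min_df, max_loglike = None, None, -np.inf

for alpha in alphas:
    for min_df in min_dfs:
        vectorizer = CountVectorizer(min_df=min_df)
        X, Y = make_xy(critics, vectorizer)

        # score this (alpha, min_df) pair by cross-validated log likelihood
        clf = MultinomialNB(alpha=alpha)
        loglike = cv_score(clf, X, Y, log_likelihood)
        if loglike > max_loglike:
            max_loglike = loglike
            best_alpha, best_min_df = alpha, min_df

print "alpha: %f" % best_alpha
print "min_df: %f" % best_min_df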