# This script downloads some of my research group's software from the web,
# loads the "movie body count" data set and plots it.
import sys
import urllib
import zipfile

import numpy as np  # we need the numpy library for the logarithm
import pandas as pd  # the pandas library for data analysis

# urlretrieve lives in urllib.request on Python 3 and directly in urllib on
# Python 2; resolve it once so the rest of the script works on either.
try:
    from urllib.request import urlretrieve
except ImportError:
    urlretrieve = urllib.urlretrieve

# First download the zip file from github.
urlretrieve('https://github.com/sods/ods/archive/master.zip', 'master.zip')

# Now use the zip file library to unzip the software. The context manager
# closes the archive afterwards, and the handle is not called `zip` so the
# builtin of that name stays usable in the rest of the file.
with zipfile.ZipFile('./master.zip', 'r') as archive:
    archive.extractall('.')

# Finally add the location of the new software to the python path, which is
# where the interpreter looks for modules to import.
sys.path.append("./ods-master/")
import pods  # Our software for python open data science (pods)

d = pods.datasets.movie_body_count()
movies = d['Y']

# '%matplotlib inline' is an IPython magic that makes plots appear in the
# notebook. It is not valid Python syntax, so it is invoked through the
# IPython API, and skipped when this file is run as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')  # noqa: F821
except NameError:
    # get_ipython only exists inside IPython/Jupyter sessions.
    pass

# We import this library for plotting; we use it to create a scatter plot of
# 'body count' per film per year.
import pylab as plt

fig, ax = plt.subplots(figsize=(8, 8))  # create an axis to put the plot on.
ax.plot(movies.Year, movies.Body_Count, 'ro')  # plot the bodies vs the year.
ax.set_xlabel('year')  # always label your axes!
ax.set_ylabel('body count')
ax.set_title('Deaths in Films from 1949 to 2009')

# Same plot again, with the body count on a base-10 logarithmic scale.
fig, ax = plt.subplots(figsize=(8, 8))  # create an axis to put the plot on.
ax.plot(movies.Year, np.log10(movies.Body_Count), 'ro')
ax.set_xlabel('year')  # always label your axes!
ax.set_ylabel('log(body count)')
ax.set_title('Deaths in Films from 1949 to 2009')

# This command sorts movies by the 'Body_Count' column in descending order
# (ascending=False). It returns a new, sorted frame and leaves `movies`
# untouched; in a notebook the result is displayed as the cell output.
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...)
# is its replacement.
movies.sort_values(by='Body_Count', ascending=False)

# plotting cosine
tau = 2*np.pi  # tau is 2 times pi.
import os

import numpy as np
import pandas as pd  # the data analysis library

# This gives us 100 points in *radians* (in degrees they'd be from -180 to
# 180). `tau` (2 times pi) is defined just before this section.
theta = np.linspace(-0.5*tau, 0.5*tau, 100)
f = np.cos(theta)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(theta, f, 'r-')
plt.xlabel('$\\theta$')
plt.ylabel('$\\cos(\\theta)$')

url = 'https://www.dropbox.com/s/i4o4quo0oplq62m/movies.csv?dl=0&raw=1'
# I have a python script on my computer that collates all your ratings. Then
# it writes the results to a dropbox directory. These lines load that result
# from the dropbox directory.
pods.util.download_url(url, store_directory='ambassadors_movie',
                       save_name='movies.csv')
movies = pd.read_csv(
    os.path.join(pods.datasets.data_path, 'ambassadors_movie', 'movies.csv')
).set_index('index')

# First let's extract the names in movies.csv that have rated the data. The
# first nine columns are information about the movie; every other column is
# a user. Filtering the columns in their original order (rather than
# converting a set difference to a list) keeps `user_names`, and therefore
# the row order of U and of the melted ratings, the same on every run.
info_columns = set(movies.columns[:9])
user_names = [name for name in movies.columns if name not in info_columns]

q = 2  # the dimension of our map of the 'library'
counter = 0  # a counter for storing how many updates we've done.

# Start every user and movie at a small random position in the map.
init_u = np.random.normal(size=(len(user_names), q))*0.001
init_v = np.random.normal(size=(len(movies.index), q))*0.001
U = pd.DataFrame(init_u, index=user_names)
V = pd.DataFrame(init_v, index=movies.index)
print(U)

# This command takes the columns 'Film' and 'index' and creates a new
# 'triplet structure' (one row per movie/user pair, with the user's name in
# 'user' and their score in 'rating').
Y = pd.melt(movies.reset_index(),
            id_vars=['Film', 'index'],
            var_name='user',
            value_name='rating',
            value_vars=user_names)
# This command renames the column titled 'index' with the title 'item'.
Y = Y.rename(columns={'index': 'item'})
# This command removes all the missing values.
import sys

import pods

# Drop every (movie, user) row that has no rating.
Y = Y.dropna(axis=0)
Y  # in a notebook this displays the triplet table

Y['rating'].hist()
plt.title('Histogram of Ratings')

# Centre the ratings so that they have zero mean.
Y['rating'] = Y['rating'] - Y['rating'].mean()


def objective(Y_vals, U_vals, V_vals):
    """Return the sum of squared errors between predictions and ratings.

    Y_vals is a DataFrame with 'user', 'item' and 'rating' columns. U_vals
    and V_vals are DataFrames whose rows are looked up by user label and
    item label respectively. The prediction for a row of Y_vals is the inner
    product of the matching user row and item row.

    A KeyError is raised if a user or item in Y_vals has no row in
    U_vals / V_vals.
    """
    # Gather one user vector and one item vector per rating, in rating order,
    # so all predictions come from a single element-wise product.
    user_vecs = U_vals.loc[Y_vals['user'].values].values
    item_vecs = V_vals.loc[Y_vals['item'].values].values
    predictions = (user_vecs * item_vecs).sum(axis=1)  # vTu for each rating
    diff = predictions - Y_vals['rating'].values  # vTu - y
    return np.dot(diff, diff)


def _run_sgd(ratings, U_vals, V_vals, learn_rate, epochs, report_every,
             counter=0, objective_ratings=None):
    """Run stochastic gradient descent over `ratings`, updating U/V in place.

    ratings           -- DataFrame with 'user', 'item' and 'rating' columns.
    U_vals, V_vals    -- DataFrames of user / item vectors; their rows are
                         overwritten as the updates proceed.
    learn_rate        -- step size applied to each update.
    epochs            -- number of complete passes over `ratings`.
    report_every      -- print the objective each time this many updates
                         have been done.
    counter           -- number of updates already done before this call.
    objective_ratings -- ratings used when reporting the objective; defaults
                         to all of `ratings`.

    Returns the updated counter.
    """
    if objective_ratings is None:
        objective_ratings = ratings
    # itertuples keeps each column's own dtype, so integer ids are used as
    # integer labels rather than being converted to float by iterrows.
    triplets = ratings[['user', 'item', 'rating']]
    for _ in range(epochs):
        # loop across the ratings
        for user, film, y in triplets.itertuples(index=False):
            # get u and v out from storage.
            u = U_vals.loc[user].values
            v = V_vals.loc[film].values
            # compute the update for u
            u = u + learn_rate*v*(y - np.dot(u, v))
            # compute the update for v; this uses the u updated just above
            v = v + learn_rate*u*(y - np.dot(u, v))
            # put u and v back in storage.
            U_vals.loc[user] = u
            V_vals.loc[film] = v
            counter += 1
            if counter % report_every == 0:
                obj = objective(objective_ratings, U_vals, V_vals)
                print("Update: {} Objective: {}".format(counter, obj))
    return counter


learn_rate = 0.1
counter = 0
epochs = 100
counter = _run_sgd(Y, U, V, learn_rate, epochs, report_every=1000,
                   counter=counter)

# create a figure axis
fig, axes = plt.subplots(figsize=(8, 8))
# plot the users, and label the points with your names
axes.plot(U[0], U[1], 'rx')
for name in U.index:
    axes.text(U.loc[name, 0], U.loc[name, 1], name)

# create a new figure axis
fig, axes = plt.subplots(figsize=(8, 8))
axes.plot(V[0], V[1], 'rx')
# Count the ratings per movie once instead of rescanning Y for every movie.
ratings_per_item = Y['item'].value_counts()
for index in V.index:
    # display the movie name if it was rated more than 2 times.
    if ratings_per_item.get(index, 0) > 2:
        axes.text(V.loc[index, 0], V.loc[index, 1], movies.loc[index, 'Film'])

# Now repeat the exercise on the MovieLens 100k data.
d = pods.datasets.movielens100k()
film_info = d['film_info']
Y_lens = d['Y']
Y_lens.describe()  # in a notebook this displays summary statistics

# Centre the ratings so that they have zero mean.
Y_lens['rating'] = Y_lens['rating'] - Y_lens['rating'].mean()

# Bracket access is used for the columns: the name 'item' is easily confused
# with (or shadowed by) an attribute of the frame itself.
user_index = Y_lens['user'].unique()
item_index = Y_lens['item'].unique()

# `q` (the dimension of the map) is set earlier in the file.
init_u = np.random.normal(size=(len(user_index), q))*0.001
init_v = np.random.normal(size=(len(item_index), q))*0.001
U_lens = pd.DataFrame(init_u, index=user_index)
V_lens = pd.DataFrame(init_v, index=item_index)

learn_rate = 0.01
counter = 0
Y_lens  # in a notebook this displays the ratings table

# warning, this will take a lot longer than the example above!
# if you start it accidentally and want it to stop, use Kernel->Interrupt
# from the notebook menu.
epochs = 5
# Let's not compute the entire objective, it would take too long. Select
# every 4000th sample. Y_lens does not change during training, so the
# subsample is built once here rather than at every report.
objective_sample = Y_lens.loc[Y_lens.index[::4000]]
counter = _run_sgd(Y_lens, U_lens, V_lens, learn_rate, epochs,
                   report_every=100000, counter=counter,
                   objective_ratings=objective_sample)

# create a new figure axis
fig, axes = plt.subplots(figsize=(12, 12))
lens_ratings_per_item = Y_lens['item'].value_counts()
for index in V_lens.index:
    # plot the movie name if it was rated more than 2000 times.
    # NOTE(review): if this data set holds about 100k ratings in total, very
    # few (possibly no) movies will pass a threshold of 2000 and the figure
    # may come out empty -- confirm the intended threshold.
    if lens_ratings_per_item.get(index, 0) > 2000:
        axes.plot(V_lens[0][index], V_lens[1][index], 'rx')
        axes.text(V_lens[0][index], V_lens[1][index],
                  film_info['title'][index])