# this python code downloads some of my research group's software from the web

# First fetch the zip archive of the software from GitHub. urlretrieve lives in
# `urllib` on Python 2 and in `urllib.request` on Python 3, so try both.
import urllib
try:
    from urllib.request import urlretrieve  # Python 3 location
except ImportError:
    from urllib import urlretrieve  # Python 2 location
urlretrieve('https://github.com/sods/ods/archive/master.zip', 'master.zip')

# Now use the zip file library to unpack the software into the current
# directory. The `with` block closes the archive once extraction is done, and
# the handle is deliberately not called `zip`, so the built-in zip() is not
# shadowed for the rest of the file.
import zipfile
with zipfile.ZipFile('./master.zip', 'r') as archive:
    archive.extractall('.')

# Finally use the sys library to add the location of the new software to the
# Python path, which is where `import` looks for modules.
import sys
sys.path.append("./ods-master/")

import pods # Our software for python open data science (pods)
import pandas as pd # the pandas library for data analysis

# Load the movie body count data set; the entry stored under 'Y' is the table
# whose Year and Body_Count columns are plotted below.
d = pods.datasets.movie_body_count()
movies = d['Y']

# Ask IPython to show plots inline in the notebook. The literal magic
# `%matplotlib inline` is a syntax error in a plain .py file, so call the
# equivalent API instead, and skip it when not running under IPython (where
# get_ipython is not defined).
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # plain Python interpreter: nothing to configure

# we import this library for plotting, we use it to create a scatter plot of 'body count' per film per year.
import pylab as plt

fig, ax = plt.subplots(figsize=(8,8)) # create an axis to put the plot on.

ax.plot(movies.Year, movies.Body_Count, 'ro') # plot the bodies vs the year.

plt.xlabel('year')   # always label your axes!
plt.ylabel('body count')
plt.title('Deaths in Films from 1949 to 2009')

import numpy as np # numpy supplies the base-10 logarithm used below

# Scatter plot of body count against year, with the count on a log10 scale.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(movies['Year'], np.log10(movies['Body_Count']), 'ro')

# Label via the axis object; it was just created, so it is the current axis
# that the pylab-level labelling functions would act on as well.
ax.set_xlabel('year')
ax.set_ylabel('log(body count)')
ax.set_title('Deaths in Films from 1949 to 2009')

# Sort movies by the 'Body_Count' column in descending order (ascending=False).
# DataFrame.sort(columns=...) has been removed from pandas; sort_values(by=...)
# is its replacement. The call returns a new, sorted frame (shown as the cell
# output in a notebook) and leaves `movies` itself unchanged.
movies.sort_values(by='Body_Count', ascending=False)


# Plot one full period of the cosine function.
tau = 2 * np.pi  # one full turn, in radians

# 100 evenly spaced angles from -tau/2 to +tau/2 radians (-180 to 180 degrees).
theta = np.linspace(-tau / 2, tau / 2, num=100)
f = np.cos(theta)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(theta, f, 'r-')
ax.set_xlabel(r'$\theta$')
ax.set_ylabel(r'$\cos(\theta)$')

url = 'https://www.dropbox.com/s/i4o4quo0oplq62m/movies.csv?dl=0&raw=1'
import pandas as pd
import os
# I have a python script on my computer that collates all your ratings. Then it writes the results to a dropbox directory.
# These lines download that result and load it, indexed by its 'index' column.
pods.util.download_url(url, store_directory = 'ambassadors_movie', save_name='movies.csv')
movies = pd.read_csv(os.path.join(pods.datasets.data_path, 'ambassadors_movie', 'movies.csv')).set_index('index')
# The first nine columns are treated as film information; every remaining
# column is taken to be one user's ratings. Selecting them in file order
# (rather than via a set difference, whose iteration order is arbitrary) gives
# the same names in an order that is the same on every run.
film_columns = set(movies.columns[:9])
user_names = [column for column in movies.columns if column not in film_columns]


# import the data analysis library
import pandas as pd
q = 2 # the dimension of our map of the 'library'
counter = 0 # a counter for storing how many updates we've done.
import numpy as np
# Initialise with small random values (standard normal draws scaled by 0.001):
# one row of length q for each user in U and for each film in V.
init_u = np.random.normal(size=(len(user_names), q))*0.001
init_v = np.random.normal(size=(len(movies.index), q))*0.001
U = pd.DataFrame(init_u, index=user_names)
V = pd.DataFrame(init_v, index=movies.index)

# A parenthesised single argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print(U)

# Turn the wide ratings table (one column per name in user_names) into long
# 'triplet' form: one row per (film, user) pair, keeping 'Film' and 'index' as
# identifiers and putting the score in 'rating'. Then rename 'index' to
# 'item' and drop every row that contains a missing value.
Y = (
    pd.melt(
        movies.reset_index(),
        id_vars=['Film', 'index'],
        value_vars=user_names,
        var_name='user',
        value_name='rating',
    )
    .rename(columns={'index': 'item'})
    .dropna(axis=0)
)
Y

# Histogram of the raw ratings.
Y['rating'].hist()
plt.title('Histogram of Ratings')

# Centre the ratings by subtracting their overall mean.
Y['rating'] = Y['rating'] - Y['rating'].mean()

def objective(Y_vals, U_vals, V_vals):
    """Return the sum of squared prediction errors over the ratings in Y_vals.

    Each rating is predicted by the inner product of the matching user row of
    U_vals and film row of V_vals; the result is the sum over all rows of
    (prediction - rating) squared.

    Parameters:
        Y_vals: DataFrame with 'user', 'item' and 'rating' columns, one row
            per rating.
        U_vals: DataFrame of user vectors, looked up by label with the values
            of Y_vals['user'].
        V_vals: DataFrame of film vectors, looked up by label with the values
            of Y_vals['item'].

    Returns:
        The total squared error as a float (0. when Y_vals has no rows).

    The computation is vectorised with numpy instead of looping over rows
    with iterrows(), which is slow on large tables; rows of U_vals and V_vals
    are paired positionally, exactly as the per-row dot product did.
    """
    if len(Y_vals) == 0:
        return 0.
    # Gather, for every rating, the user's vector and the film's vector.
    user_vectors = U_vals.loc[Y_vals['user'].values].values
    film_vectors = V_vals.loc[Y_vals['item'].values].values
    # Row-wise inner products: u^T v for each rating.
    predictions = np.sum(user_vectors * film_vectors, axis=1)
    diff = predictions - Y_vals['rating'].values  # u^T v - y
    return np.dot(diff, diff)

learn_rate = 0.1
counter = 0

import sys
epochs = 100
# Stochastic gradient descent: every epoch visits each rating once and nudges
# that rating's user vector and film vector to reduce its squared error.
for i in range(epochs):
    # loop across the ratings
    for ind, series in Y.iterrows():
        # get which film and user and value the rating is
        film = series['item']
        user = series['user']
        y = series['rating']
        # get u and v out from storage.
        u = U.loc[user].values
        v = V.loc[film].values # get the u and v vectors from the matrices.
        # compute the update for u
        u = u + learn_rate*v*(y-np.dot(u,v))
        # compute the update for v (note: this uses the u updated just above)
        v = v + learn_rate*u*(y-np.dot(u,v))
        # put u and v back in storage.
        U.loc[user]=u
        V.loc[film]=v
        counter +=1
        if not counter % 1000: # checks if there is no remainder from dividing counter by 1000.
            obj = objective(Y, U, V)
            # %-formatting gives the same text as the Python 2 statement
            # `print "Update:", counter, "Objective:", obj` and is also valid
            # Python 3.
            print("Update: %s Objective: %s" % (counter, obj))

# create a figure axis
fig, axes = plt.subplots(figsize=(8,8))

# plot the users' positions in the map, and label each point with the user's name
axes.plot(U[0], U[1], 'rx')
for name in U.index:
    axes.text(U[0][name], U[1][name], name)

# create a new figure axis
fig, axes = plt.subplots(figsize=(8,8))

# plot the films' positions in the map
axes.plot(V[0], V[1], 'rx')
# Count the ratings per film once, up front, instead of re-scanning the whole
# of Y for every film inside the loop.
rating_counts = Y['item'].value_counts().to_dict()
for index in V.index:
    # display the movie name if it was rated more than 2 times.
    if rating_counts.get(index, 0) > 2:
        axes.text(V[0][index], V[1][index], movies['Film'][index])

import pods
# Load the MovieLens 100k data set: the film information table and the
# ratings table (stored under 'Y').
d = pods.datasets.movielens100k()
film_info = d['film_info']
Y_lens = d['Y']
Y_lens.describe()

# Centre the ratings by subtracting their overall mean.
Y_lens['rating'] = Y_lens['rating'] - Y_lens['rating'].mean()

# The distinct users and films that appear in the ratings.
user_index = Y_lens['user'].unique()
item_index = Y_lens['item'].unique()

# Small random starting vectors of length q: one per user, then one per film.
init_u = 0.001 * np.random.normal(size=(len(user_index), q))
init_v = 0.001 * np.random.normal(size=(len(item_index), q))
U_lens = pd.DataFrame(init_u, index=user_index)
V_lens = pd.DataFrame(init_v, index=item_index)

# Reset the step size and the update counter for the run on this data set.
learn_rate = 0.01
counter = 0

Y_lens

# warning, this will take a lot longer than the example above!
# if you start it accidentally and want it to stop, use Kernel->Interrupt from the menu above.
import sys
epochs = 5
# Stochastic gradient descent: every epoch visits each rating once and nudges
# that rating's user vector and film vector to reduce its squared error.
for i in range(epochs):
    # loop across the ratings
    for ind, series in Y_lens.iterrows():
        # get which film and user and value the rating is
        film = series['item']
        user = series['user']
        y = series['rating']
        # get u and v out from storage
        u = U_lens.loc[user].values
        v = V_lens.loc[film].values # get the u and v vectors from the matrices.
        # compute the update for u
        u = u + learn_rate*v*(y-np.dot(u,v))
        # compute the update for v (note: this uses the u updated just above)
        v = v + learn_rate*u*(y-np.dot(u,v))
        # put u and v back in storage.
        U_lens.loc[user]=u
        V_lens.loc[film]=v
        counter +=1
        if not counter % 100000: # checks if there is no remainder from dividing counter by 100000.
            # let's not compute the entire objective, it would take too long. Select every 4000th sample.
            obj = objective(Y_lens.loc[Y_lens.index[::4000]], U_lens, V_lens)
            # %-formatting gives the same text as the Python 2 statement
            # `print "Update:", counter, "Objective:", obj` and is also valid
            # Python 3.
            print("Update: %s Objective: %s" % (counter, obj))


# create a new figure axis
fig, axes = plt.subplots(figsize=(12,12))

# Count the ratings per film once, up front, instead of re-scanning the whole
# of Y_lens for every film inside the loop.
rating_counts = Y_lens['item'].value_counts().to_dict()
for index in V_lens.index:
    # plot the movie name if it was rated more than 2000 times.
    # NOTE(review): confirm that some film in this data set has more than 2000
    # ratings; if none does, nothing is drawn on this figure.
    if rating_counts.get(index, 0) > 2000:
        axes.plot(V_lens[0][index], V_lens[1][index], 'rx')
        axes.text(V_lens[0][index], V_lens[1][index], film_info['title'][index])