In [1]:
#this follows chpt2 of
#"Programming Collective Intelligence", by T. Segaran
##ask your friends, weight according to similarity
In [2]:
from scipy import stats
#r,p=stats.pearsonr(xdata,ydata)
#slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
In [3]:
# A dictionary of movie critics and their ratings of a small set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 
 'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 
 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 
 'You, Me and Dupree': 3.5}, 
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
 'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
 'The Night Listener': 4.5, 'Superman Returns': 4.0, 
 'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 
 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.0}, 
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
In [4]:
len(critics), map(len,critics.values())
Out[4]:
(7, [5, 6, 5, 6, 3, 6, 4])
In [5]:
#what has Toby reviewed:
critics['Toby']
Out[5]:
{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}
In [6]:
#look at just the critics who reviewed these two movies
dupree='You, Me and Dupree'
snakes='Snakes on a Plane'
ds={c.split()[-1]:(critics[c][dupree],critics[c][snakes])
    for c in critics if dupree in critics[c] and snakes in critics[c]}
ds
Out[6]:
{'LaSalle': (2.0, 4.0),
 'Matthews': (3.5, 4.0),
 'Puig': (2.5, 3.5),
 'Rose': (2.5, 3.5),
 'Seymour': (3.5, 3.5),
 'Toby': (1.0, 4.5)}
In [7]:
#now plot them
figure(figsize=(5,5))
xlim(0,5)
ylim(0,5)
offset={c:.01 for c in ds}
offset['Rose']=0
offset['Puig']=-.15 #move Puig down to avoid collision
plot([ds[crit][0] for crit in ds],[ds[crit][1] for crit in ds],'o')
for crit in ds: text(ds[crit][0]+.05,ds[crit][1]+offset[crit],crit)
xlabel('dupree')
ylabel('snakes');
In [8]:
#who's close to whom in the above fig?
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
  # Get the list of shared_items
  si=[item for item in prefs[person1] if item in prefs[person2]]

  # if they have no ratings in common, return 0
  if len(si)==0: return 0
    
  v1=array([prefs[person1][item] for item in si])
  v2=array([prefs[person2][item] for item in si])

  # use numpy euclidean distance (sqrt(sum of squares))
  dist=norm(v1-v2)

  #transform to similarity ranging from 0 to 1
  #truncate to three after decimal point
  return float("%.3f" % (1/(1+dist**2))) 
In [9]:
sim_distance(critics,'Lisa Rose','Gene Seymour')
Out[9]:
0.148
In [10]:
#consider a different distance measure,
# Pearson correlation coefficient
#[+1 if correlated, -1 if anticorrelated]
def show_pearson(prefs,c1,c2):
    si=[item for item in prefs[c1] if item in prefs[c2]]

    figure(figsize=(5,5))
    xlim(0,5)
    ylim(0,5)
    xdata = [prefs[c1][item] for item in si]
    ydata = [prefs[c2][item] for item in si]

    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
    print 'pearson r =', '%.2f'%r_value

    xlabel(c1)
    ylabel(c2)
    
    plot(xdata,ydata,'o')
    plot(slope*array(range(0,6))+intercept,'--')
    for item in si:
        text(prefs[c1][item],prefs[c2][item],item)
In [11]:
#two fake critics roughly correlated
fcritics={'critic1':{'Dupree':1,'Night':2.5,'Lady':3,'Snakes':3.5,'Superman':4.5},
         'critic2':{'Dupree':2,'Night':3,'Lady':2.5,'Snakes':3.5,'Superman':3.5}}
show_pearson(fcritics,'critic1','critic2')
pearson r = 0.87
In [12]:
#two from original set not quite as well correlated
show_pearson(critics,'Mick LaSalle','Gene Seymour')
pearson r = 0.41
In [13]:
#now define similarity measure, analogous to sim_distance
def sim_pearson(prefs,c1,c2):
    si=[item for item in prefs[c1] if item in prefs[c2]]
    if len(si)==0: return 0
    xdata = [prefs[c1][item] for item in si]
    ydata = [prefs[c2][item] for item in si]
    r,p=stats.pearsonr(xdata,ydata)
    if isnan(r): return 0
    return float("%.3f"%r)
In [14]:
#note pearson corrects for "grade inflation", unlike euclidean distance
#one can be systematically higher than the other, offset won't matter
In [15]:
#now rank the critics, finding ones with similar taste

# Returns the best matches for person from the prefs dictionary. 
# Number of results and similarity function are optional params.
def topMatches(prefs, person, n=5, similarity=sim_pearson):
  scores=[(other, similarity(prefs,person,other))
                  for other in prefs if other!=person]
  return sorted(scores,key=lambda x:x[1],reverse=True)[:n]
In [16]:
topMatches(critics,'Toby',n=6)
Out[16]:
[('Lisa Rose', 0.991),
 ('Mick LaSalle', 0.924),
 ('Claudia Puig', 0.893),
 ('Jack Matthews', 0.663),
 ('Gene Seymour', 0.381),
 ('Michael Phillips', -1.0)]
In [17]:
# see how topmatches function works using other similarity measure
topMatches(critics,'Toby', n=3, similarity=sim_distance)
Out[17]:
[('Mick LaSalle', 0.308), ('Michael Phillips', 0.286), ('Claudia Puig', 0.235)]
In [ ]:
#but really want recommendation. could use just most similar person, but that person
#might not have seen relevant movie, or might be outlier on particular movie, instead:
In [18]:
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
  totals={}
  simSums={}
  for other in prefs:
    # don't compare me to myself
    if other==person: continue
    sim=similarity(prefs,person,other)
    
    # ignore scores of zero or lower
    if sim<=0: continue
    for item in prefs[other]:

      # only score movies I haven't seen yet
      if item not in prefs[person] or prefs[person][item]==0:
        # Similarity * Score
        if item not in totals:
            totals[item]=0
            simSums[item]=0
        totals[item] += prefs[other][item]*sim
        # Sum of similarities
        simSums[item] += sim

  # Create the normalized list
  rankings=[(item,float("%.3f"%(totals[item]/simSums[item]))) for item in totals]

  # Return the sorted list
  return sorted(rankings,key=lambda x:x[1],reverse=True)
In [19]:
getRecommendations(critics,'Toby')
#also gives likely rating
Out[19]:
[('The Night Listener', 3.348),
 ('Lady in the Water', 2.833),
 ('Just My Luck', 2.531)]
In [20]:
#or use other distance measure
getRecommendations(critics,'Toby',similarity=sim_distance)
Out[20]:
[('The Night Listener', 3.5),
 ('Lady in the Water', 2.756),
 ('Just My Luck', 2.462)]
In [ ]:
#now suppose you want matching products, i.e., amazon "customers have also bought"
In [21]:
#first reverse role of items and objects
def transformPrefs(prefs):
  result={}
  for person in prefs:
    for item in prefs[person]:
      if item not in result: result[item]={}
      
      # Flip item and person
      result[item][person]=prefs[person][item]
  return result
In [22]:
movies=transformPrefs(critics)
movies
Out[22]:
{'Just My Luck': {'Claudia Puig': 3.0,
  'Gene Seymour': 1.5,
  'Lisa Rose': 3.0,
  'Mick LaSalle': 2.0},
 'Lady in the Water': {'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 2.5,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0},
 'Snakes on a Plane': {'Claudia Puig': 3.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 4.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.0,
  'Mick LaSalle': 4.0,
  'Toby': 4.5},
 'Superman Returns': {'Claudia Puig': 4.0,
  'Gene Seymour': 5.0,
  'Jack Matthews': 5.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.5,
  'Mick LaSalle': 3.0,
  'Toby': 4.0},
 'The Night Listener': {'Claudia Puig': 4.5,
  'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 3.0,
  'Michael Phillips': 4.0,
  'Mick LaSalle': 3.0},
 'You, Me and Dupree': {'Claudia Puig': 2.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 3.5,
  'Lisa Rose': 2.5,
  'Mick LaSalle': 2.0,
  'Toby': 1.0}}
In [23]:
#now topmatches gives similar movies rather than similar reviewers
topMatches(movies,'Superman Returns')
Out[23]:
[('You, Me and Dupree', 0.658),
 ('Lady in the Water', 0.488),
 ('Snakes on a Plane', 0.112),
 ('The Night Listener', -0.18),
 ('Just My Luck', -0.423)]
In [24]:
#note negative scores, reviews who like one dislike the other
show_pearson(movies,'Just My Luck','Superman Returns')
pearson r = -0.42
In [25]:
getRecommendations(movies,'Just My Luck')
#find critics for movie ... invite to premiere?
Out[25]:
[('Michael Phillips', 4.0), ('Jack Matthews', 3.0)]
In [26]:
#now apply to de.licio.us data
# python code to interact with delicious api is here
#http://code.google.com/p/pydelicious/source
#svn checkout http://pydelicious.googlecode.com/svn/trunk/ pydelicious
#sudo python setup.py install
from pydelicious import get_popular,get_userposts,get_urlposts
import time
In [27]:
#function to get some users from the most popular posts
def initializeUserDict(tag,count=5):
  user_dict={}
  # get the top count popular posts
  for p1 in get_popular(tag=tag)[0:count]:
    print 'p1=',{k:v.encode('ascii','ignore') for k,v in p1.items()}
    # find all users who posted this
    for p2 in get_urlposts(p1['url']):
      user=p2['user']
      userasc=user.encode('ascii','ignore')
      if user == userasc and len(user)>1: user_dict[userasc]={}
  return user_dict
In [28]:
#get users from ten urls with 'programming' tag
del_users = initializeUserDict('programming',10)
print len(del_users.keys()),'users'
p1= {'extended': '', 'description': 'Gmail Email Analysis with Neo4j - and Spreadsheets | Architects Zone', 'tags': 'programming', 'url': 'http://architects.dzone.com/articles/gmail-email-analysis-neo4j-and', 'user': '', 'dt': ''}
p1= {'extended': '', 'description': '10 Questions to Ask When Hiring a Mobile App Developer | YoungEntrepreneur.com', 'tags': 'programming', 'url': 'http://www.youngentrepreneur.com/startingup/leadership-qualities-skills/10-questions-to-ask-when-hiring-a-mobile-app-developer/', 'user': '', 'dt': ''}
p1= {'extended': '', 'description': 'The Web API Checklist  43 Things To Think About When Designing, Testing, and Releasing your API | Mathieu Fenniak', 'tags': 'api', 'url': 'http://mathieu.fenniak.net/the-api-checklist/', 'user': '', 'dt': ''}
p1= {'extended': '', 'description': 'www.hyperpolyglot.org', 'tags': 'comparatif', 'url': 'http://www.hyperpolyglot.org/', 'user': '', 'dt': ''}
p1= {'extended': '', 'description': 'image.diku.dk', 'tags': 'programming', 'url': 'http://image.diku.dk/shark/sphinx_pages/build/html/index.html?utm_source=feedly', 'user': 'salloo', 'dt': ''}
p1= {'extended': '', 'description': 'P2PU | Python for Informatics | Chapter 9 - Dictionaries', 'tags': 'python', 'url': 'https://p2pu.org/en/courses/175/content/407/', 'user': '', 'dt': ''}
p1= {'extended': '', 'description': 'Educator  Intermediate Level C++ with Alvin Sylvain', 'tags': 'programming', 'url': 'http://tutolearning.com/educator-intermediate-level-c-with-alvin-sylvain/', 'user': '', 'dt': ''}
p1= {'extended': '', 'description': 'Educator  Introduction to C++ with Alvin Sylvain', 'tags': 'programming', 'url': 'http://tutolearning.com/educator-introduction-to-c-with-alvin-sylvain/', 'user': '', 'dt': ''}
p1= {'extended': '', 'description': 'Code Snippets - Snipplr Social Snippet Repository', 'tags': 'code', 'url': 'http://snipplr.com/', 'user': 'willyfresh', 'dt': ''}
p1= {'extended': '', 'description': 'How We Made GitHub Fast - GitHub', 'tags': 'github', 'url': 'https://github.com/blog/530-how-we-made-github-fast', 'user': '', 'dt': ''}
262 users
In [29]:
print del_users.keys()[:50]
[user for user in del_users.keys() if len(user)<3]
['lorn', 'sibilsalim', 'pixelomatic', 'krapaille', 'todeqiralora', 'kucisaviwy', 'suntzu23', 'jizumabesyr', 'lokosozazuwa', 'clever.netman', 'anonymas', 'kojipyfex', 'arkitekt', 'chamerling', 'arozwalak', 'dyh1919', 'kostas345crew', 'paulokeeffe', 'lovrozitnik', 'msafty', 'jgradim', 'ole1981', 'fenng', 'rkg_mbp', 'forcemajor', 'tomohiro', 'codingeye', 'jpcochran', 'staii', 'gekacebene', 'luisrojascr', 'viktorium', 'bds023', 'wihatofoho', 'roshanbh', 'taluwacerij', 'elvencao', 'bd808', 'deja_rulez', 'igoritl', 'jmatraszek', 'alpsantos', 'mi2195', 'cyberlabe.fr', 'irc', 'salabuzot', 'gareth', 'jwolski', 't.chaffee', 'juanodicio']
Out[29]:
[]
In [30]:
def fillItems(user_dict):
  all_items={}
  # Find links posted by all users initialized in user_dict
  for user in user_dict:
    if len(user)<2: continue  #skip blank users
    for i in range(3):
      try:
        posts=get_userposts(user)
        break
      except:
        print "Failed user "+user+", retrying"
        time.sleep(4)
    for post in posts:
      url=post['url'].encode('ascii','ignore')
      user_dict[user][url]=1.0
      all_items[url]=1
  
  # Fill in missing items with 0
  for ratings in user_dict.values():
    for item in all_items:
      if item not in ratings:
        ratings[item]=0.0
In [31]:
fillItems(del_users)
In [55]:
import random
user=random.choice(del_users.keys())
user
Out[55]:
'frockenstein'
In [56]:
#find users similar to that one
topMatches(del_users,user)
Out[56]:
[('delbook', 0.102),
 ('ole1981', 0.096),
 ('mrragga', 0.096),
 ('hardcod3d', 0.096),
 ('ullu', -0.003)]
In [57]:
#find recommended urls for that user
#recommendation engine for del.icio.us
#look for tags similar to one another
#can also look for people trying to manipulate popular pages by posting same via multiple accounts
#item based filtering
getRecommendations(del_users,user)[:10]
Out[57]:
[('http://cloud.ubuntu.com/2010/11/using-ubuntu-images-on-aws-free-tier/',
  0.262),
 ('http://blog.grayproductions.net/articles/load_an_ec2_gui_on_your_mac_os_x_box',
  0.262),
 ('http://betterexplained.com/articles/how-to-optimize-your-site-with-http-caching/',
  0.262),
 ('http://misoproject.com/dataset/examples/highstockandcsv.html', 0.262),
 ('http://twitter.github.com/bootstrap/', 0.262),
 ('http://flurdy.com/docs/ec2/ubuntu/', 0.262),
 ('http://michaelbushe.wordpress.com/', 0.262),
 ('http://fourkitchens.com/blog/2011/09/20/trigger-jenkins-builds-pushing-github',
  0.262),
 ('http://www.gulli.com/news/kino-to-ra-christian-solmecke-sch-tzt-risiken-f-r-nutzer-ab-2011-06-08',
  0.246),
 ('http://www.cmdbuild.org/en', 0.246)]
In [52]:
url=getRecommendations(delusers,user)[0][0]
print 'url=',url
#url='http://ebookbrowse.com/'
#find urls co-liked hence 'similar'
topMatches(transformPrefs(delusers),url)
http://thecleancoder.blogspot.com/2011/01/transformation-priority-and-sorting.html?m=1
Out[52]:
[(u'http://stackoverflow.com/questions/13132864/circular-tooltip/13137862#13137862',
  1.0),
 (u'https://speakerdeck.com/arthurakay/end-to-end-unit-testing-for-web-developers',
  1.0),
 (u'http://pastebin.com/3RSUzfDk', 1.0),
 (u'https://speakerdeck.com/arthurakay/your-code-sucks-best-practices-for-enterprise-javascript-development',
  1.0),
 (u'http://sergeytihon.wordpress.com/2013/02/28/servicestack-new-api-f-sample-web-service-out-of-a-web-server/',
  1.0)]
In [58]:
#user based filtering can be slow, use item based collaborative filtering
#precompute most similar for each item
#then look at user's top rated items, and create weighted list of items most similar to those
#based on precomputed similarities
#(comparisons between items don't change as fast as new users added)
def calculateSimilarItems(prefs,n=10):
  # Create a dictionary of items showing which other items they are most similar to.
  result={}
  # Invert the preference matrix to be item-centric
  itemPrefs=transformPrefs(prefs)
  c=0
  for item in itemPrefs:
    # Status updates for large datasets
    c+=1
    if c%100==0: print "%d / %d" % (c,len(itemPrefs))
    # Find the most similar items to this one
    scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
    result[item]=scores
  return result
In [59]:
itemsim=calculateSimilarItems(critics)
itemsim
Out[59]:
{'Just My Luck': [('Lady in the Water', 0.222),
  ('You, Me and Dupree', 0.182),
  ('The Night Listener', 0.154),
  ('Snakes on a Plane', 0.105),
  ('Superman Returns', 0.065)],
 'Lady in the Water': [('You, Me and Dupree', 0.4),
  ('The Night Listener', 0.286),
  ('Snakes on a Plane', 0.222),
  ('Just My Luck', 0.222),
  ('Superman Returns', 0.091)],
 'Snakes on a Plane': [('Lady in the Water', 0.222),
  ('The Night Listener', 0.182),
  ('Superman Returns', 0.167),
  ('Just My Luck', 0.105),
  ('You, Me and Dupree', 0.051)],
 'Superman Returns': [('Snakes on a Plane', 0.167),
  ('The Night Listener', 0.103),
  ('Lady in the Water', 0.091),
  ('Just My Luck', 0.065),
  ('You, Me and Dupree', 0.053)],
 'The Night Listener': [('Lady in the Water', 0.286),
  ('Snakes on a Plane', 0.182),
  ('Just My Luck', 0.154),
  ('You, Me and Dupree', 0.148),
  ('Superman Returns', 0.103)],
 'You, Me and Dupree': [('Lady in the Water', 0.4),
  ('Just My Luck', 0.182),
  ('The Night Listener', 0.148),
  ('Superman Returns', 0.053),
  ('Snakes on a Plane', 0.051)]}
In [60]:
#this recommender now uses the precomputed item similarities
def getRecommendedItems(prefs,itemMatch,user):
  userRatings=prefs[user]
  scores={}
  totalSim={}
  # Loop over items rated by this user
  for (item,rating) in userRatings.items( ):

    # Loop over items similar to this one
    for (item2,similarity) in itemMatch[item]:
      # Ignore if this user has already rated this item
      if item2 in userRatings: continue
      # Weighted sum of rating times similarity
      if item2 not in scores:
          scores[item2]=0
          totalSim[item2]=0
      scores[item2]+=similarity*rating
      # Sum of all the similarities
      totalSim[item2]+=similarity

  # Divide each total score by total weighting to get an average
  rankings=[(item,float("%.3f"%(scores[item]/totalSim[item]))) for item in scores]

  return sorted(rankings,key=lambda x:x[1],reverse=True)
In [61]:
#works the same for constant set
getRecommendedItems(critics,itemsim,'Toby')
Out[61]:
[('The Night Listener', 3.185),
 ('Just My Luck', 2.598),
 ('Lady in the Water', 2.473)]
In [62]:
#now get a real movie dataset from movielens, the smallest one
#http://www.grouplens.org/node/12
#http://www.grouplens.org/system/files/ml-100k.zip
#contains
#u.item list of movie ids and titles
#u.data ratings     user id, movie id, rating, timestamp
#stored in ml-100k/ folder

def loadMovieLens(path='ml-100k/'):
  # Get movie titles
  movies={}
  for line in open(path+'u.item'):
    (id,title)=line.split('|')[0:2]
    movies[id]=title
  
  # Load data
  prefs={}
  for line in open(path+'u.data'):
    (user,movieid,rating,ts)=line.split('\t')
    if user not in prefs: prefs[user]={}
    prefs[user][movies[movieid]]=float(rating)
  return prefs
In [63]:
mprefs=loadMovieLens()
In [64]:
len(mprefs),map(len,mprefs.values()[:10])
Out[64]:
(943, [188, 230, 193, 198, 44, 20, 201, 236, 25, 21])
In [79]:
topMatches(transformPrefs(mprefs),'Terminator, The (1984)',n=10)
Out[79]:
[('8 Seconds (1994)', 1.0),
 ('Calendar Girl (1993)', 1.0),
 ('Hurricane Streets (1998)', 1.0),
 ("Ed's Next Move (1996)", 1.0),
 ('Wild Reeds (1994)', 1.0),
 ('Scarlet Letter, The (1926)', 1.0),
 ('Vermin (1998)', 1.0),
 ('Outlaw, The (1943)', 1.0),
 ('Rhyme & Reason (1997)', 1.0),
 ('Beans of Egypt, Maine, The (1994)', 1.0)]
In [80]:
#get recommendations for user 87
getRecommendations(mprefs,'87')[:30]
Out[80]:
[('They Made Me a Criminal (1939)', 5.0),
 ('Santa with Muscles (1996)', 5.0),
 ('Saint of Fort Washington, The (1993)', 5.0),
 ('Entertaining Angels: The Dorothy Day Story (1996)', 5.0),
 ('Marlene Dietrich: Shadow and Light (1996) ', 5.0),
 ('Star Kid (1997)', 5.0),
 ('Great Day in Harlem, A (1994)', 5.0),
 ('Boys, Les (1997)', 5.0),
 ('Legal Deceit (1997)', 4.899),
 ('Letter From Death Row, A (1998)', 4.815),
 ('Hearts and Minds (1996)', 4.733),
 ('Pather Panchali (1955)', 4.697),
 ('Lamerica (1994)', 4.653),
 ('Leading Man, The (1996)', 4.54),
 ('Mrs. Dalloway (1997)', 4.535),
 ('Innocents, The (1961)', 4.532),
 ('Casablanca (1942)', 4.528),
 ('Everest (1998)', 4.51),
 ('Dangerous Beauty (1998)', 4.494),
 ('Wallace & Gromit: The Best of Aardman Animation (1996)', 4.485),
 ('Wrong Trousers, The (1993)', 4.463),
 ('Kaspar Hauser (1993)', 4.452),
 ('Usual Suspects, The (1995)', 4.431),
 ('Maya Lin: A Strong Clear Vision (1994)', 4.429),
 ('Wedding Gift, The (1994)', 4.416),
 ('Affair to Remember, An (1957)', 4.378),
 ('Anna (1996)', 4.376),
 ('As Good As It Gets (1997)', 4.376),
 ('Good Will Hunting (1997)', 4.376),
 ('Close Shave, A (1995)', 4.368)]
In [81]:
#now item based
itemsim=calculateSimilarItems(mprefs,n=50)
100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664
In [85]:
#takes a while to build, but now recommendations instantaneous, and indep
# of number of users. experiment with this dataset...
getRecommendedItems(mprefs,itemsim,'189')[:30]
Out[85]:
[('B. Monkey (1998)', 5.0),
 ('Down Periscope (1996)', 5.0),
 ('Crude Oasis, The (1995)', 5.0),
 ('Rent-a-Kid (1995)', 5.0),
 ('House Party 3 (1994)', 5.0),
 ('Leading Man, The (1996)', 5.0),
 ('Mallrats (1995)', 5.0),
 ('How to Be a Player (1997)', 5.0),
 ('Party Girl (1995)', 5.0),
 ('Sneakers (1992)', 5.0),
 ('Philadelphia Story, The (1940)', 5.0),
 ('Swiss Family Robinson (1960)', 5.0),
 ('Gigi (1958)', 5.0),
 ('Love! Valour! Compassion! (1997)', 5.0),
 ('Treasure of the Sierra Madre, The (1948)', 5.0),
 ('Police Story 4: Project S (Chao ji ji hua) (1993)', 5.0),
 ('Curdled (1996)', 5.0),
 ('B*A*P*S (1997)', 5.0),
 ('Mimic (1997)', 5.0),
 ('Prophecy, The (1995)', 5.0),
 ('Stranger in the House (1997)', 5.0),
 ('Dead Presidents (1995)', 5.0),
 ('Man of the House (1995)', 5.0),
 ('Power 98 (1995)', 5.0),
 ('Renaissance Man (1994)', 5.0),
 ('Coldblooded (1995)', 5.0),
 ('Killer (Bulletproof Heart) (1994)', 5.0),
 ("Wes Craven's New Nightmare (1994)", 5.0),
 ('Original Gangstas (1996)', 5.0),
 ("Marvin's Room (1996)", 5.0)]
In [ ]: