In [1]:

# data "ml-100k" from http://grouplens.org/datasets/movielens/

In [2]:

import os
data_folder = os.path.join(".", "data")
ratings_filename = os.path.join(data_folder, "u.data")

In [3]:

import pandas as pd

In [4]:

all_ratings = pd.read_csv(ratings_filename, delimiter="\t", header=None, names = ["UserID", "MovieID", "Rating", "Datetime"])
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'],unit='s')
all_ratings[:5]

Out[4]:

	UserID	MovieID	Rating	Datetime
0	196	242	3	1997-12-04 15:55:49
1	186	302	3	1998-04-04 19:22:22
2	22	377	1	1997-11-07 07:18:36
3	244	51	2	1997-11-27 05:02:03
4	166	346	1	1998-02-02 05:33:16

In [5]:

# As you can see, there are no review for most movies, such as #213
all_ratings[all_ratings["UserID"] == 675].sort_values("MovieID")  

Out[5]:

	UserID	MovieID	Rating	Datetime
81098	675	86	4	1998-03-10 00:26:14
90696	675	223	1	1998-03-10 00:35:51
92650	675	235	1	1998-03-10 00:35:51
95459	675	242	4	1998-03-10 00:08:42
82845	675	244	3	1998-03-10 00:29:35
53293	675	258	3	1998-03-10 00:11:19
97286	675	269	5	1998-03-10 00:08:07
93720	675	272	3	1998-03-10 00:07:11
73389	675	286	4	1998-03-10 00:07:11
77524	675	303	5	1998-03-10 00:08:42
47367	675	305	4	1998-03-10 00:09:08
44300	675	306	5	1998-03-10 00:08:07
53730	675	311	3	1998-03-10 00:10:47
54284	675	312	2	1998-03-10 00:10:24
63291	675	318	5	1998-03-10 00:21:13
87082	675	321	2	1998-03-10 00:11:48
56108	675	344	4	1998-03-10 00:12:34
53046	675	347	4	1998-03-10 00:07:11
94617	675	427	5	1998-03-10 00:28:11
69915	675	463	5	1998-03-10 00:16:43
46744	675	509	5	1998-03-10 00:24:25
46598	675	531	5	1998-03-10 00:18:28
52962	675	650	5	1998-03-10 00:32:51
94029	675	750	4	1998-03-10 00:08:07
53223	675	874	4	1998-03-10 00:11:19
62277	675	891	2	1998-03-10 00:12:59
77274	675	896	5	1998-03-10 00:09:35
66194	675	900	4	1998-03-10 00:10:24
54994	675	937	1	1998-03-10 00:35:51
61742	675	1007	4	1998-03-10 00:25:22
49225	675	1101	4	1998-03-10 00:33:49
50692	675	1255	1	1998-03-10 00:35:51
74202	675	1628	5	1998-03-10 00:30:37
47866	675	1653	5	1998-03-10 00:31:53

In [6]:

# Not all reviews are favourable! Our goal is "other recommended books", so we only want favourable reviews
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[10:15]

Out[6]:

	UserID	MovieID	Rating	Datetime	Favorable
10	62	257	2	1997-11-12 22:07:14	False
11	286	1014	5	1997-11-17 15:38:45	True
12	200	222	5	1997-10-05 09:05:40	True
13	210	40	3	1998-03-27 21:59:54	False
14	224	29	3	1998-02-21 23:40:57	False

In [7]:

all_ratings[all_ratings["UserID"] == 1][:5]

Out[7]:

	UserID	MovieID	Rating	Datetime	Favorable
202	1	61	4	1997-11-03 07:33:40	True
305	1	189	3	1998-03-01 06:15:28	False
333	1	33	4	1997-11-03 07:38:19	True
334	1	160	4	1997-09-24 03:42:27	True
478	1	20	4	1998-02-14 04:51:23	True

In [8]:

# Sample the dataset. You can try increasing the size of the sample, but the run time will be considerably longer
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]  # & ratings["UserID"].isin(range(100))]

In [9]:

# We start by creating a dataset of each user's favourable reviews
favorable_ratings = ratings[ratings["Favorable"]]
favorable_ratings[:5]

Out[9]:

	UserID	MovieID	Rating	Datetime	Favorable
16	122	387	5	1997-11-11 17:47:39	True
20	119	392	4	1998-01-30 16:13:34	True
21	167	486	4	1998-04-16 14:54:12	True
26	38	95	5	1998-04-13 01:14:54	True
28	63	277	4	1997-10-01 23:10:01	True

In [10]:

# We are only interested in the reviewers who have more than one review
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])
len(favorable_reviews_by_users)

Out[10]:

In [11]:

# Find out how many movies have favourable ratings
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]

Out[11]:

	Favorable
MovieID
50	100.0
100	89.0
258	83.0
181	79.0
174	74.0

In [12]:

from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [13]:

import sys
frequent_itemsets = {}  # itemsets are sorted by length
min_support = 50

# k=1 candidates are the isbns with more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

There are 16 movies with more than 50 favorable reviews
I found 93 frequent itemsets of length 2
I found 295 frequent itemsets of length 3
I found 593 frequent itemsets of length 4
I found 785 frequent itemsets of length 5
I found 677 frequent itemsets of length 6
I found 373 frequent itemsets of length 7
I found 126 frequent itemsets of length 8
I found 24 frequent itemsets of length 9
I found 2 frequent itemsets of length 10
Did not find any frequent itemsets of length 11

In [14]:

print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

Found a total of 2968 frequent itemsets

In [15]:

# Now we create the association rules. First, they are candidates until the confidence has been tested
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print("There are {} candidate rules".format(len(candidate_rules)))

There are 15285 candidate rules

In [16]:

print(candidate_rules[:5])

[(frozenset({79}), 258), (frozenset({258}), 79), (frozenset({50}), 64), (frozenset({64}), 50), (frozenset({127}), 181)]

In [17]:

# Now, we compute the confidence of each of these rules. This is very similar to what we did in chapter 1
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

In [18]:

# Choose only rules above a minimum confidence level
min_confidence = 0.9

In [19]:

# Filter out the rules with poor confidence
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}
print(len(rule_confidence))

In [20]:

from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

In [21]:

for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: 评论了 {0} 的人，他也会评论 {1}".format(premise, conclusion))
    print(" - 置信度Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: 评论了 frozenset({64, 98, 56, 50, 7}) 的人，他也会评论 174
 - 置信度Confidence: 1.000

Rule #2
Rule: 评论了 frozenset({98, 100, 172, 79, 50, 56}) 的人，他也会评论 7
 - 置信度Confidence: 1.000

Rule #3
Rule: 评论了 frozenset({98, 172, 181, 174, 7}) 的人，他也会评论 50
 - 置信度Confidence: 1.000

Rule #4
Rule: 评论了 frozenset({64, 98, 100, 7, 172, 50}) 的人，他也会评论 174
 - 置信度Confidence: 1.000

Rule #5
Rule: 评论了 frozenset({64, 1, 7, 172, 79, 50}) 的人，他也会评论 181
 - 置信度Confidence: 1.000

In [22]:

# Even better, we can get the movie titles themselves from the dataset
movie_name_filename = os.path.join(data_folder, "u.item")
movie_name_data = pd.read_csv(movie_name_filename, delimiter="|", header=None, encoding = "mac-roman")
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "<UNK>", "Action", "Adventure",
                           "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
                           "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

In [23]:

def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [24]:

get_movie_name(4)

Out[24]:

'Get Shorty (1995)'

In [25]:

for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: 评论了 {0} 的人，他也会评论 {1}".format(premise_names, conclusion_name))
    print(" - 置信度Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) 的人，他也会评论 Raiders of the Lost Ark (1981)
 - 置信度Confidence: 1.000

Rule #2
Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) 的人，他也会评论 Twelve Monkeys (1995)
 - 置信度Confidence: 1.000

Rule #3
Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人，他也会评论 Star Wars (1977)
 - 置信度Confidence: 1.000

Rule #4
Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) 的人，他也会评论 Raiders of the Lost Ark (1981)
 - 置信度Confidence: 1.000

Rule #5
Rule: 评论了 Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) 的人，他也会评论 Return of the Jedi (1983)
 - 置信度Confidence: 1.000

In [26]:

# Evaluation using test data
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset["Favorable"]]
#test_not_favourable = test_dataset[~test_dataset["Favourable"]]
test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby("UserID")["MovieID"])
#test_not_favourable_by_users = dict((k, frozenset(v.values)) for k, v in test_not_favourable.groupby("UserID")["MovieID"])
#test_users = test_dataset["UserID"].unique()

In [27]:

test_dataset[:5]

Out[27]:

	UserID	MovieID	Rating	Datetime	Favorable
3	244	51	2	1997-11-27 05:02:03	False
5	298	474	4	1998-01-07 14:20:06	True
7	253	465	5	1998-04-03 18:34:27	True
8	305	451	3	1998-02-01 09:20:17	False
11	286	1014	5	1997-11-17 15:38:45	True

In [28]:

correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [29]:

test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in rule_confidence}
print(len(test_confidence))

In [30]:

sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)
print(sorted_test_confidence[:5])

[((frozenset({64, 1, 7, 172, 79, 50}), 174), 1.0), ((frozenset({64, 258, 98, 7, 174, 181}), 172), 1.0), ((frozenset({64, 1, 98, 7, 79, 181, 56}), 174), 1.0), ((frozenset({64, 1, 98, 7, 172, 79, 181}), 174), 1.0), ((frozenset({64, 258, 98, 7, 174, 50, 181}), 172), 1.0)]

In [31]:

for index in range(10):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: 评论了 {0} 的人，他也会评论 {1}".format(premise_names, conclusion_name))
    print(" - 训练集上的置信度: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print(" - 测试集上的置信度: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    print("")

Rule #1
Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) 的人，他也会评论 Raiders of the Lost Ark (1981)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.909

Rule #2
Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) 的人，他也会评论 Twelve Monkeys (1995)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.609

Rule #3
Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人，他也会评论 Star Wars (1977)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.946

Rule #4
Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) 的人，他也会评论 Raiders of the Lost Ark (1981)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.971

Rule #5
Rule: 评论了 Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) 的人，他也会评论 Return of the Jedi (1983)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.900

Rule #6
Rule: 评论了 Toy Story (1995), Silence of the Lambs, The (1991), Fargo (1996), Raiders of the Lost Ark (1981), Godfather, The (1972) 的人，他也会评论 Pulp Fiction (1994)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.750

Rule #7
Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Godfather, The (1972), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人，他也会评论 Shawshank Redemption, The (1994)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.854

Rule #8
Rule: 评论了 Pulp Fiction (1994), Toy Story (1995), Shawshank Redemption, The (1994), Godfather, The (1972) 的人，他也会评论 Silence of the Lambs, The (1991)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.870

Rule #9
Rule: 评论了 Shawshank Redemption, The (1994), Fargo (1996), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Fugitive, The (1993) 的人，他也会评论 Pulp Fiction (1994)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.756

Rule #10
Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Raiders of the Lost Ark (1981), Fugitive, The (1993), Star Wars (1977), Return of the Jedi (1983) 的人，他也会评论 Pulp Fiction (1994)
 - 训练集上的置信度: 1.000
 - 测试集上的置信度: 0.756

In [ ]: