import sqlite3
import pandas as pd
import numpy as np
# Shared connection to the movie/staff SQLite database used by every query below.
conn = sqlite3.connect('./records.db')
数据挖掘是一种为了从海量数据中提取有价值信息而产生的数据处理技术。数据挖掘通过结合传统的数据分析方法和处理大数据的复杂算法,从数据中提炼出辅助决策的信息。数据挖掘的定义可以分别从技术层和商业层来看:
而数据挖掘与传统的数据分析方法也有着本质的区别:数据挖掘是在没有明确假设的前提下挖掘信息和发现知识。因而数据挖掘得到的信息具有先前未知、有效和实用三个特征。
下面将介绍如何应用数据挖掘技术从影片数据中提取票房相关的信息。
票房是电影受大众欢迎程度的一个主要衡量标准。而一部电影是否卖座,除了和电影本身剧情、拍摄技术有关之外,还会和演员导演阵容、影片上映时间等因素相关。本报告尝试通过对历史电影票房及相关信息进行分析,来预测新电影的票房高低。
在本问题中,数据集是一系列电影,数据对象是一条电影记录;对电影票房的预测转化为一个分类问题。
为了更加方便量化影片票房属性,我们可以将票房收入划分为高、中、低三档:
影片名称 | 影片票房(国内、美元) | 票房收入水平 |
---|---|---|
Avatar (阿凡达) | 760507648 | 高 |
The Zero Theorem (零点定理) | 257706 | 中 |
Video Games: The Movie (电子游戏大电影) | 23043 | 低 |
而对于影片对象的描述,可以采用如下属性:
- release_season(枚举属性,可选值有:spring / summer / autumn / winter)
- genre(枚举属性)
- staffs(枚举属性,可选值有:first-class / professional / amateur)

接下来就可以通过上述属性来对影片票房水平进行分类预测。
# Movie record example: the single row for "Avatar".
pd.read_sql('select * from movie where title = "Avatar"', conn)
id | title | genre | release_date | budget | box_office_domestic | box_office_foreign | url | imdb_profile_url | source | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1502 | Avatar | Action | 2009-12-18 00:00:00 | 425000000 | 760507648 | 0 | http://www.the-numbers.com/movie/Avatar#tab=su... | http://www.imdb.com/title/tt0499549/ | www.the-numbers.com |
# Staff record example: the first two rows of the staff table.
pd.read_sql('select * from staff limit 2', conn)
id | name | |
---|---|---|
0 | 1 | Jonathan Parker |
1 | 2 | York Alec Shackleton |
# Total movie and staff counts; the Chinese SQL aliases become column names.
movies_count = pd.read_sql('select count(*) as 影片数量 from movie', conn)
staffs_count = pd.read_sql('select count(*) as 职员数量 from staff', conn)
# Merge both counts into a single one-row frame for display.
movies_count['职员数量'] = staffs_count['职员数量']
movies_count
影片数量 | 职员数量 | |
---|---|---|
0 | 22292 | 10552 |
为了去除因为经济环境等第三方因素影响,实验中采用以下的数据:
# Training set size: 2005-2013 releases with domestic box office above 10000 USD.
sample_movies_count = pd.read_sql('select count(*) as 训练集影片数 from movie where release_date >= "2005-01-01" and release_date <= "2013-12-31" and box_office_domestic > 10000', conn)
# Test set size: 2014 releases under the same box-office floor.
query_movies_count = pd.read_sql('select count(*) as 测试集影片数 from movie where release_date >= "2014-01-01" and release_date <= "2014-12-31" and box_office_domestic > 10000', conn)
sample_movies_count['测试集影片数'] = query_movies_count['测试集影片数']
sample_movies_count
训练集影片数 | 测试集影片数 | |
---|---|---|
0 | 4570 | 518 |
上述原始数据中,影片类型(genre)的分类已由 the-numbers.com 完成。
而发行季度则按照以下方式划分:
- release_date 月份为 11, 12, 1,则该影片发行季节为冬季;
- release_date 月份为 2, 3, 4,则该影片发行季节为春季;
- release_date 月份为 5, 6, 7,则该影片发行季节为夏季;
- release_date 月份为 8, 9, 10,则该影片发行季节为秋季。

影片制作人员水平按照以下规则划分:
先为每位职员 s_i 计算 avg(s_i),再按 avg(m_i) = sum(avg(s_i) for s_i in movie['staff']) / len(movie['staff']) 求得影片的 avg(m_i)。
因为测量数据集中的电影都已经上映,所以可以预先计算对应的票房水平并在后面用于检验:
通过上述方法即可完成对影片属性的标记:
# Labelled attributes joined with the raw movie row, for the Avatar example (movie_id 1502).
pd.read_sql('select movie_property.*, movie.title, movie.box_office_domestic, movie.release_date from movie_property join movie on movie.id = movie_id where movie_id = 1502', conn)
id | movie_id | release_season | genre | staffs | gross_level | title | box_office_domestic | release_date | |
---|---|---|---|---|---|---|---|---|---|
0 | 337 | 1502 | winter | action | first-class | high | Avatar | 760507648 | 2009-12-18 00:00:00 |
本实验中采用以下 3 种算法来进行分类:
对于每个算法的分类结果,采用以下方法判定准确度:
下面详细介绍各个算法的实现。
# Training partition of the labelled movies.
def get_sample_movies():
    """Load the training rows (id < 4571) of movie_property as a DataFrame."""
    query = 'select * from movie_property where id < 4571'
    return pd.read_sql(query, conn)
# Test partition of the labelled movies.
def get_query_movies():
    """Load the test rows (id > 4570) of movie_property as a DataFrame."""
    query = 'select * from movie_property where id > 4570'
    return pd.read_sql(query, conn)
# Attribute columns used as classification features.
basic_properties = ['release_season', 'genre', 'staffs']
# The possible box-office levels (class labels).
gross_level = ['high', 'medium', 'low']
# Accuracy reporting helper shared by all three classifiers.
def describe_algo(df):
    """Print the misclassification rate of *df*'s predicted_gross_level column."""
    mismatched = df[df['gross_level'] != df['predicted_gross_level']]
    error_rate = mismatched.count() / df.count()
    print('错误率: {0}'.format(error_rate['gross_level']))
#%%timeit -n 1
# Data sets for the ID3 decision-tree experiment.
id3_samples = get_sample_movies()
id3_queries = get_query_movies()
# Shannon entropy of a value distribution.
def entropy(df, field):
    """Return the entropy (in bits) of the distribution of *field* within *df*."""
    proportions = df.groupby(field).count() / len(df)
    terms = proportions.apply(lambda p: - p * np.log2(p))
    first_column = terms.keys()[0]
    return terms[first_column].values.sum()
# Size-weighted entropy of target_field after splitting on field
# (i.e. the conditional entropy H(target_field | field)).
def entropy_with_fields(df, field, target_field):
    """Return the weighted sum of each split group's entropy of *target_field*."""
    total = len(df)
    def weighted_group_entropy(group):
        return entropy(group, target_field) * len(group) / total
    return df.groupby(field).agg(weighted_group_entropy).sum()[0]
def select_best_field(df, selectable_fields, target_field):
    """Pick the attribute whose split gives the lowest conditional entropy.

    Minimising H(target_field | field) is equivalent to maximising the
    information gain, since H(target_field) is constant across candidates.
    Returns None when no selectable field is present in *df*.
    """
    candidates = [col for col in df if col in selectable_fields]
    # float('inf') is the idiomatic "larger than anything" sentinel
    # (the original used the magic constant 2 << 30).
    best_field, best_entropy = None, float('inf')
    for field in candidates:
        cond_entropy = entropy_with_fields(df, field, target_field)
        if cond_entropy < best_entropy:
            best_field, best_entropy = field, cond_entropy
    return best_field
def split_with_field(df, field):
    """Partition *df* into {value: sub-frame} keyed by the values of *field*."""
    return {value: df[df[field] == value]
            for value in df.groupby(field).groups.keys()}
# Internal decision-tree node: tests one attribute, one child per value.
class ParentNode(object):
    """An internal node that branches on the value of a single attribute."""

    def __init__(self, field):
        self.field = field      # attribute this node splits on
        self.children = {}      # attribute value -> child node

    def add_child(self, field_value, child_node):
        """Attach *child_node* as the subtree followed when field == field_value."""
        self.children[field_value] = child_node
class LeaveNode(object):
    """Terminal decision-tree node carrying the predicted class value."""

    def __init__(self, value):
        # Predicted target value for every query that reaches this leaf.
        self.value = value
def all_same(columns):
    """Return True when every entry of *columns* equals the first (or it is empty)."""
    if len(columns) < 1:
        return True
    first = columns.values[0]
    return all(value == first for value in columns)
def build_tree(df, selectable_fields, target_field):
    """Recursively build an ID3 decision tree over *df*.

    Recursion stops when the target column is homogeneous or no attributes
    remain; a leaf labelled with the most frequent target value is emitted.
    """
    if all_same(df[target_field]) or not selectable_fields:
        # FIXME why can't filter into a same value?
        majority_value = df[target_field].value_counts().index[0]
        return LeaveNode(majority_value)
    split_field = select_best_field(df, selectable_fields, target_field)
    node = ParentNode(split_field)
    remaining_fields = [f for f in selectable_fields if f != split_field]
    for value, subset in split_with_field(df, split_field).items():
        node.add_child(value, build_tree(subset, remaining_fields, target_field))
    return node
def decide(root, query):
    """Walk the decision tree to classify *query* (a mapping of attribute -> value).

    Returns the leaf's class value, or 'N/A' when the query carries an
    attribute value never seen during training (no matching child branch).
    """
    if isinstance(root, LeaveNode):
        return root.value
    field_value = query[root.field]
    if field_value not in root.children:
        return 'N/A'
    # Reuse field_value instead of re-reading query[root.field] a second time.
    return decide(root.children[field_value], query)
# Build the decision tree from the training set.
root = build_tree(id3_samples, basic_properties, 'gross_level')
# Predict a gross level for every test movie and report the error rate.
id3_queries['predicted_gross_level'] = [decide(root, m) for i, m in id3_queries.iterrows()]
describe_algo(id3_queries)
错误率: 0.3938223938223938
在算法中, K = 5
#%%timeit -n 1
# Number of neighbours considered by k-NN.
K = 5
# Data sets for the k-NN experiment.
knn_samples = get_sample_movies()
knn_queries = get_query_movies()
# All features are categorical, so Hamming distance is used.
def distance(ma, mb):
    """Return the number of basic properties on which *ma* and *mb* differ."""
    return sum(1 for prop in basic_properties if ma[prop] != mb[prop])
from collections import defaultdict
def knn(query, samples, k, target_property):
    """Classify *query* by majority vote among its k nearest samples.

    Returns (predicted_value, {value: vote_share}); predicted_value is None
    when *samples* is empty.
    """
    measure = distance  # Hamming distance over the categorical features
    ranked = sorted(((measure(query, sample), sample[target_property])
                     for sample in samples),
                    key=lambda pair: pair[0])
    k = min(k, len(ranked))
    votes = defaultdict(lambda: 0.0)
    for _, label in ranked[:k]:
        votes[label] += 1
    vote_shares = {label: count / k for label, count in votes.items()}
    winner, best_count = None, float('-inf')
    for label, count in votes.items():
        if count > best_count:
            best_count, winner = count, label
    return winner, vote_shares
# Materialise the sample rows once so every k-NN query reuses the same list.
knn_movies_samples = [m for _, m in knn_samples.iterrows()]
def knn_many(queries, k=None):
    """Predict a gross_level for every row of *queries* via k-NN (default K)."""
    neighbours = k or K
    predictions = []
    for _, movie in queries.iterrows():
        predictions.append(knn(movie, knn_movies_samples, neighbours, 'gross_level')[0])
    return predictions
# Predict a gross level for every test movie and report the error rate.
knn_queries['predicted_gross_level'] = knn_many(knn_queries)
describe_algo(knn_queries)
错误率: 0.45366795366795365
#%%timeit -n 1
# Data sets for the naive Bayes experiment.
nb_samples = get_sample_movies()
nb_queries = get_query_movies()
def p_key(pairs):
    """Serialise (field, value) pairs into a stable string key for the P table."""
    parts = ['{0}-{1}'.format(field, value) for field, value in pairs]
    return '_'.join(parts)
# Probability table: maps serialised (field, value) keys to estimated probabilities.
P = {}
gross_level_groups = nb_samples.groupby('gross_level').groups
# Prior probabilities P(gross_level = high / medium / low).
P[p_key([('gross_level', 'high')])] = len(gross_level_groups['high']) / len(nb_samples)
P[p_key([('gross_level', 'medium')])] = len(gross_level_groups['medium']) / len(nb_samples)
P[p_key([('gross_level', 'low')])] = len(gross_level_groups['low']) / len(nb_samples)
def calculate_properity(field):
    """Store P(field=value, gross_level=level) in the global P table.

    Note these are JOINT probabilities (count of rows matching both
    conditions over the total), one entry per observed value/level pair.
    """
    total = len(nb_samples)
    for value in nb_samples.groupby(field).groups.keys():
        for level in gross_level:
            mask = (nb_samples[field] == value) & (nb_samples['gross_level'] == level)
            matching = nb_samples[mask]
            P[p_key([(field, value), ('gross_level', level)])] = len(matching) / total
# Fill P with the joint probabilities for every basic property.
for field in basic_properties:
    calculate_properity(field)
def predict(m, priori_prob=None):
    """Naive Bayes prediction of gross_level for movie record *m*.

    Returns (best_level, {level: score}) where score is proportional to the
    posterior P(level | features); larger is better.

    Bug fix: the original multiplied the stored JOINT probabilities
    P(field=value, level) directly, which implicitly weights the class
    prior once per feature (prior**3) instead of once.  The correct naive
    Bayes score is P(level) * prod_f P(field=value | level), with
    P(field=value | level) = joint / prior.
    """
    probs = priori_prob or P
    prediction, max_level, max_prob = {}, None, 0.0
    for level in gross_level:
        prior = probs.get(p_key([('gross_level', level)]), 0.0)
        score = prior
        for field in basic_properties:
            pk = p_key([(field, m[field]), ('gross_level', level)])
            joint = probs.get(pk, 0.0)
            # Conditional P(field=value | level); 0 for unseen combinations.
            score *= (joint / prior) if prior else 0.0  # TODO use laplace?
        prediction[level] = score
        if score > max_prob:
            max_level, max_prob = level, score
    return max_level, prediction
# Predict a gross level for every test movie and report the error rate.
nb_queries['predicted_gross_level'] = [predict(m)[0] for _, m in nb_queries.iterrows()]
describe_algo(nb_queries)
错误率: 0.39575289575289574
从上述运行结果可以了解到各个算法准确率、运行效率:
算法名称 | 问题规模(查询条数) | 准确率 | 运行时间 |
---|---|---|---|
id3 | 518 | 61% | 1.63 秒 |
k 近邻 | 518 | 56% | 112 秒 |
朴素贝叶斯 | 518 | 61% | 0.261 秒 |
不难发现 3 种算法中 k 近邻算法效率和准确率都是最低。而 id3 算法和朴素贝叶斯算法准确率相当,但朴素贝叶斯算法在执行效率上要比 id3 算法高出不少。
下面是使用算法对最近上映的电影进行票房预测的结果:
属性 | 属性值 |
---|---|
影片名称 | 复仇者联盟 2:奥创时代 |
影片发行季度 | summer |
影片制作人员水平 | first-class |
影片类型 | action |
算法 | 预测结果 |
---|---|
id3 | 高 |
K 近邻 | 高 |
朴素贝叶斯 | 高 |
通过本实验,根据数据挖掘的基本原理,完成了数据收集、数据清洗、数据分析、算法实现和结果分析的完整步骤,加深了我对课程知识的理解。
# Demo: predict the gross level of "Avengers: Age of Ultron" with each algorithm.
decide(root, {'release_season': 'summer', 'genre': 'action', 'staffs':'first-class'})
'high'
knn({'release_season': 'summer', 'genre': 'action', 'staffs':'first-class'}, knn_movies_samples, K, 'gross_level')[0]
'high'
predict({'release_season': 'summer', 'genre': 'action', 'staffs':'first-class'})[0]
'high'