The approach:
How we measure a rule:
support := the number of people who bought both X and Y (i.e. the number of samples where the rule applies)
confidence := $\frac{\text{number of people who bought both X and Y}}{\text{number of people who bought X}}$
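A made-up illustration of the two measures: suppose 40 of 100 customers bought apples, and 25 of those 40 also bought bananas. For the rule "bought apples, also bought bananas", the support is 25 and the confidence is $\frac{25}{40} = 0.625$, i.e. 62.5%.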
# Imports
import numpy as np
from operator import itemgetter

# Prepare the data
# Generate a random dataset (you can skip this step)
X = np.zeros((100, 5), dtype='bool')
for i in range(X.shape[0]):
    if np.random.random() < 0.3:
        # A bread winner
        X[i][0] = 1
        if np.random.random() < 0.5:
            # Who likes milk
            X[i][1] = 1
        if np.random.random() < 0.2:
            # Who likes cheese
            X[i][2] = 1
        if np.random.random() < 0.25:
            # Who likes apples
            X[i][3] = 1
        if np.random.random() < 0.5:
            # Who likes bananas
            X[i][4] = 1
    else:
        # Not a bread winner
        if np.random.random() < 0.5:
            # Who likes milk
            X[i][1] = 1
            if np.random.random() < 0.2:
                # Who likes cheese
                X[i][2] = 1
            if np.random.random() < 0.25:
                # Who likes apples
                X[i][3] = 1
            if np.random.random() < 0.5:
                # Who likes bananas
                X[i][4] = 1
        else:
            if np.random.random() < 0.8:
                # Who likes cheese
                X[i][2] = 1
            if np.random.random() < 0.6:
                # Who likes apples
                X[i][3] = 1
            if np.random.random() < 0.7:
                # Who likes bananas
                X[i][4] = 1
    if X[i].sum() == 0:
        X[i][4] = 1  # Must buy something, so gets bananas
np.savetxt("./data/affinity_dataset.txt", X, fmt='%d')  # save to disk
# Load the data
dataset_filename = "./data/affinity_dataset.txt"
X = np.loadtxt(dataset_filename)  # load the dataset
n_samples, n_features = X.shape
print(X.shape)
print(X[:5])
(100, 5)
[[0. 0. 1. 1. 0.]
 [1. 1. 0. 0. 0.]
 [1. 0. 0. 1. 1.]
 [0. 1. 1. 0. 1.]
 [0. 1. 0. 0. 0.]]
The rule we define is: people who bought apples also bought bananas.
Below, rule_valid counts how many people bought both apples and bananas.
The support is the number of people for whom the rule holds, so support = rule_valid,
and the confidence is the support divided by the number of people who bought apples.
# affinity_dataset.txt is generated data, so we have to name the columns ourselves
features = ["bread", "milk", "cheese", "apples", "bananas"]
num_apple_purchases = 0  # counter
for sample in X:
    if sample[3] == 1:  # count how many people bought apples
        num_apple_purchases += 1
print("{0} people bought apples".format(num_apple_purchases))

rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] == 1:  # bought apples
        if sample[4] == 1:  # and also bought bananas
            rule_valid += 1
        else:  # but did not buy bananas
            rule_invalid += 1
print("{0} people bought both apples and bananas".format(rule_valid))
print("{0} people bought apples but not bananas".format(rule_invalid))

# Compute the support and confidence
support = rule_valid  # the support is the number of people matching the rule "bought apples and bananas"
confidence = rule_valid / num_apple_purchases
print("Support = {0}, confidence = {1:.3f}.".format(support, confidence))
# Confidence as a percentage
print("Confidence as a percentage: {0:.1f}%.".format(100 * confidence))
39 people bought apples
23 people bought both apples and bananas
16 people bought apples but not bananas
Support = 23, confidence = 0.590.
Confidence as a percentage: 59.0%.
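As an aside, the same counts can be read off without the explicit loops by using NumPy boolean masks over the columns. This is a minimal equivalent sketch, not part of the original walkthrough:

# Column 3 is apples, column 4 is bananas (see the features list above)
bought_apples = X[:, 3] == 1
bought_both = bought_apples & (X[:, 4] == 1)
bought_apples_only = bought_apples & (X[:, 4] == 0)
print(bought_apples.sum(), bought_both.sum(), bought_apples_only.sum())  # 39 23 16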
from collections import defaultdict
# "Bought apples, also bought bananas" was just one rule; now compute every possible rule
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)
for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            continue
        # The premise item was bought; premise stands for one product, call it X
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:
                continue  # skip rules of the form "bought X, also bought X"
            if sample[conclusion] == 1:  # the conclusion item, call it Y, was also bought
                valid_rules[(premise, conclusion)] += 1  # bought X and Y
            else:
                invalid_rules[(premise, conclusion)] += 1  # bought X but not Y
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: bought {0}, also bought {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")
Rule: bought cheese, also bought apples - Confidence: 0.553 - Support: 26
Rule: bought apples, also bought cheese - Confidence: 0.667 - Support: 26
Rule: bought bread, also bought milk - Confidence: 0.619 - Support: 13
Rule: bought milk, also bought bread - Confidence: 0.265 - Support: 13
Rule: bought bread, also bought apples - Confidence: 0.286 - Support: 6
Rule: bought bread, also bought bananas - Confidence: 0.476 - Support: 10
Rule: bought apples, also bought bread - Confidence: 0.154 - Support: 6
Rule: bought apples, also bought bananas - Confidence: 0.590 - Support: 23
Rule: bought bananas, also bought bread - Confidence: 0.185 - Support: 10
Rule: bought bananas, also bought apples - Confidence: 0.426 - Support: 23
Rule: bought milk, also bought cheese - Confidence: 0.204 - Support: 10
Rule: bought milk, also bought bananas - Confidence: 0.429 - Support: 21
Rule: bought cheese, also bought milk - Confidence: 0.213 - Support: 10
Rule: bought cheese, also bought bananas - Confidence: 0.532 - Support: 25
Rule: bought bananas, also bought milk - Confidence: 0.389 - Support: 21
Rule: bought bananas, also bought cheese - Confidence: 0.463 - Support: 25
Rule: bought bread, also bought cheese - Confidence: 0.238 - Support: 5
Rule: bought cheese, also bought bread - Confidence: 0.106 - Support: 5
Rule: bought milk, also bought apples - Confidence: 0.184 - Support: 9
Rule: bought apples, also bought milk - Confidence: 0.231 - Support: 9
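Note that support is kept as a raw count here. Many references instead define support as a fraction of all transactions; if you prefer that convention, it is a one-line change (a hypothetical variant, not used elsewhere in this walkthrough):

# Support as a fraction of the n_samples transactions instead of a raw count
support_fraction = {rule: count / n_samples for rule, count in valid_rules.items()}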
# Wrap the printing in a function for convenient reuse
def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: bought {0}, also bought {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)
Rule: bought milk, also bought apples - Confidence: 0.184 - Support: 9
# Rank the rules by support
from pprint import pprint
pprint(list(support.items()))
[((2, 3), 26), ((3, 2), 26), ((0, 1), 13), ((1, 0), 13), ((0, 3), 6), ((0, 4), 10), ((3, 0), 6), ((3, 4), 23), ((4, 0), 10), ((4, 3), 23), ((1, 2), 10), ((1, 4), 21), ((2, 1), 10), ((2, 4), 25), ((4, 1), 21), ((4, 2), 25), ((0, 2), 5), ((2, 0), 5), ((1, 3), 9), ((3, 1), 9)]
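The pprint call above only dumps the raw (rule, count) pairs; to actually rank the rules by support, sort the pairs by their second element, reusing the print_rule helper defined above (a sketch of what that sort could look like, mirroring the confidence ranking that follows):

sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
for index in range(5):  # top 5 rules by support
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

The ranking by confidence below follows exactly the same pattern.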
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):  # print the top 5 rules by confidence
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)
Rule #1: bought apples, also bought cheese - Confidence: 0.667 - Support: 26
Rule #2: bought bread, also bought milk - Confidence: 0.619 - Support: 13
Rule #3: bought apples, also bought bananas - Confidence: 0.590 - Support: 23
Rule #4: bought cheese, also bought apples - Confidence: 0.553 - Support: 26
Rule #5: bought cheese, also bought bananas - Confidence: 0.532 - Support: 25
Features: the four iris measurements, each discretized to 0 or 1 by comparing it against that feature's mean (see below).
Algorithm (OneR): for each feature, and for each value that feature takes, find the most frequent class and count the misclassifications; sum those errors over the feature's values, then keep the single feature with the smallest total error as the model.
In plain words: "across all samples, this feature value pointed to class A 6 times out of 10, so we predict class A for every sample with that value" is a rule read straight off the data (and it would be wrong the other 4 times out of 10).
Such a rule is clearly too simple. The experiment below shows it only reaches an accuracy of about 60%, which still beats random guessing: picking one of the three balanced classes at random would be right only about 33% of the time!
from sklearn.datasets import load_iris
# X, y = np.loadtxt("X_classification.txt"), np.loadtxt("y_classification.txt")  # load from local files (I downloaded them into the data folder beforehand)
dataset = load_iris()  # or download the data yourself and then load it
X = dataset.data
y = dataset.target
print(dataset.DESCR)  # print the dataset description
n_samples, n_features = X.shape
Iris Plants Database
====================

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris datasets.
http://archive.ics.uci.edu/ml/datasets/Iris

The famous Iris database, first used by Sir R.A Fisher

This is perhaps the best known database to be found in the pattern recognition
literature. Fisher's paper is a classic in the field and is referenced
frequently to this day. (See Duda & Hart, for example.) The data set contains
3 classes of 50 instances each, where each class refers to a type of iris
plant. One class is linearly separable from the other 2; the latter are NOT
linearly separable from each other.

References
----------
   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments". IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...
# Compute the mean of each attribute
attribute_means = X.mean(axis=0)
assert attribute_means.shape == (n_features,)
# Discretize: each value becomes 1 if it is at or above its feature's mean, else 0
X_d = np.array(X >= attribute_means, dtype='int')
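To see what the comparison with attribute_means does, here is a tiny made-up example of the same NumPy broadcasting trick (not part of the original notebook):

demo = np.array([[1.0, 10.0],
                 [2.0, 20.0],
                 [6.0, 30.0]])
# Column means are [3.0, 20.0]; each entry becomes 1 if it is >= its column's mean
print(np.array(demo >= demo.mean(axis=0), dtype='int'))
# [[0 0]
#  [0 1]
#  [1 1]]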
# Split into training and testing sets
# (the book imports from sklearn.cross_validation, which was deprecated in 0.18 and removed in 0.20)
from sklearn.model_selection import train_test_split

# Fix the random seed so the book's results can be reproduced
random_state = 14

X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
print("There are {} training samples".format(y_train.shape[0]))
print("There are {} testing samples".format(y_test.shape[0]))
There are 112 training samples
There are 38 testing samples
from collections import defaultdict
from operator import itemgetter

def train(X, y_true, feature):
    """Computes the predictors and error for a given feature using the OneR algorithm

    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a sample, each
        column is a feature.
    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to X, such
        that y_true[i] is the class value for sample X[i].
    feature: int
        An integer corresponding to the index of the variable we wish to test.
        0 <= feature < n_features

    Returns
    -------
    predictors: dictionary of tuples: (value, prediction)
        For each item in the array, if the variable has a given value, make the given
        prediction.
    error: float
        The ratio of training data that this rule incorrectly predicts.
    """
    # 1. Some variables we will need below (the data's shape is as described above)
    n_samples, n_features = X.shape
    assert 0 <= feature < n_features
    # The distinct values this feature takes (0 and 1 after discretization)
    values = set(X[:, feature])
    predictors = dict()
    errors = []
    # 2. The algorithm itself (compare with the outline above);
    # the feature is given, passed in as a function argument
    for current_value in values:
        # For each value, find the class it most often co-occurs with
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)
        # Prediction: the most frequently matched class, i.e. if this value pointed
        # to class A in 6 of 10 samples, predict class A whenever we see this value
        predictors[current_value] = most_frequent_class
        errors.append(error)
    # Sum the per-value errors to get the total error of this feature's rule
    # (sum() adds a list up: sum([1, 2, 3]) == 1 + 2 + 3 == 6)
    total_error = sum(errors)
    return predictors, total_error
# Compute what our predictors say each sample is based on its value
#y_predicted = np.array([predictors[sample[feature]] for sample in X])
def train_feature_value(X, y_true, feature, value):
    # Prediction: the class this feature value co-occurs with most often, i.e. if the
    # value pointed to class A in 6 of 10 samples, we predict class A for that value
    # A dictionary to count how often each class appears alongside this value
    class_counts = defaultdict(int)
    # Iterate over every (sample, true class) pair and count the matches
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
    # Sort the counts to get the most frequent class
    # (we treat the most frequent class as the correct prediction)
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    # The error is the number of "wrongly" classified samples: those whose true class
    # is not the most frequent class for this feature value
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error
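As a quick sanity check, train_feature_value can also be called on its own. A hedged example (feature index 2 happens to match the best model found below, but the exact output depends on the data split):

# Majority class and error count when feature 2 (discretized petal length) equals 0
most_frequent, err = train_feature_value(X_train, y_train, feature=2, value=0)
print(most_frequent, err)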
# For each feature, compute all the predictors (the for inside the braces is a Python dict comprehension)
all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# Now choose the best model and save it as "model"
# Sort the features by error
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on feature {0} with error {1:.2f}".format(best_variable, best_error))
# Keep the best model, i.e. the one with the smallest error
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)
The best model is based on feature 2 with error 37.00
{'variable': 2, 'predictor': {0: 0, 1: 2}}
def predict(X_test, model):
    variable = model['variable']
    predictor = model['predictor']
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted

y_predicted = predict(X_test, model)
print(y_predicted)
accuracy = np.mean(y_predicted == y_test) * 100
print("Accuracy on the test set: {:.1f}%".format(accuracy))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))
[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2 2]
Accuracy on the test set: 65.8%
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       0.00      0.00      0.00        13
          2       0.40      1.00      0.57         8

avg / total       0.51      0.66      0.55        38
/home/lxy/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
The accuracy on the test set is 65.8%: the one-rule model does noticeably better than random guessing, which would average only about 33% over three balanced classes.