import keras
import numpy as np
import matplotlib.pyplot as plt
from skimage import transform
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-1-8c431c2143da> in <module>() ----> 1 import keras 2 import numpy as np 3 import matplotlib.pyplot as plt 4 from skimage import transform ModuleNotFoundError: No module named 'keras'
Будем работать с датасетом FashionMNIST.
from keras.datasets import fashion_mnist
# Load Fashion-MNIST through keras and unpack it into the train split
# (X_train, y_train) and the test split (X_test, y_test).
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-2-53802265acd5> in <module>() ----> 1 from keras.datasets import fashion_mnist 2 3 (X_train, y_train), (X_test, y_test) = fashion_mnist.load_data() ModuleNotFoundError: No module named 'keras'
# Preview the first training sample, viewed as a 28x28 image.
plt.imshow(np.reshape(X_train[0], (28, 28)))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-3-adf838c7b916> in <module>() ----> 1 plt.imshow(X_train[0].reshape([28,28])) NameError: name 'plt' is not defined
Будем решать задачу классификации на 10 классов. Каждый класс соответствует одному из типов одежды. Исходная размерность признакового пространства: 784, каждый пиксель является признаком. Будем снижать размерность признакового пространства с помощью метода главных компонент (PCA). Ваша задача — оценить качество решения задачи классификации по метрике accuracy в зависимости от числа главных компонент. Также оцените дисперсию функции качества в зависимости от числа главных компонент.
Нарисуйте график зависимости функции качества и ее дисперсии от числа главных компонент.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
# Baseline run: project a random 10000-sample subset of the training data
# onto 3 principal components, fit logistic regression on the projection and
# score it on the full test set.
used_idx = np.random.choice(np.arange(X_train.shape[0]), 10000, replace=False)
pca = PCA(n_components=3)
X_train_lowdim = pca.fit_transform(X_train[used_idx].reshape(-1, 784))
lr = LogisticRegression().fit(X_train_lowdim, y_train[used_idx])
# Left as a bare expression so the notebook cell displays the accuracy.
accuracy_score(
    y_true=y_test,
    y_pred=lr.predict(pca.transform(X_test.reshape(-1, 784))),
)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-5-bf68e2bd6e85> in <module>() 1 pca = PCA(n_components=3) ----> 2 used_indices = np.random.choice(np.arange(X_train.shape[0]), 10000, replace=False) 3 X_train_lowdim = pca.fit_transform(X_train[used_indices].reshape([-1, 784])) 4 lr = LogisticRegression() 5 lr.fit(X_train_lowdim, y_train[used_indices]) NameError: name 'np' is not defined
Используйте следующую сетку числа главных компонент: [3, 5, 7, 12, 18, 25, 33, 40, 48, 55]. Для ускорения сходимости можете семплировать подвыборки из X_train.
# Length of one flattened sample: 28 x 28 pixels.
dim = 28 * 28
def step(comp, n_samples=10000):
    """Fit PCA + logistic regression on a random training subsample.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``dim``.

    Args:
        comp: Number of principal components passed to ``PCA``.
        n_samples: Size of the random subsample of ``X_train`` used for
            fitting. Clamped to the number of available training samples.

    Returns:
        Labels predicted by the fitted classifier for ``X_test``.
    """
    pca = PCA(n_components=comp)

    n_available = X_train.shape[0]
    # Drawing without replacement raises ValueError when more items are
    # requested than exist, so never ask for more than the data provides.
    n_used = min(n_samples, n_available)
    used_idx = np.random.choice(np.arange(n_available), n_used, replace=False)

    # Flatten every sample to a vector of length `dim` before projecting.
    X_train_lowdim = pca.fit_transform(X_train[used_idx].reshape([-1, dim]))
    lr = LogisticRegression()
    lr.fit(X_train_lowdim, y_train[used_idx])

    # The test set goes through the projection fitted on the subsample.
    return lr.predict(pca.transform(X_test.reshape([-1, dim])))
def get_accuracy(comp):
    """Summarise test accuracy for ``comp`` principal components.

    Calls ``step(comp)`` ten times and scores each prediction against the
    module-level ``y_test`` with ``accuracy_score``.

    Args:
        comp: Number of principal components forwarded to ``step``.

    Returns:
        Tuple ``(mean, variance)`` of the ten accuracy scores, computed with
        ``np.mean`` and ``np.var``.
    """
    scores = [accuracy_score(y_test, step(comp)) for _ in range(10)]
    return np.mean(scores), np.var(scores)
# Grid of principal-component counts to evaluate.
main_comps = [3, 5, 7, 12, 18, 25, 33, 40, 48, 55]
acc = []  # mean accuracy, one entry per grid value
var = []  # variance of accuracy, one entry per grid value

for comp in main_comps:
    # Unpack into names distinct from the result lists: unpacking straight
    # into `var` would rebind the list to a float and make the following
    # `.append` call fail.
    comp_mean, comp_var = get_accuracy(comp)
    acc.append(comp_mean)
    var.append(comp_var)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-7-7eef10781cea> in <module>() 4 5 for comp in main_comps: ----> 6 mean, var = get_accuracy(comp) 7 accuracy.append(mean) 8 variances.append(var) <ipython-input-6-d437e44603b8> in get_accuracy(comp) 13 accuracy = [] 14 for i in range(10): ---> 15 accuracy.append(accuracy_score(y_test, step(comp))) 16 return np.mean(accuracy), np.var(accuracy) NameError: name 'y_test' is not defined
from matplotlib import pylab as plt
# Global matplotlib styling for the figures below: serif font, thicker lines,
# larger markers and larger tick/legend/title/label text.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'FreeSerif',
    'lines.linewidth': 2,
    'lines.markersize': 12,
    'xtick.labelsize': 24,
    'ytick.labelsize': 24,
    'legend.fontsize': 24,
    'axes.titlesize': 36,
    'axes.labelsize': 24,
})
import matplotlib.pyplot as plt
# Mean accuracy as a function of the number of principal components.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(main_comps, acc)
ax.grid()
ax.set_xlabel("Number of PCA components")
ax.set_ylabel("Accuracy metric")
plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-9-50a9395c1596> in <module>() 1 import matplotlib.pyplot as plt 2 plt.figure(figsize=(10,7)) ----> 3 plt.plot(main_comps, accuracy) 4 plt.grid() 5 plt.xlabel("Number of PCA components") ~/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py in plot(*args, **kwargs) 3356 mplDeprecation) 3357 try: -> 3358 ret = ax.plot(*args, **kwargs) 3359 finally: 3360 ax._hold = washold ~/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs) 1853 "the Matplotlib list!)" % (label_namer, func.__name__), 1854 RuntimeWarning, stacklevel=2) -> 1855 return func(ax, *args, **kwargs) 1856 1857 inner.__doc__ = _add_data_doc(inner.__doc__, ~/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in plot(self, *args, **kwargs) 1525 kwargs = cbook.normalize_kwargs(kwargs, _alias_map) 1526 -> 1527 for line in self._get_lines(*args, **kwargs): 1528 self.add_line(line) 1529 lines.append(line) ~/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _grab_next_args(self, *args, **kwargs) 404 this += args[0], 405 args = args[1:] --> 406 for seg in self._plot_args(this, kwargs): 407 yield seg 408 ~/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _plot_args(self, tup, kwargs) 381 x, y = index_of(tup[-1]) 382 --> 383 x, y = self._xy_from_xy(x, y) 384 385 if self.command == 'plot': ~/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _xy_from_xy(self, x, y) 240 if x.shape[0] != y.shape[0]: 241 raise ValueError("x and y must have same first dimension, but " --> 242 "have shapes {} and {}".format(x.shape, y.shape)) 243 if x.ndim > 2 or y.ndim > 2: 244 raise ValueError("x and y can be no greater than 2-D, but have " ValueError: x and y must have same first dimension, but have shapes (10,) and (0,)
/home/sergei/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:1328: UserWarning: findfont: Font family ['serif'] not found. Falling back to DejaVu Sans (prop.get_family(), self.defaultFamily[fontext]))
import matplotlib.pyplot as plt
# Variance of the accuracy as a function of the number of principal components.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(main_comps, var)
ax.grid()
ax.set_xlabel("Number of PCA components")
ax.set_ylabel("Variance of accuracy metric")
plt.show()