Extract patterns in the daily load profiles of a single household using the k-means clustering algorithm.
After finishing this notebook, you should be able to explain the k-means clustering algorithm, including how to use the scikit-learn implementation.
Description:
Measurements of electric power consumption in one household with a one-minute sampling rate over a period of almost 4 years. Different electrical quantities and some sub-metering values are available.
Data set information:
Notes:
(global_active_power*1000/60 - sub_metering_1 - sub_metering_2 - sub_metering_3) represents the active energy consumed every minute (in watt hour) in the household by electrical equipment not measured in sub-meterings 1, 2 and 3.
The dataset contains some missing values in the measurements (nearly 1.25% of the rows). All calendar timestamps are present in the dataset, but for some timestamps the measurement values are missing: a missing value is represented by the absence of a value between two consecutive semicolon attribute separators. For instance, the dataset shows missing values on April 28, 2007.
Attribute information:
Source: https://archive.ics.uci.edu/ml/datasets/individual+household+electric+power+consumption
# Environment checks: the notebook relies on features from these minimum
# library versions.
import sys
assert sys.version_info >= (3, 6)
import numpy
# NOTE(review): comparing version strings lexicographically is fragile
# (e.g. "1.9.0" >= "1.17.3" is True as strings); kept as-is to preserve
# behaviour — confirm whether a parsed-tuple comparison is wanted.
assert numpy.__version__ >="1.17.3"
import numpy as np
import matplotlib.pyplot as plt
import pandas
assert pandas.__version__ >= "0.25.1"
import pandas as pd
import sklearn
assert sklearn.__version__ >= "0.21.3"
from sklearn import datasets
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the UCI household power-consumption dataset. Per the dataset notes
# above, fields are ';'-separated and missing measurements appear as empty
# fields (pandas also encodes them as '?' in this file).
# NOTE(review): filename assumed from the UCI archive distribution —
# confirm the local path.
household_pc = pd.read_csv('household_power_consumption.txt',
                           sep=';', na_values=['?'], low_memory=False)
household_pc.shape
household_pc.head()
# Keep only the seven measurement columns (drop Date and Time, columns 0-1)
# and remove rows with missing values.
household_power_consumption = household_pc.iloc[0:, 2:9].dropna()
household_power_consumption.head()
from sklearn.model_selection import train_test_split

# Work on a small (1 %) random sample of the measurements so the later
# PCA / k-means steps stay fast; the seed makes the split reproducible.
X = household_power_consumption.values
X_train, X_test = train_test_split(X, train_size=0.01, random_state=42)
from sklearn import decomposition
# Compute the two principal components: project the 7-dimensional
# measurements onto a 2-D space so the clusters can be visualised.
pca = decomposition.PCA(n_components=2)
pca.fit(X_train)
# Transform the training sample into the 2-D principal-component space.
X_projected = pca.transform(X_train)
# Fraction of the total variance captured by each of the two components.
print(pca.explained_variance_ratio_)
scikit-learn provides a k-means implementation through the `sklearn.cluster.KMeans` class.
from sklearn import cluster
# Fit k-means on the 2-D PCA projection. The number of clusters is a
# modelling choice; 4 is a reasonable starting point — the elbow analysis
# further below explores k = 1..13 to validate it. The fixed random_state
# makes the centroid initialisation reproducible.
kmeans = cluster.KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_projected)
def plot_clusters_map(X, cluster_model):
    """Plot the cluster decision regions of ``cluster_model`` over the 2-D
    points ``X``, with the data as black dots and centroids as white crosses.

    Parameters
    ----------
    X : ndarray of shape (n_samples, 2)
        Points in the projected 2-D space.
    cluster_model : fitted clustering estimator
        Must expose ``predict`` and ``cluster_centers_`` (e.g. KMeans).
    """
    # Symmetric 1-unit margins around the data. (The original used -5/-1 on
    # x and 0/+5 on y; the "-1" on x_max clipped the plot below the data
    # maximum.)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # Dense grid over the plotting area; each grid point is assigned to a
    # cluster so the coloured regions show the model's decision boundaries.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))
    Z = cluster_model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z,
               interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')
    plt.plot(X[:, 0], X[:, 1], 'k.', markersize=4)
    # Mark the cluster centres with large white crosses drawn on top.
    centroids = cluster_model.cluster_centers_
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w',
                zorder=8)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(());
# Visualise the k-means partition of the PCA-projected data.
plot_clusters_map(X_projected, kmeans)
from scipy.spatial.distance import cdist, pdist
# Elbow analysis: fit k-means for k = 1..13 and measure how much of the
# total variance each clustering explains.
k_range = range(1, 14)
# Fit one k-means model per candidate number of clusters.
kmeans_var = [cluster.KMeans(n_clusters=k, random_state=42).fit(X_projected)
              for k in k_range]
# Get the centers of each fitted model. (Do not reuse the name ``X`` as
# the loop variable — the original comprehension shadowed and clobbered
# the data matrix ``X`` defined earlier.)
centroids = [model.cluster_centers_ for model in kmeans_var]
# Calculate the Euclidean distance from each point to each cluster center.
k_euclidean = [cdist(X_projected, cent, 'euclidean') for cent in centroids]
# Distance from each point to its closest center.
distances = [np.min(ke, axis=1) for ke in k_euclidean]
# Total within-cluster sum of squares, one value per k.
wcss = [sum(d**2) for d in distances]
# Compute the total sum of squares.
tss = np.sum(pdist(X_projected)**2) / X_projected.shape[0]
# Between-cluster sum of squares for each k: total minus within-cluster.
bss = tss - wcss
Plot the curve of the variance explained as a function of the number of clusters.
# Elbow plot: percentage of total variance explained for each k.
fig, ax = plt.subplots()
explained_pct = bss / tss * 100
ax.plot(k_range, explained_pct, 'b*-')
ax.set_ylim(0, 100)
ax.grid(True)
ax.set_xlabel('n_clusters')
ax.set_ylabel('Percentage of variance explained')
ax.set_title('Variance Explained vs. # of cluster (k)');