In [38]:

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [39]:

# Load the Titanic table and drop only the rows that lack a value for one of
# the two features analysed below. A bare dropna() would also discard every row
# whose only gap is in some unrelated column, shrinking the sample needlessly.
data = pd.read_csv("my_machine-learning/datasets/titanic.csv")
data = data.dropna(subset=["Age", "Fare"])

In [40]:

# Histogram of the Age column in 50 bins, built on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(data["Age"], bins=50)
ax.set_title("Histogram of the age variable")
ax.set_ylabel("Frequency")
ax.set_xlabel("Age")
plt.show()

In [75]:

# Keep just the two columns that are compared from here on.
af = data.loc[:, ["Age", "Fare"]]

# Unscaled Age against Fare; semi-transparent markers keep overlapping points visible.
plt.scatter(af["Age"], af["Fare"], color="red", alpha=0.5)
plt.gca().set(xlabel="Age", ylabel="Fare")
plt.show()

We also see that the two variables are on very different scales.
A feature with a larger range (and therefore a larger variance) may dominate the other in any distance-based computation.
We don’t want that, so we must normalize the variables.

In [55]:

from sklearn.preprocessing import MinMaxScaler

# Rescale each feature with MinMaxScaler so that neither one dominates
# distance computations because of its larger range.
# The scaled values go into a new frame instead of overwriting `af`, so `af`
# remains the raw DataFrame for any cell that runs afterwards.
# Reusing af's index keeps every scaled row traceable to its row in `data`.
scaler = MinMaxScaler()
afs = pd.DataFrame(
    scaler.fit_transform(af),
    columns=["Age", "Fare"],
    index=af.index,
)

In [56]:

# Age against Fare again, this time using the scaled frame `afs`.
plt.scatter(afs["Age"], afs["Fare"])
plt.gca().set(title="Feature Scaling", xlabel="Age", ylabel="Fare")
plt.show()

DBSCAN assigns each point to a cluster and returns the cluster labels.
If a point cannot be assigned to any cluster (because it is an outlier), it is given the label -1.

In [70]:

from sklearn.cluster import DBSCAN

# Cluster the scaled features; DBSCAN gives the label -1 to every point it
# cannot place in a cluster, and those points are read as outliers.
# NOTE(review): `afs` is min-max scaled, so eps=0.5 is a large neighbourhood
# relative to the feature range -- confirm it is tight enough to leave any
# point labelled as noise.
outlier_detection = DBSCAN(eps=0.5)
clusters = outlier_detection.fit(afs).labels_

In [74]:

# Scaled Age against Fare, coloured by DBSCAN label so that points labelled -1
# (outliers) stand apart from the clustered points.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(afs["Age"], afs["Fare"], c=clusters, s=50, cmap="viridis")

ax.set_title("Outlier Detection using DBSCAN")
ax.set_ylabel("Fare")
ax.set_xlabel("Age")
plt.show()

In [ ]: