import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# --- Load data -------------------------------------------------------------
data = pd.read_csv("my_machine-learning/datasets/titanic.csv")
# Only Age and Fare are used below, so only rows missing one of those two
# values are dropped. A bare dropna() would also discard every row that has
# a gap in any other, unrelated column.
data = data.dropna(subset=["Age", "Fare"])

# --- Histogram of Age ------------------------------------------------------
fig = plt.figure(figsize=(8, 6))
plt.hist(data["Age"], bins=50)
plt.title("Histogram of the age variable")
plt.ylabel("Frequency")
plt.xlabel("Age")
plt.show()

# --- Raw Age vs. Fare scatter ----------------------------------------------
af = data[["Age", "Fare"]]
plt.scatter(x=af["Age"], y=af["Fare"], color="red", alpha=0.5)
plt.xlabel("Age")
plt.ylabel("Fare")
plt.show()

# --- Feature scaling -------------------------------------------------------
from sklearn.preprocessing import MinMaxScaler

# Rescale both features (MinMaxScaler's default target range is [0, 1]) so
# that the distance-based clustering below is not dominated by whichever
# column has the larger numeric range.
scaler = MinMaxScaler()
# fit_transform returns a plain NumPy array; `af` is rebound to that array
# and `afs` wraps it back into a DataFrame with the column names restored.
af = scaler.fit_transform(af)
afs = pd.DataFrame(af, columns=["Age", "Fare"])
plt.scatter(x=afs["Age"], y=afs["Fare"])
plt.title("Feature Scaling")
plt.xlabel("Age")
plt.ylabel("Fare")
plt.show()

# --- DBSCAN outlier detection ----------------------------------------------
from sklearn.cluster import DBSCAN

# NOTE(review): eps=0.5 is large relative to the [0, 1] range of the scaled
# features; confirm that it actually isolates outliers on this dataset.
outlier_detection = DBSCAN(eps=0.5)
# One integer label per row; DBSCAN assigns the label -1 to noise points.
clusters = outlier_detection.fit_predict(afs)

fig = plt.figure(figsize=(10, 7))
# Colour each point by its cluster label.
plt.scatter(afs.iloc[:, 0], afs.iloc[:, 1], c=clusters, s=50, cmap="viridis")
plt.title("Outlier Detection using DBSCAN")
plt.ylabel("Fare")
plt.xlabel("Age")
plt.show()