Anomalies are more susceptible to isolation under random partitioning, and hence have short path lengths.
In this example, each partition is generated by randomly selecting an attribute and then randomly selecting a split value between the minimum and maximum values of
the selected attribute.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Default plot settings for the figures in this section.
plt.rcParams.update({'figure.dpi': 150, 'figure.figsize': [5, 5]})

# Three Gaussian clusters of 100 two-dimensional points each (centres 0, 4
# and -5 on both axes), drawn in this order so the random stream is
# consumed exactly as before.
X1, X2, X3 = (np.random.normal(loc=centre, size=(100, 2)) for centre in (0, 4, -5))
# One hand-placed point well away from all three clusters.
A = np.array([[9, -6]])
data = np.concatenate((X1, X2, X3, A))

# Scatter of the whole data set: first column on x, second on y.
plt.plot(*data.T, 'o')
plt.grid()
plt.xlim([-10, 10])
plt.show()

# Same points as a DataFrame (default integer column and row labels).
df = pd.DataFrame(data)
df.head()
0 | 1 | |
---|---|---|
0 | -0.012564 | 0.297794 |
1 | 0.179264 | 0.253776 |
2 | -0.306310 | 0.256859 |
3 | 1.000864 | -1.508026 |
4 | 0.346579 | -0.179718 |
# Try one random axis-aligned cut and report the row it isolates, if any.

# Pick a random column *position* and select it positionally, so the cell
# also works when the frame's columns are not labelled 0..n-1.
f = np.random.choice(df.shape[1])
feature = df.iloc[:, f]
# Range of the selected feature.
mini, maxi = feature.min(), feature.max()
# Draw the cut value between min and max.
cut = np.random.uniform(low=mini, high=maxi)
# A side holding exactly one row means that row has been isolated;
# -1 marks "this cut isolated nothing".
isolated_index = -1
smaller, greater = feature[feature < cut], feature[feature >= cut]
if len(smaller) == 1:
    isolated_index = smaller.index[0]
if len(greater) == 1:
    isolated_index = greater.index[0]
# `isolated` only exists when a row was found, so the report stays inside
# the same guard.
if isolated_index != -1:
    isolated = list(df.loc[isolated_index])
    print("cutoff point: ", cut, " at feature ", f, "\ngives ", isolated_index, "th data ", isolated)
def recursive_partition(df, depth=2, level=0, info="start"):
    """Randomly split ``df`` until some partition holds exactly one row.

    At each level a column is chosen at random and a cut value is drawn
    uniformly between that column's minimum and maximum. If either side of
    the cut contains a single row, that row is the isolated instance.
    Otherwise the "smaller" side is searched first, then the "greater" side.
    Progress is printed at every level.

    Args:
        df: DataFrame of numeric features. Columns are addressed by
            position, so any column labels are accepted.
        depth: Maximum number of levels to descend.
        level: Current level; callers normally leave this at 0.
        info: Label printed for the current partition ("start", "small"
            or "big").

    Returns:
        The index label of the isolated row, or -1 if no row was isolated
        within ``depth`` levels.
    """
    # Base case: depth budget used up, or an empty partition (which arises
    # when every value of the chosen feature is identical, so the cut equals
    # the minimum). An empty partition has no min/max to draw a cut from.
    if level == depth or len(df) == 0:
        return -1
    print("\n\n", info); print("At level: ", level)

    # Choose a random column position and read it positionally rather than
    # by label.
    f = np.random.choice(df.shape[1])
    feature = df.iloc[:, f]
    mini, maxi = feature.min(), feature.max()
    cut = np.random.uniform(low=mini, high=maxi)
    print("\n\tcutoff point: ", cut, " at feature ", f)

    below, at_or_above = feature < cut, feature >= cut
    smaller, greater = feature[below], feature[at_or_above]

    # A side with exactly one row isolates that row; the smaller side wins
    # when both qualify.
    for side in (smaller, greater):
        if len(side) == 1:
            return side.index[0]

    # Otherwise descend: smaller side first, then the greater side.
    for mask, label in ((below, "small"), (at_or_above, "big")):
        isolated_index = recursive_partition(df[mask], depth, level=level + 1, info=label)
        if isolated_index != -1:
            return isolated_index
    return -1
# Search the full data set with a budget of 10 levels; `iso` is the index
# label of the isolated row, or -1 when nothing was isolated.
iso = recursive_partition(df, depth = 10)
iso
start At level: 0 cutoff point: 6.0249995922613575 at feature 1
188
# Re-draw the data set and, when the search found one, overlay the isolated
# point in a second colour.
plt.plot(data[:, 0], data[:, 1], 'o')
if iso != -1:
    plt.plot(data[iso, 0], data[iso, 1], 'o')
plt.grid()
# Same limits as the first scatter plot: the hand-placed point sits at x = 9
# and would fall outside a [-8, 8] window.
plt.xlim([-10, 10])
plt.show()
# Scratch cell: a list holding a single (4, 5) tuple.
a = [(4, 5)]
a
[(4, 5)]
# Plot settings for the figures that follow: 120 dpi, 5x5 inch canvas.
plt.rcParams.update({'figure.dpi': 120, 'figure.figsize': [5, 5]})
class myIsolation:
    """One randomized partition search that tries to isolate a row of ``data``.

    On construction the data is wrapped in a DataFrame and recursively split
    by random axis-aligned cuts until some partition holds exactly one row,
    with a depth budget of ``int(log2(len(data)))`` levels.
    """

    def __init__(self, data):
        """Run the search on ``data`` (array-like of rows) immediately."""
        # Every cut attempted, in visiting order. This includes cuts made on
        # branches that were explored but did not isolate anything.
        self.path = []
        self.data = data
        df = pd.DataFrame(data)
        # Cuts are only recorded with a drawable extent for two-column data.
        self.bool2D = (df.shape[1] == 2)
        # log2(0) is undefined, so an empty data set gets a zero budget.
        n_rows = len(data)
        depth = int(np.log2(n_rows)) if n_rows > 0 else 0
        # Index label of the isolated row, or -1 if the budget ran out.
        self.iso = self.recursive_partition(df, depth=depth)

    def recursive_partition(self, df, depth=2, level=0, info="start"):
        """Split ``df`` at random until one side of a cut holds a single row.

        Args:
            df: DataFrame partition to split; columns are read by position.
            depth: Maximum number of levels to descend.
            level: Current level; callers normally leave this at 0.
            info: Label stored with the cut ("start", "small" or "big").

        Returns:
            The index label of the isolated row, or -1 if none was isolated.
            Each cut made is appended to ``self.path`` as a side effect.
        """
        # Base case: budget used up, or an empty partition (all values of the
        # chosen feature were equal, so nothing fell below the cut). An empty
        # partition has no min/max to draw the next cut from.
        if level == depth or len(df) == 0:
            return -1

        # Random column position, read positionally rather than by label.
        f = np.random.choice(df.shape[1])
        feature = df.iloc[:, f]
        mini, maxi = feature.min(), feature.max()
        cut = np.random.uniform(low=mini, high=maxi)

        # For 2-D data the stored extent is the range of the *other* feature
        # within this partition: it is the length of the segment that
        # drawCuts() later draws for the cut.
        if self.bool2D:
            other_feature = df.iloc[:, df.shape[1] - f - 1]
            mini, maxi = other_feature.min(), other_feature.max()
        self.path.append({'level': level,
                          'info': info,
                          'cut': cut,
                          'feature': f,
                          'mini': mini,
                          'maxi': maxi})

        below, at_or_above = feature < cut, feature >= cut
        smaller, greater = feature[below], feature[at_or_above]

        # A side with exactly one row isolates it; the smaller side wins when
        # both qualify.
        for side in (smaller, greater):
            if len(side) == 1:
                return side.index[0]

        # Otherwise descend: smaller side first, then the greater side.
        for mask, label in ((below, "small"), (at_or_above, "big")):
            isolated_index = self.recursive_partition(df[mask], depth, level=level + 1, info=label)
            if isolated_index != -1:
                return isolated_index
        return -1

    def score(self):
        """Return the number of cuts recorded, or None if nothing was isolated."""
        if self.iso != -1:
            return len(self.path)
        return None

    def drawCuts(self, fig, ax):
        """Draw every recorded cut on ``ax`` as a dashed red segment.

        A cut on feature 0 is a vertical segment at x = cut; a cut on
        feature 1 is a horizontal segment at y = cut. Each segment spans the
        stored 'mini'..'maxi' extent. The feature and cut value of every
        segment are printed.
        """
        for step in self.path:
            f, v = step['feature'], step['cut']
            span = np.linspace(step['mini'], step['maxi'], 100)
            level_line = np.full_like(span, v)
            print(f, v)
            if f == 0:
                ax.plot(level_line, span, color='r', linestyle='--')
            else:
                ax.plot(span, level_line, color='r', linestyle='--')

    def display(self):
        """Plot the data, the isolated point (if any) and, for 2-D data, the cuts."""
        fig, ax = plt.subplots()
        ax.plot(self.data[:, 0], self.data[:, 1], 'o')
        if self.iso != -1:
            ax.plot(self.data[self.iso, 0], self.data[self.iso, 1], 'o')
        if self.bool2D:
            self.drawCuts(fig, ax)
        print(self.iso)
        ax.set_title("score" + str(self.score()))
        ax.grid()
        plt.show()
# Run one randomized isolation search on the data set, plot the result,
# then print the isolated index and the number of cuts recorded.
a = myIsolation(data)
a.display()
print(a.iso, len(a.path))
1 5.933870615524162 1 5.540043535038084 0 -3.876448840286511 1 -5.6505819810294895 0 -4.082377153859769 0 -4.666303320634922 1 -6.754035516268094 219
219 7
# Repeat the randomized search many times and collect, per isolated index,
# the score of every run that isolated it.
counter = {}
for _ in range(10000):
    trial = myIsolation(data)
    isolated, score = trial.iso, trial.score()
    # A run that isolates nothing reports iso == -1 and a score of None;
    # storing it would make np.mean() below fail on the None entries.
    if isolated == -1:
        continue
    counter.setdefault(isolated, []).append(score)

# Mean score per isolated index, shown as a bar chart.
idx = list(counter.keys())
val = [np.mean(counter[k]) for k in counter.keys()]
plt.bar(idx, val)
<Container object of 112 artists>
# Bar chart of the mean score per isolated index.
idx = list(counter)
val = [np.mean(scores) for scores in counter.values()]
plt.bar(idx, val)

# Dashed reference lines across the index range at the mean and at one and
# two standard deviations below it; the line gets thinner the lower it sits.
mu, sigma = np.mean(val), np.std(val)
x = np.linspace(0, data.shape[0], 100)
for n_sigma, width in ((0, 3), (1, 2), (2, 1)):
    y = np.full_like(x, mu - n_sigma * sigma)
    plt.plot(x, y, color='r', linestyle='--', lw=width)
[<matplotlib.lines.Line2D at 0x1109345c0>]
import heapq
# Mean score for every index that was isolated at least once.
counter_len = {index: np.mean(scores) for index, scores in counter.items()}
# The 10 indices with the lowest mean score, i.e. isolated with the fewest cuts.
anomaly = heapq.nsmallest(10, counter, key=counter_len.get)
anomaly
[114, 300, 186, 0, 76, 157, 33, 11, 55, 68]
# Pair each selected index with its mean score.
[(index, counter_len[index]) for index in anomaly]
[(114, 1.1781818181818182), (300, 1.923874053407732), (186, 2.0), (0, 2.0), (76, 2.0), (157, 2.0), (33, 2.0), (11, 2.0), (55, 2.0), (68, 2.0)]
import numpy as np
import matplotlib.pyplot as plt
# default plot settings
plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = [9, 6]
# Bimodal 1-D sample: 500 points around -2 and 500 around +2 (std 0.5 each).
x = np.concatenate((np.random.normal(loc=-2, scale=.5, size=500),
                    np.random.normal(loc=2, scale=.5, size=500)))
# `density=True` normalises the histogram to unit area; it replaces the
# `normed` keyword, which current Matplotlib releases no longer accept.
plt.hist(x, density=True)
plt.xlim([-5, 5])
plt.show()
from sklearn.ensemble import IsolationForest
# Fit a 100-tree isolation forest on the 1-D sample, passed as a single-column matrix.
isolation_forest = IsolationForest(n_estimators=100)
isolation_forest.fit(x[:, np.newaxis])

# Score an evenly spaced grid that extends slightly beyond the plotted range.
xx = np.linspace(-6, 6, 100)[:, np.newaxis]
anomaly_score = isolation_forest.decision_function(xx)
outlier = isolation_forest.predict(xx)

plt.plot(xx, anomaly_score, label='anomaly score')
# Shade, over the full score range, the grid points predicted as outliers (-1).
plt.fill_between(
    xx.ravel(),
    anomaly_score.min(),
    anomaly_score.max(),
    where=(outlier == -1),
    color='r',
    alpha=.4,
    label='outlier region',
)
plt.legend()
plt.ylabel('anomaly score')
plt.xlabel('x')
plt.xlim([-5, 5])
plt.show()
`decision_function` returns the anomaly score of the input samples: the lower the score, the more abnormal the sample. Negative scores represent outliers; positive scores represent inliers.
import pandas as pd
import numpy as np
# Load the search-interest export and print its summary.
# NOTE(review): the echoed info() shows a single object column named
# 'Kategori: Tüm kategoriler' with the dates in the index, so the file's
# first line looks like a title row rather than a header — confirm, and
# consider `skiprows` so the real header is parsed.
darbe = pd.read_csv('darbe.csv')
darbe.info()
<class 'pandas.core.frame.DataFrame'> Index: 262 entries, Hafta to 2018-10-28 Data columns (total 1 columns): Kategori: Tüm kategoriler 262 non-null object dtypes: object(1) memory usage: 4.1+ KB
darbe.iloc[:1]
Kategori: Tüm kategoriler | |
---|---|
Hafta | darbe: (Türkiye) |
darbe.iloc[2:3]
Kategori: Tüm kategoriler | |
---|---|
2013-11-10 | <1 |
darbe.shape
(262, 1)
# Raw column values as a list, skipping the first row, which holds the
# sub-header text rather than data. Reading the column directly keeps every
# row; a to_dict() round trip is keyed by index label and would silently
# merge rows that share a label.
vals = darbe['Kategori: Tüm kategoriler'].iloc[1:].tolist()
vals[:20]
['<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1', '<1']
# '<1' is not a number, only a bound. Substitute a random value in [0, 1) so
# the stand-in stays non-negative and below 1; a standard-normal draw could
# be negative or exceed 1. All other entries are parsed as integers.
vals = [np.random.uniform(0.0, 1.0) if v == '<1' else int(v) for v in vals]
import matplotlib.pyplot as plt
# Wide, short figure so the whole series is visible as one line.
plt.figure(figsize=(15, 3))
plt.plot(vals, 'r--o')
[<matplotlib.lines.Line2D at 0x11615f2b0>]
# Unit-area histogram of the series; `density` replaces the `normed`
# keyword, which current Matplotlib releases no longer accept.
plt.hist(vals, density=True, bins=50)
(array([0.41954023, 0.05172414, 0.01532567, 0.00191571, 0.00191571, 0. , 0.00191571, 0. , 0. , 0.00191571, 0. , 0. , 0. , 0. , 0.00191571, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.00191571, 0. , 0. , 0. , 0. , 0. , 0. , 0.00191571]), array([ 0., 2., 4., 6., 8., 10., 12., 14., 16., 18., 20., 22., 24., 26., 28., 30., 32., 34., 36., 38., 40., 42., 44., 46., 48., 50., 52., 54., 56., 58., 60., 62., 64., 66., 68., 70., 72., 74., 76., 78., 80., 82., 84., 86., 88., 90., 92., 94., 96., 98., 100.]), <a list of 50 Patch objects>)
from sklearn.ensemble import IsolationForest
# Fit a 100-tree isolation forest on the series, passed as a single-column matrix.
isolation_forest = IsolationForest(n_estimators=100)
isolation_forest.fit(np.asarray(vals)[:, np.newaxis])

# Score an evenly spaced grid covering 0..100.
xx = np.linspace(0, 100, 200)[:, np.newaxis]
anomaly_score = isolation_forest.decision_function(xx)
outlier = isolation_forest.predict(xx)

plt.plot(xx, anomaly_score, label='anomaly score')
# Shade, over the full score range, the grid points predicted as outliers (-1).
plt.fill_between(
    xx.ravel(),
    anomaly_score.min(),
    anomaly_score.max(),
    where=(outlier == -1),
    color='r',
    alpha=.4,
    label='outlier region',
)
plt.legend()
plt.ylabel('anomaly score')
plt.xlabel('x')
plt.xlim([-5, 105])
plt.show()
# Same score curve and outlier shading as above, zoomed to x in [-5, 5].
plt.plot(xx, anomaly_score, label='anomaly score')
plt.fill_between(
    xx.ravel(),
    anomaly_score.min(),
    anomaly_score.max(),
    where=(outlier == -1),
    color='r',
    alpha=.4,
    label='outlier region',
)
plt.legend()
plt.ylabel('anomaly score')
plt.xlabel('x')
plt.xlim([-5, 5])
plt.show()
outlier
array([ 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
# Show the documentation for np.percentile. The trailing-`?` form only works
# inside IPython and is a syntax error in plain Python.
help(np.percentile)
plt.boxplot(vals)
{'boxes': [<matplotlib.lines.Line2D at 0x1a1c0d40f0>], 'caps': [<matplotlib.lines.Line2D at 0x1a1c1f5a58>, <matplotlib.lines.Line2D at 0x1a1c280198>], 'fliers': [<matplotlib.lines.Line2D at 0x1a1c31db00>], 'means': [], 'medians': [<matplotlib.lines.Line2D at 0x1a1c31d390>], 'whiskers': [<matplotlib.lines.Line2D at 0x1a1c1ca6d8>, <matplotlib.lines.Line2D at 0x1a1c1ca780>]}
# First and third quartiles of the series, computed in a single call.
Q1, Q3 = np.percentile(vals, [25, 75])
Q1, Q3
(0.0, 1.0)
IQR = Q3 - Q1
IQR
1.0
smallest, greatest = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
smallest, greatest
(-1.5, 2.5)
# Values on or inside the fences are normal. `anormal` keeps only values
# strictly beyond a fence, so including the fences here puts every value in
# exactly one of the two lists.
normal = [v for v in vals if smallest <= v <= greatest]
len(normal)
241
len(vals)
261
# Outliers: values strictly below the lower fence or strictly above the upper fence.
anormal = [v for v in vals if (v < smallest) or (v > greatest)]
len(anormal)
20