Main Idea¶

Anomalies are more susceptible to isolation (hence have short path lengths) under random partitioning,

Method¶

In this example, partitions are generated by randomly selecting an attribute and then randomly selecting a split value between the maximum and minimum values of

the selected attribute.

In [3]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# default plot settings
plt.rcParams['figure.dpi'] = 150
plt.rcParams['figure.figsize'] = [5, 5]

In [4]:

X1 = np.random.normal(loc= 0, size= (100,2))
X2 = np.random.normal(loc= 4, size= (100,2))
X3 = np.random.normal(loc= -5, size= (100,2))

A = np.array([[9,-6]])

data = np.concatenate((X1,X2,X3, A))

In [5]:

plt.plot(data[:,0], data[:,1],'o')
plt.grid()
plt.xlim([-10, 10])
plt.show()

In [6]:

df = pd.DataFrame(data)
df.head()

Out[6]:

	0	1
0	-0.012564	0.297794
1	0.179264	0.253776
2	-0.306310	0.256859
3	1.000864	-1.508026
4	0.346579	-0.179718

In [7]:

# select random feature
f = np.random.choice(df.shape[1])
feature = df.loc[:,f]
# get min and max value of selected feature
mini, maxi = feature.min() , feature.max()
# generate a cut-value between min and max
cut = np.random.uniform(low=mini, high=maxi)
# find isolated instance
isolated_index = -1
smaller, greater = feature[feature < cut], feature[feature >= cut]
if len(smaller) == 1: isolated_index = smaller.index[0]
if len(greater) == 1: isolated_index = greater.index[0]
if isolated_index!= -1:
    isolated = list( df.loc[isolated_index])
    print("cutoff point: ",cut," at feature ", f, "\ngives  ", isolated_index, "th data ", isolated)

In [8]:

def recursive_partition(df, depth = 2, level = 0, info = "start"):
    
    # Base Case 
    if level == depth: return -1
    
    print("\n\n", info); print("At level: ", level)
    
    # select random feature
    f = np.random.choice(df.shape[1])
    feature = df.loc[:,f]
    # get min and max value of selected feature
    mini, maxi = feature.min() , feature.max()
    # generate a cut-value between min and max
    cut = np.random.uniform(low=mini, high=maxi)
    
    print("\n\tcutoff point: ",cut," at feature ", f)
    #print(df)
    # find isolated instance
    smaller, greater = feature[feature < cut], feature[feature >= cut]
    if len(smaller) == 1: return smaller.index[0]
    if len(greater) == 1: return greater.index[0]
    
    # go to small left child
    isolated_index = recursive_partition(df[df[f] < cut ], depth, level = level +1, info = "small")
    if isolated_index != -1: return isolated_index
    
    # go to big right child
    isolated_index  = recursive_partition(df[df[f] >= cut ], depth, level = level +1, info = "big")
    if isolated_index != -1: return isolated_index
    
    return -1
        

In [9]:

iso = recursive_partition(df, depth = 10)
iso

 start
At level:  0

	cutoff point:  6.0249995922613575  at feature  1

Out[9]:

In [10]:

plt.plot(data[:,0], data[:,1],'o')
if iso != -1: plt.plot(data[iso,0], data[iso,1],'o')
plt.grid()
plt.xlim([-8, 8])
plt.show()

In [11]:

a =[]

In [10]:

a.append((4,5))

In [11]:

Out[11]:

[(4, 5)]

In [12]:

# default plot settings
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.figsize'] = [5, 5]

class myIsolation:
    def __init__(self, data):
        self.path = []
        self.data = data
        df = pd.DataFrame(data)
        self.bool2D = (df.shape[1] == 2)
        
        self.iso = self.recursive_partition(df, depth = int(np.log2(len(data))))
        
        
    def recursive_partition(self, df, depth = 2, level = 0, info = "start"):
    
        # Base Case 
        if level == depth: return -1

        #### print("\n\n", info); print("At level: ", level)

        # select random feature
        f = np.random.choice(df.shape[1])
        feature = df.loc[:,f]
        # get min and max value of selected feature
        mini, maxi = feature.min() , feature.max()
        # generate a cut-value between min and max
        cut = np.random.uniform(low=mini, high=maxi)
        
        ######################################################## 2d draw 
        if self.bool2D:
            other_feature = df.loc[:,df.shape[1]-f-1]
            mini, maxi = other_feature.min() , other_feature.max()

        #### print("\n\tcutoff point: ",cut," at feature ", f)
        self.path.append({'level':level, 
                          'info': info, 
                          'cut' : cut, 
                          'feature' : f, 
                          'mini' :mini, 
                          'maxi' :maxi})
        ######################################################## 2d draw 


        # find isolated instance
        smaller, greater = feature[feature < cut], feature[feature >= cut]
        if len(smaller) == 1: return smaller.index[0]
        if len(greater) == 1: return greater.index[0]

        # go to small left child
        isolated_index = self.recursive_partition(df[df[f] < cut ], depth, level = level +1, info = "small")
        if isolated_index != -1: return isolated_index

        # go to big right child
        isolated_index  = self.recursive_partition(df[df[f] >= cut ], depth, level = level +1, info = "big")
        if isolated_index != -1: return isolated_index

        return -1
    
    def score(self):
        if self.iso != -1: return len(self.path)
    
    def drawCuts(self, fig, ax):
        for i in range(len(self.path)):
            f, v, mi, ma = self.path[i]['feature'], self.path[i]['cut'], self.path[i]['mini'],self.path[i]['maxi']
            x = np.linspace(mi, ma, 100); y = x * 0 + v; print(f, v)
            if f == 0: plt.plot(y,x, color = 'r', linestyle ='--' )
            else: plt.plot(x,y, color = 'r', linestyle ='--' )
        
    def display(self):
        fig, ax = plt.subplots()
        plt.plot(self.data[:,0], self.data[:,1],'o')
        
        if self.iso != -1: plt.plot(self.data[self.iso,0], self.data[self.iso,1],'o')
            
        if self.bool2D: self.drawCuts(fig, ax)
            
        print(self.iso) 
        plt.title("score" + str(self.score()))
        plt.grid(); plt.show()

In [13]:

a = myIsolation(data)
a.display()
print(a.iso, len(a.path))

1 5.933870615524162
1 5.540043535038084
0 -3.876448840286511
1 -5.6505819810294895
0 -4.082377153859769
0 -4.666303320634922
1 -6.754035516268094
219

219 7

In [14]:

counter = {}
for i in range(10000):
    a = myIsolation(data)
    isolated,score = a.iso, a.score()
    del(a)
    if counter.get(isolated) == None: counter[isolated] = list()
    counter[isolated].append(score)

In [21]:

idx = list(counter.keys())
val = [np.mean(counter[k]) for k in counter.keys()]
plt.bar(idx, val)

Out[21]:

<Container object of 112 artists>

In [23]:

idx = list(counter.keys())
val = [np.mean(counter[k]) for k in counter.keys()]
plt.bar(idx, val)

mu, sigma = np.mean(val), np.std(val)

x = np.linspace(0, data.shape[0], 100); 
y = x * 0 + mu; 
plt.plot(x,y, color = 'r', linestyle ='--' ,lw = 3)

y = x * 0 + mu - sigma; 
plt.plot(x,y, color = 'r', linestyle ='--' ,lw = 2)

y = x * 0 + mu - 2 * sigma; 
plt.plot(x,y, color = 'r', linestyle ='--' ,lw = 1)

Out[23]:

[<matplotlib.lines.Line2D at 0x1109345c0>]

In [24]:

import heapq

counter_len = {k:np.mean(counter[k]) for k in counter.keys()}
# Gettings best 5 lines             
anomaly = heapq.nsmallest(10, counter, key=counter_len.get)

In [25]:

anomaly

Out[25]:

[114, 300, 186, 0, 76, 157, 33, 11, 55, 68]

In [26]:

[(a, counter_len[a]) for a in anomaly]

Out[26]:

[(114, 1.1781818181818182),
 (300, 1.923874053407732),
 (186, 2.0),
 (0, 2.0),
 (76, 2.0),
 (157, 2.0),
 (33, 2.0),
 (11, 2.0),
 (55, 2.0),
 (68, 2.0)]

In [ ]:

In [27]:

import numpy as np
import matplotlib.pyplot as plt
# default plot settings
plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = [9, 6]

x = np.concatenate((np.random.normal(loc=-2, scale=.5,size=500), 
                    np.random.normal(loc=2, scale=.5, size=500)))
plt.hist(x, normed=True)
plt.xlim([-5, 5])
plt.show()

In [28]:

from sklearn.ensemble import IsolationForest
isolation_forest = IsolationForest(n_estimators=100)
isolation_forest.fit(x.reshape(-1, 1))
xx = np.linspace(-6, 6, 100).reshape(-1,1)
anomaly_score = isolation_forest.decision_function(xx)
outlier = isolation_forest.predict(xx)

plt.plot(xx, anomaly_score, label='anomaly score')
plt.fill_between(xx.flatten(), np.min(anomaly_score), np.max(anomaly_score), 
                 where=outlier==-1, color='r', 
                 alpha=.4, label='outlier region')
plt.legend()
plt.ylabel('anomaly score')
plt.xlabel('x')
plt.xlim([-5, 5])
plt.show()

decision_function(X)¶

The anomaly score of the input samples. The lower, the more abnormal. Negative scores represent outliers, positive scores represent inliers.

Look data in google trends¶

darbe

In [46]:

import pandas as pd
import numpy as np
darbe = pd.read_csv('darbe.csv')
darbe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 262 entries, Hafta to 2018-10-28
Data columns (total 1 columns):
Kategori: Tüm kategoriler    262 non-null object
dtypes: object(1)
memory usage: 4.1+ KB

In [47]:

darbe.iloc[:1]

Out[47]:

	Kategori: Tüm kategoriler
Hafta	darbe: (Türkiye)

In [48]:

darbe.iloc[2:3]

Out[48]:

	Kategori: Tüm kategoriler
2013-11-10	<1

In [49]:

darbe.shape

Out[49]:

(262, 1)

In [50]:

vals = list(darbe.to_dict()['Kategori: Tüm kategoriler'].values())[1:]

In [51]:

vals[:20]

Out[51]:

['<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1',
 '<1']

In [69]:

vals = [np.random.randn() if v == '<1' else int(v) for v in vals]

In [70]:

import matplotlib.pyplot as plt
plt.figure(figsize=(15,3))
plt.plot(vals, 'r--o')

Out[70]:

[<matplotlib.lines.Line2D at 0x11615f2b0>]

In [78]:

plt.hist(vals, normed=True, bins=50)

Out[78]:

(array([0.41954023, 0.05172414, 0.01532567, 0.00191571, 0.00191571,
        0.        , 0.00191571, 0.        , 0.        , 0.00191571,
        0.        , 0.        , 0.        , 0.        , 0.00191571,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00191571, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00191571]),
 array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.,  20.,
         22.,  24.,  26.,  28.,  30.,  32.,  34.,  36.,  38.,  40.,  42.,
         44.,  46.,  48.,  50.,  52.,  54.,  56.,  58.,  60.,  62.,  64.,
         66.,  68.,  70.,  72.,  74.,  76.,  78.,  80.,  82.,  84.,  86.,
         88.,  90.,  92.,  94.,  96.,  98., 100.]),
 <a list of 50 Patch objects>)

In [75]:

from sklearn.ensemble import IsolationForest
isolation_forest = IsolationForest(n_estimators=100)
isolation_forest.fit(np.array(vals).reshape(-1, 1))
xx = np.linspace(0, 100, 200).reshape(-1,1)
anomaly_score = isolation_forest.decision_function(xx)
outlier = isolation_forest.predict(xx)

plt.plot(xx, anomaly_score, label='anomaly score')
plt.fill_between(xx.flatten(), np.min(anomaly_score), np.max(anomaly_score), 
                 where=outlier==-1, color='r', 
                 alpha=.4, label='outlier region')
plt.legend()
plt.ylabel('anomaly score')
plt.xlabel('x')
plt.xlim([-5, 105])
plt.show()

In [76]:

plt.plot(xx, anomaly_score, label='anomaly score')
plt.fill_between(xx.flatten(), np.min(anomaly_score), np.max(anomaly_score), 
                 where=outlier==-1, color='r', 
                 alpha=.4, label='outlier region')
plt.legend()
plt.ylabel('anomaly score')
plt.xlabel('x')
plt.xlim([-5, 5])
plt.show()

In [77]:

outlier

Out[77]:

array([ 1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [81]:

np.percentile?

In [79]:

plt.boxplot(vals)

Out[79]:

{'boxes': [<matplotlib.lines.Line2D at 0x1a1c0d40f0>],
 'caps': [<matplotlib.lines.Line2D at 0x1a1c1f5a58>,
  <matplotlib.lines.Line2D at 0x1a1c280198>],
 'fliers': [<matplotlib.lines.Line2D at 0x1a1c31db00>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x1a1c31d390>],
 'whiskers': [<matplotlib.lines.Line2D at 0x1a1c1ca6d8>,
  <matplotlib.lines.Line2D at 0x1a1c1ca780>]}

In [82]:

Q1, Q3 = np.percentile(vals, 25), np.percentile(vals, 75)
Q1, Q3

Out[82]:

(0.0, 1.0)

In [83]:

IQR = Q3 - Q1
IQR

Out[83]:

1.0

In [84]:

smallest, greatest = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
smallest, greatest

Out[84]:

(-1.5, 2.5)

In [90]:

normal = [v for v in vals if (v > smallest) and (v < greatest)]
len(normal)

Out[90]:

In [91]:

len(vals)

Out[91]:

In [93]:

anormal = [v for v in vals if (v < smallest) or (v > greatest)]
len(anormal)

Out[93]:

For more¶

anomaly-detection

In [ ]: