Normal Distribution and 3 Sigma Rule¶

Anomaly/Outlier¶

If a test point lies more than $3\sigma$ away from the mean $\mu$, i.e. $|x - \mu| > 3\sigma$, it can be classified as an anomaly.

Is there an anomaly?¶

In [2]:

import numpy as np

In [3]:

# Toy sample: nine small readings (2-4) plus one extreme value (486).
data = np.array([2, 3, 4, 2, 3, 2, 2, 2, 3, 486])

In [4]:

# Mean and standard deviation of the full sample, extreme value included.
# ndarray.std() uses the population form (ddof=0) by default.
m = data.mean()
s = data.std()
m, s

Out[4]:

(50.9, 145.03478893010464)

In [5]:

def anomalyDetector(data, test_point):
    """Apply the plain 3-sigma rule to ``test_point``.

    The mean and standard deviation are taken from all of ``data``
    (via ``data.mean()`` and ``data.std()``), so extreme values in
    ``data`` contribute to both statistics.

    Returns a truthy value when ``test_point`` lies more than three
    standard deviations away from the mean.
    """
    center = data.mean()
    spread = data.std()
    deviation = np.abs(test_point - center)
    return deviation > 3 * spread

In [6]:

anomalyDetector(data, test_point = 486)

Out[6]:

False

In [7]:

# Approximate upper bound mean + 3 * std, with the std rounded down to 145.
# With the unrounded std (145.0348...) the bound is about 486.004, i.e. just
# above 486 -- which is why the detector above does not flag 486.
50.9 + 3 *  145

Out[7]:

485.9

A better way of doing anomaly detection¶

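Remove the largest 5% of the data points (values above the 95th percentile)
Remove the smallest 5% of the data points (values below the 5th percentile)
Then compute the mean and standard deviation from the remaining points and apply the $3\sigma$ rule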

In [9]:

import pandas as pd
df = pd.DataFrame(data)
df

Out[9]:

	0
0	2
1	3
2	4
3	2
4	3
5	2
6	2
7	2
8	3
9	486

In [10]:

df.describe()

Out[10]:

	0
count	10.000000
mean	50.900000
std	152.880091
min	2.000000
25%	2.000000
50%	2.500000
75%	3.000000
max	486.000000

In [19]:

# DataFrame.quantile returns a one-element Series here (one entry per column).
# Pull the scalar out with .iloc[0] before converting: calling float() directly
# on a single-element Series is deprecated in recent pandas.
qmin = float(df.quantile(.05).iloc[0])
qmax = float(df.quantile(.95).iloc[0])
qmin, qmax

Out[19]:

(2.0, 269.0999999999995)

In [21]:

data[(data >= qmin) & (data <= qmax)]

Out[21]:

array([2, 3, 4, 2, 3, 2, 2, 2, 3])

In [22]:

def anomalyDetector(data, test_point):
    """Flag ``test_point`` as an anomaly using a trimmed 3-sigma rule.

    The mean and standard deviation are estimated only from the values of
    ``data`` that lie between its 5% and 95% quantiles (inclusive), so that
    extreme values cannot inflate the statistics and mask themselves.

    Parameters
    ----------
    data : array-like
        Numeric sample (NumPy array, list, ...). It is flattened to 1-D.
    test_point : scalar
        Value to test against the trimmed statistics.

    Returns
    -------
    bool-like
        True when ``test_point`` is more than three trimmed standard
        deviations away from the trimmed mean.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    values = np.asarray(data).ravel()
    if values.size == 0:
        raise ValueError("data must contain at least one value")

    # Series.quantile with a scalar q already returns a scalar, which avoids
    # float() on a one-element Series (deprecated in recent pandas).
    series = pd.Series(values)
    qmin, qmax = series.quantile(.05), series.quantile(.95)

    # Drop the values above the 95% quantile and below the 5% quantile.
    trimmed = values[(values >= qmin) & (values <= qmax)]

    m, s = trimmed.mean(), trimmed.std()
    return np.abs(test_point - m) > 3 * s

In [23]:

data

Out[23]:

array([  2,   3,   4,   2,   3,   2,   2,   2,   3, 486])

In [24]:

anomalyDetector(data, test_point = 486)

Out[24]:

True

In [25]:

df

Out[25]:

	0
0	2
1	3
2	4
3	2
4	3
5	2
6	2
7	2
8	3
9	486

In [67]:

def anomalyDetector(data, test_point = None):
    """Detect anomalies with a trimmed 3-sigma rule.

    The mean and standard deviation are estimated only from the values of
    ``data`` that lie between its 5% and 95% quantiles (inclusive).

    Parameters
    ----------
    data : array-like
        Numeric sample (NumPy array, list, ...). It is flattened to 1-D.
    test_point : scalar, optional
        When given (including ``0``), only this value is tested.
        When omitted, every value of ``data`` is tested.

    Returns
    -------
    bool-like or numpy.ndarray of bool
        For a given ``test_point``: whether it is more than three trimmed
        standard deviations away from the trimmed mean.
        Otherwise: a flat boolean mask over ``data`` that is True at the
        anomalous positions, usable as ``data[mask]``.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    values = np.asarray(data).ravel()
    if values.size == 0:
        raise ValueError("data must contain at least one value")

    # Series.quantile with a scalar q already returns a scalar, which avoids
    # float() on a one-element Series (deprecated in recent pandas).
    series = pd.Series(values)
    qmin, qmax = series.quantile(.05), series.quantile(.95)

    # Drop the values above the 95% quantile and below the 5% quantile.
    trimmed = values[(values >= qmin) & (values <= qmax)]
    m, s = trimmed.mean(), trimmed.std()

    # Compare with None explicitly: a plain truthiness check would treat a
    # legitimate test value of 0 as "no test point given".
    if test_point is not None:
        return np.abs(test_point - m) > 3 * s

    # Vectorized check of every original value against the trimmed statistics.
    return np.abs(values - m) > 3 * s

In [69]:

idx = anomalyDetector(data)
idx

Out[69]:

array([False, False, False, False, False, False, False, False, False,
        True])

In [70]:

data[idx]

Out[70]:

array([486])

In [75]:

anomalyDetector(data, test_point = 5)

Out[75]:

True

In [ ]: