#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import pylab as py
import pandas as pd
import matplotlib.pyplot as plt
import IPython

from thclient import PerfherderClient
from sklearn import mixture
from scipy.stats import norm

IPython.core.pylabtools.figsize(32, 14)


# In[2]:

def model(values, n_components=2, threshold_ratio=1.3):
    """Fit Gaussian mixtures to *values* and return the simplest adequate one.

    One mixture is fitted for every component count from 1 up to
    ``n_components``.  Walking from the simplest model upwards, the first
    model whose BIC divided by the BIC of the next, more complex model is
    below ``threshold_ratio`` is returned.  If no model satisfies that test,
    the most complex fitted model is returned.

    ``values`` is passed straight to ``fit`` and ``bic``; the caller in this
    file supplies a column array of shape (n_samples, 1).
    """
    # NOTE(review): mixture.GMM is the legacy scikit-learn mixture API; newer
    # releases ship mixture.GaussianMixture instead -- confirm the pinned
    # scikit-learn version before upgrading.
    gmms = [mixture.GMM(n_components=k).fit(values)
            for k in range(1, n_components + 1)]
    bics = [gmm.bic(values) for gmm in gmms]

    # Favor simpler models to more complicated ones: compare each model's BIC
    # with that of its immediate successor.
    for candidate, bic, next_bic in zip(gmms, bics, bics[1:]):
        if bic / next_bic < threshold_ratio:
            return candidate
    return gmms[-1]


def ztest(mean_1, var_1, n_1, mean_2, var_2, n_2):
    """Return the upper-tail p-value of a two-sample z statistic.

    The statistic is ``|mean_1 - mean_2| / sqrt(var_1/n_1 + var_2/n_2)`` and
    the result is the standard normal probability of exceeding it.  The
    value is the one-sided tail (it is not doubled).
    """
    diff = abs(mean_1 - mean_2)
    stderr = np.sqrt(var_1 / n_1 + var_2 / n_2)
    # The survival function keeps precision for large z, where
    # ``1 - norm.cdf(z)`` would suffer from cancellation.
    return norm.sf(diff / stderr)


def sort_by(x, y):
    """Return the items of *x* ordered by ascending matching key in *y*."""
    # Sort on the key alone so that tied keys never fall through to comparing
    # the payload items themselves.
    ordered = sorted(zip(y, x), key=lambda pair: pair[0])
    return [item for _, item in ordered]


def sorted_params(model):
    """Return ``(means, variances, weights)`` of *model*, ordered by mean.

    Reads ``means_``, ``covars_`` and ``weights_`` from the fitted mixture so
    that the i-th entry of each returned list belongs to the same component.
    """
    mean = sorted(model.means_)
    variance = sort_by(model.covars_, model.means_)
    weight = sort_by(model.weights_, model.means_)
    return (mean, variance, weight)


def compare_windows(a, b, pvalue=1e-8):
    """Compare two sample windows component by component.

    A mixture model is fitted to each window.  When both models have the
    same number of components, the components are paired in order of
    ascending mean and tested with ``ztest``, using the component weight
    times the window length as the sample count.

    Returns ``"ALERT"`` as soon as one pair yields a p-value below
    ``pvalue``; returns ``None`` otherwise, including when the two models
    have a different number of components.
    """
    # Convert explicitly: Series.reshape was deprecated in pandas 0.19 and is
    # absent from later releases, whereas ndarray.reshape works for any
    # array-like input.
    samples_a = np.asarray(a).reshape(-1, 1)
    samples_b = np.asarray(b).reshape(-1, 1)

    model_a = model(samples_a)
    model_b = model(samples_b)

    if model_a.n_components != model_b.n_components:
        return None  # TODO: Fallback to a simpler model for both windows

    mean_a, var_a, weight_a = sorted_params(model_a)
    mean_b, var_b, weight_b = sorted_params(model_b)
    n_a = len(samples_a)
    n_b = len(samples_b)

    for m_a, v_a, w_a, m_b, v_b, w_b in zip(mean_a, var_a, weight_a,
                                            mean_b, var_b, weight_b):
        p = ztest(m_a, v_a, w_a * n_a, m_b, v_b, w_b * n_b)
        if p < pvalue:
            return "ALERT"
    return None


def find_regressions(repository, signature, interval=8*604800, window_size=60):
    """Fetch a Perfherder series and plot it with detected shifts in red.

    Two adjacent windows of ``window_size`` samples each are slid over the
    series; whenever ``compare_windows`` reports ``"ALERT"``, the first
    sample of the second window is coloured red.  All other samples are
    white.  The result is drawn with ``plt.scatter``; nothing is returned.

    ``interval`` is forwarded unchanged to ``get_performance_data``
    (604800 is the number of seconds in a week -- presumably the API expects
    seconds; confirm against the client documentation).
    """
    client = PerfherderClient()
    s = client.get_performance_data(repository, signatures=signature,
                                    interval=interval)
    values = pd.Series(s[signature]["value"])  # TODO: Standardize values?
    colors = ["white"] * len(values)

    # The range is empty when the series holds fewer than two full windows.
    for i in range(len(values) - 2 * window_size + 1):
        split = i + window_size
        a = values.iloc[i:split]
        b = values.iloc[split:split + window_size]
        if compare_windows(a, b) == "ALERT":
            colors[split] = "red"

    plt.scatter(range(len(values)), values, c=colors)


# In[3]:

find_regressions("mozilla-inbound", "5c6e37c52e0bd0abf37c52adf565068fad37bc03")


# In[4]:

find_regressions("mozilla-inbound", "ff4ce004871790022f413b1abb21632e7c788f65", interval=12*604800)


# In[5]:

find_regressions("mozilla-inbound", "93bb72a460cf2b6ce7f9d7912f0a2ab49c5cf9d8")


# In[6]:

find_regressions("mozilla-inbound", "dbaade2dc703221d3287f7b3611302b4d34e8e7f", interval=12*604800)


# In[7]:

find_regressions("mozilla-inbound", "f6fb7dcb89c26e9c7e18722fa9846f60583d2ef2", interval=12*604800)