"""Compare per-session and per-client distributions of Telemetry measurements.

Notebook-style analysis script: it fetches Firefox Telemetry pings through
Spark, reduces each ping to its "info" and "simpleMeasurements" sections,
aggregates the sessions by client, and plots the session-level distribution of
a few metrics against the distribution of the per-client medians.

NOTE(review): ``sc`` is not defined in this file; presumably it is the
SparkContext injected by the notebook environment -- confirm before running
this as a standalone script.
"""
# ``from __future__`` imports must precede every other statement in a module.
from __future__ import division

from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import simplejson as json
from moztelemetry.spark import get_pings


# --- notebook cell (originally run under the IPython ``%%capture`` magic) ---
# NOTE(review): "*" and the date tuple are presumably the build-id wildcard and
# the submission-date range -- confirm against the get_pings signature.
pings = get_pings(sc, "Firefox", "nightly", "37.0a1", "*", ("20141208", "20141214"))


# --- notebook cell (originally run under the IPython ``%%capture`` magic) ---
def extract(ping):
    """Flatten one raw ping into a single dict of per-session measurements.

    Args:
        ping: JSON-encoded ping (a string) containing at least the "info" and
            "simpleMeasurements" objects.

    Returns:
        A dict holding every key of ``ping["info"]`` and
        ``ping["simpleMeasurements"]`` (the latter wins on a key collision),
        plus "clientID" (``None`` when the ping has none), with the
        "UITelemetry" and "addonManager" entries removed.

    Raises:
        KeyError: if the ping lacks "info" or "simpleMeasurements".
    """
    ping = json.loads(ping)
    info = ping["info"]
    simple = ping["simpleMeasurements"]

    # Copy + update instead of ``info.items() + simple.items()``: the result is
    # the same, but it also works on Python 3 where dict views cannot be
    # concatenated with ``+``.
    ping_subset = dict(info)
    ping_subset.update(simple)

    ping_subset["clientID"] = ping.get("clientID")
    # Drop these two entries before the records are turned into DataFrame rows.
    ping_subset.pop("UITelemetry", None)
    ping_subset.pop("addonManager", None)
    return ping_subset


pings_summary = pings.map(extract)


# --- notebook cell (originally run under the IPython ``%%capture`` magic) ---
# Bring the reduced pings to the driver; each ping becomes one row.
df = pd.DataFrame(pings_summary.collect())

# Keep only the sessions that report a non-negative uptime.
df = df[df["uptime"] >= 0]

# One row per client; for every metric compute the number of sessions (len)
# and the median across that client's sessions.
table = pd.pivot_table(
    df,
    index="clientID",
    values=[
        "uptime",
        "memsize",
        "cpucount",
        "firstPaint",
        "shutdownDuration",
        "failedProfileLockCount",
    ],
    aggfunc=[len, np.median],
)

# pivot_table yields (aggfunc, metric) columns; swap the levels so a metric
# can be selected first: table[metric]["median"].
# NOTE(review): DataFrame.sortlevel was deprecated and later removed from
# pandas; on an upgrade the replacement is sort_index(axis=1, level=0).
table = table.swaplevel(0, 1, axis=1).sortlevel(0, axis=1)

percentiles = [0.1, 0.25, 0.5, 0.75, 0.9]

# The two bare expressions below are shown as cell output in a notebook; they
# display nothing when the file is executed as a plain script.

# Summary statistics of the per-client medians (levels swapped back so that
# the "median" aggregate is selectable at the top level).
table.swaplevel(0, 1, axis=1).sortlevel(0, axis=1)["median"].describe(percentiles=percentiles)

# Summary statistics of the same metrics taken over individual sessions.
df[table.columns.levels[0]].describe(percentiles=percentiles)


def compare_distributions(metric, max_min):
    """Overlay the per-session and per-client-median histograms of a metric.

    Both histograms use 120 bins over the range ``(0, max_min)`` and are
    normalised so that they are comparable despite different sample counts.

    Args:
        metric: Column name present in both ``df`` and ``table``.
        max_min: Upper bound of the histogram range (the lower bound is 0).
    """
    stat = table[metric]["median"]

    plt.figure(figsize=(18, 5))
    plt.ylabel("density")
    # NOTE(review): ``normed`` was deprecated and later removed from
    # matplotlib; on an upgrade the replacement is ``density=True``.
    plt.hist(df[metric], 120, range=(0, max_min), normed=1, alpha=0.5,
             label="Session " + metric)
    plt.hist(stat, 120, range=(0, max_min), normed=1, alpha=0.5,
             label="Median client " + metric)
    plt.legend(loc='upper right')
    plt.show()


compare_distributions("uptime", 120)
compare_distributions("firstPaint", 10000)
compare_distributions("shutdownDuration", 10000)