import ujson as json import matplotlib.pyplot as plt import pandas as pd import numpy as np import plotly.plotly as py from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client %pylab inline sc.defaultParallelism pings = get_pings(sc, app="Firefox", channel="nightly", build_id=("20150401000000", "20150401999999"), fraction=0.1) subset = get_pings_properties(pings, ["clientID", "info/OS", "simpleMeasurements/firstPaint"]) subset = subset.filter(lambda p: p.get("firstPaint", -1) >= 0) subset = get_one_ping_per_client(subset) cached = subset.cache() cached.count() grouped = cached.map(lambda p: (p["OS"], p["firstPaint"])).groupByKey().collectAsMap() frame = pd.DataFrame({x: np.log(pd.Series(list(y))) for x, y in grouped.items()}) plt.figure(figsize=(18, 7)) frame.boxplot(return_type="axes") plt.ylabel("log(firstPaint)") plt.show() fig = plt.figure(figsize=(18, 7)) frame["WINNT"].plot(kind="hist", bins=50) plt.title("startup distribution for Windows") plt.ylabel("count") plt.xlabel("log(firstPaint)") py.iplot_mpl(fig, strip_style=True)