Notebook

In [1]:

from moztelemetry import get_pings
import datetime
import time
import re

# Fraction of pings to sample; passed to get_pings and used to scale totals back up.
fraction = 0.1

# Analysis window in epoch seconds: from `ndays` days ago until one day ago.
ndays = 50 # number of days in the past
_SECONDS_PER_DAY = 24 * 60 * 60
earliest = time.time() - ndays * _SECONDS_PER_DAY
latest = time.time() - 1 * _SECONDS_PER_DAY # one day back

# for efficiency, we only ask for pings submitted within these dates
# (the window above, widened by two days on each side).
_PING_DATE_FORMAT = "%Y%m%d"
_submission_padding = datetime.timedelta(days=2)
earliest_ping = (datetime.date.fromtimestamp(earliest) - _submission_padding).strftime(_PING_DATE_FORMAT)
latest_ping = (datetime.date.today() + _submission_padding).strftime(_PING_DATE_FORMAT)

# Keys of a failure record that may carry its detail text, in lookup order.
valueKeys = ["error", "from", "code"]
# Matches OS-level errors so the trailing, file-specific text can be dropped.
reOSError = re.compile("(Win|Unix) error (\\d+) .*", re.DOTALL)
# Matches "Invalid value for property '<x>': <detail>" so the detail can be dropped.
reInvalidProperty = re.compile("(Invalid value for property '.*'): .*$")

def getPlatform(ping):
    """Classify a ping as "android", "iOS" or "desktop" from its "application" section.

    A missing application name, or the name "Fennec", is treated as android
    (the original author marked this guess as uncertain). Otherwise an
    "architecture" of "arm" means iOS and anything else means desktop.

    Raises KeyError if the ping has no "application" entry.
    """
    app = ping["application"]
    named = "name" in app
    if not named or app.get("name") == "Fennec":
        return "android" # ??
    if app.get("architecture") == "arm":
        return "iOS"
    return "desktop"

def normalizeFailure(failure):
    """Build a stable "<name>/<value>" grouping key for a failure record.

    The value is the stripped text of the first key in valueKeys that the
    record contains ("<none>" if it has none of them). Parts of known messages
    that vary between reports (URLs, byte counts, timestamps, ids, file names)
    are replaced with fixed text so equivalent failures share one key.

    Raises KeyError if the record has no "name".
    """
    name = failure["name"]

    # Detail text comes from the first matching key, in valueKeys order.
    key = next((candidate for candidate in valueKeys if candidate in failure), None)
    value = "<none>" if key is None else unicode(failure[key]).strip()

    # Messages that collapse to a fixed string as soon as their prefix matches.
    fixedPrefixes = (
        # has number of bytes, which varies
        ("Error: Payload too big",
         "Error: Payload too big"),
        # has a timestamp, which varies
        ("Error: X-Last-Modified changed in the middle of a download batch",
         "Error: X-Last-Modified changed in the middle of a download batch"),
        ("Error: no salt available for",
         "Error: no salt available for <extension-id> - how did this happen?"),
        ("Error: you cannot set the clients.lastRecordUpload pref to the number",
         "Error: you cannot set the clients.lastRecordUpload pref to the number <some large value> as number pref values must be in the signed 32-bit integer range"),
    )

    if value.endswith("is not a valid URL."): # has the URL (since fixed for newer pings)
        value = "<URL> is not a valid URL."
    elif value.startswith("Error: Invalid value for property"):
        # keep only the first two colon-separated pieces
        value = ":".join(value.split(":")[:2])
    elif value.startswith("TypeError: date:") and "cannot be a future date" in value:
        value = "TypeError: date: <some date> cannot be a future date"
    else:
        for prefix, canonical in fixedPrefixes:
            if value.startswith(prefix):
                value = canonical
                break
        else:
            # this kinda sucks as we lose the filename being operated on, but in many cases it
            # doesn't actually matter (eg, "disk is full" error isn't really related to a specific file)
            value = reOSError.sub("\\1 Error \\2", value)
            value = reInvalidProperty.sub("\\1", value)
    return name + "/" + value

def mapFailures(ping):
    """Flatten one sync ping into per-engine (day, counts) records.

    Returns a list of ``(day, {engine_name: {failure: 1}})`` tuples, one for
    each engine of each sync whose timestamp lies within ``[earliest, latest]``.
    ``failure`` is the key produced by normalizeFailure, or None when the
    engine has no "failureReason". Pings that are missing
    ``payload``/``syncs``/``application``, or that getPlatform does not
    classify as "desktop", yield an empty list.
    """
    try:
        syncs = ping["payload"]["syncs"]
        if getPlatform(ping) != "desktop":
            return []
    except KeyError:
        return []
    result = []
    for sync in syncs:
        try:
            stamp = sync["when"] # presumably epoch milliseconds - it is divided by 1000 below
            day = datetime.date.fromtimestamp(stamp / 1000.0)
            when = stamp / 1000
        except (KeyError, TypeError, ValueError, OverflowError, OSError):
            # ignore missing, non-numeric or out-of-range timestamps rather than
            # letting a single malformed record abort the whole job.
            continue
        if when < earliest or when > latest:
            continue
        for engine in sync.get("engines", []):
            # hrm - seeing records with no name, and apparently non-strings (a float!?) in others
            # Validate the name first so no work is done for records we drop anyway.
            engineName = engine.get("name")
            if type(engineName) != unicode:
                continue
            if "failureReason" in engine:
                failure = normalizeFailure(engine["failureReason"])
            else:
                failure = None
            result.append((day, { engineName: { failure: 1 } }))
    return result

# Sample "sync" pings submitted inside the padded date window and flatten each
# one into the per-engine records produced by mapFailures.
s = get_pings(
    sc,
    doc_type='sync',
    submission_date=(earliest_ping, latest_ping),
    fraction=fraction
).flatMap(mapFailures)

fetching 23727.59884MB in 202366 files...

In [2]:

def reduceCounts(a, b):
    """Merge two nested count dicts into a new dict, summing the leaf counts.

    Values that are dicts are merged recursively; every other value is treated
    as a count and added to the matching entry of ``a`` (0 when absent).
    Neither input is modified, so the function is safe to use as a reducer.
    """
    result = a.copy()
    for name, value in b.items():
        if isinstance(value, dict):
            # Nested level: merge into a fresh dict so the inputs stay untouched.
            result[name] = reduceCounts(result.get(name, {}), value)
        else:
            result[name] = result.get(name, 0) + value
    return result

# Combine all records that share a key (the day emitted by mapFailures) into a
# single nested {engine: {failure: count}} dict, merging pairs with reduceCounts.
summaries = s.reduceByKey(reduceCounts)

In [ ]:

# regular list, sorted by date.
# collect() fetches every element in a single job; take(count()) evaluated the
# pipeline once to count and again to fetch (no cache() is applied above).
ssummaries = sorted(summaries.collect(), key=lambda x: x[0])

In [ ]:

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.dates import date2num
from pylab import rcParams
import itertools

# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'

# Figure size and a small font for every figure created below.
rcParams['figure.figsize'] = 12, 10
rcParams.update({'font.size': 6})

# x-axis positions: one matplotlib date number per entry of ssummaries, in order.
ticks = []

# keyed by engineName; each value is a dict keyed by failure name whose value is a
# list. The lists start out empty here and get one entry per day appended later.
engineVals = {}

for (date, engineSummaries) in ssummaries:
    ticks.append(date2num(date))
    for engineName, engineSummary in engineSummaries.items():
        engineVals.setdefault(engineName, {})
        for failure in engineSummary.keys():
            if failure is not None: # only taking errors
                engineVals[engineName][failure] = []

# For every engine: plot the daily success percentage together with the
# (sample-scaled) total number of syncs, then - when the engine has failures -
# a second figure showing each failure's share of that day's failures.
for engineName in sorted(engineVals.keys()):
    print(engineName)
    # count of success/failures recorded per day.
    total_successes = [0] * len(ticks)
    total_failures = [0] * len(ticks)

    for i, (date, allSummaries) in enumerate(ssummaries):
        summary = allSummaries.get(engineName, {})
        # get totals first. A key of None marks a sync without a failure.
        for name, count in summary.items():
            if name is None:
                total_successes[i] += count
            else:
                total_failures[i] += count

        # then record each failure as a percentage of this day's failures.
        for name, percs in engineVals[engineName].items():
            this = summary.get(name, 0)
            perc = (100.0 * this / total_failures[i]) if total_failures[i] else 0
            percs.append(perc)

    # Success percentage per day. Only days without any syncs are left as None:
    # a day with no failures is 100% and a day with no successes is 0%, not a gap.
    vals = [(100.0 * ok / (ok + bad)) if (ok + bad) else None
            for ok, bad in zip(total_successes, total_failures)]
    fig, ax = plt.subplots()
    # Values of all None cause an exception creating the graph, so only plot the
    # success line when at least one failure was recorded (which guarantees a
    # non-None value for that day).
    hasErrors = any(total_failures)
    if not hasErrors:
        print("  no error results for this engine")
        print("")
    else:
        ax.plot_date(ticks,
                     vals,
                     ls='-',
                     label=engineName + " - Percentage of Successful Syncs")
        ax.legend(loc='upper left')
        ax.get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(x, ".3f")))

    # and plot the total syncs, scaled up from the sampled fraction. The last
    # entry is left out of this series.
    totals = [(ok + bad) / fraction for ok, bad in zip(total_successes, total_failures)]
    ax2 = ax.twinx()
    ax2.plot_date(ticks[:-1],
                  totals[:-1],
                  ls=":",
                  color="r",
                  fillstyle="full",
                  marker=None,
                  label="Total Number of Syncs (millions)")
    ax2.set_ylabel('Total Syncs')
    #ax2.tick_params('y', colors='r')
    ax2.legend(loc='upper right')
    # whole millions, with thousands separators.
    ax2.get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x) // 1000000, ',')))

    if hasErrors:
        fig, ax = plt.subplots()

        # magic to support many items, each with different styles.
        colors = ('k','y','m','c','b','g','r')
        linestyles = ('-','--','-.',':')
        styles = itertools.cycle([color+linestyle for linestyle in linestyles for color in colors])
        markers = itertools.cycle(('+', 'o', '*', 'v', '^', '<', '>', 'x', 'D'))

        # sort the names so the one with the total max numbers for the last 7 days are first.
        errorNames = sorted(engineVals[engineName], key=lambda n: -sum(engineVals[engineName][n][-7:]))
        # Only plot the top 1/2 (but at least 5)
        numToPlot = max(len(errorNames) // 2, 5)
        toPlot = errorNames[:numToPlot]
        toSummarize = errorNames[numToPlot:]
        for name in toPlot:
            data = engineVals[engineName][name]
            # escape "$" so matplotlib does not treat the label as math text.
            name = name.replace("$", "\\$")
            ax.plot_date(ticks, data, next(styles), marker=next(markers), label=name)
        # and "others": the per-day sum of everything not plotted individually.
        data = [0] * len(ticks)
        for name in toSummarize:
            for (i, this) in enumerate(engineVals[engineName][name]):
                data[i] += this
        ax.plot_date(ticks, data, next(styles), marker=next(markers), label="Other errors")

        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, mode="expand", handlelength=10) # magic to get the legend outside the graph.
        ax.set_title(engineName + " - failure percentages")

    plt.show()

addons

addresses

bookmarks

clients