from moztelemetry import get_pings
import datetime
import time
import re
# --- Query parameters -------------------------------------------------------
channel = None # None, "nightly", etc
# Matched against ping["application"]["version"] below (e.g. "68.0a1").
versions = re.compile("6[8]\..*") # ".*" for all
fraction = 0.1 # fraction of pings to sample
ndays = 90 # number of days in the past
# Window (epoch seconds) that each sync's "when" timestamp must fall inside.
earliest = time.time() - ndays * 24 * 60 * 60
latest = time.time() # - 1 * 24 * 60 * 60 # one day back
# for efficiency, we only ask for ping submitted within these dates.
# (earliest is padded by 2 days, presumably to catch late-submitted pings.)
earliest_ping = (datetime.date.fromtimestamp(earliest) - datetime.timedelta(days=2)).strftime("%Y%m%d")
latest_ping = datetime.date.fromtimestamp(latest).strftime("%Y%m%d")
# Keys checked, in preference order, for a failureReason's detail value.
valueKeys = ["error", "from", "code"]
# Used to collapse failure strings that embed varying detail (codes, filenames).
reOSError = re.compile("(Win|Unix) error (\\d+) .*", re.DOTALL)
reInvalidProperty = re.compile("(Invalid value for property '.*'): .*$")
def getPlatform(ping):
    """Classify which product sent this ping from its "application" block.

    Returns one of "android", "iOS" or "desktop".
    """
    app = ping["application"]
    # No name, or a Fennec name, is taken to mean Android.
    if "name" not in app or app["name"] == "Fennec":
        return "android" # ??
    if app.get("architecture") == "arm":
        return "iOS"
    return "desktop"
def normalizeFailure(failure):
    """Turn one raw failureReason record into a stable "name/value" string.

    Messages embedding varying details (URLs, byte counts, timestamps, guids,
    filenames) are collapsed to a canonical form so identical failures
    aggregate together.
    """
    name = failure["name"]
    # Pick the first detail key present, in the preference order of valueKeys.
    value = "<none>"
    for key in valueKeys:
        if key in failure:
            value = unicode(failure[key]).strip()
            break
    # Messages whose whole tail varies: prefix -> canonical replacement.
    canonicalByPrefix = (
        ("Error: Payload too big",
         "Error: Payload too big"),
        ("Error: X-Last-Modified changed in the middle of a download batch",
         "Error: X-Last-Modified changed in the middle of a download batch"),
        ("Error: no salt available for",
         "Error: no salt available for <extension-id> - how did this happen?"),
        ("Error: you cannot set the clients.lastRecordUpload pref to the number",
         "Error: you cannot set the clients.lastRecordUpload pref to the number <some large value> as number pref values must be in the signed 32-bit integer range"),
    )
    if value.endswith("is not a valid URL."): # has the URL (since fixed for newer pings)
        value = "<URL> is not a valid URL."
    elif value.startswith("Error: Invalid value for property"):
        value = ":".join(value.split(":")[:2])
    elif value.startswith("TypeError: date:") and "cannot be a future date" in value:
        value = "TypeError: date: <some date> cannot be a future date"
    elif value.startswith("Error: Duplicate entry for") and value.endswith("in changeset"):
        value = "Error: Duplicate entry for <guid> in changeset"
    else:
        for prefix, canonical in canonicalByPrefix:
            if value.startswith(prefix):
                value = canonical
                break
        else:
            # this kinda sucks as we lose the filename being operated on, but in many cases it doesn't actually
            # matter (eg, "disk is full" error isn't really related to a specific file)
            value = reOSError.sub("\\1 Error \\2", value)
            value = reInvalidProperty.sub("\\1", value)
    return name + "/" + value
def mapFailures(ping):
    """Flat-map one sync ping into ((day, major-version), {engine: {failure: 1}}) tuples.

    Only desktop pings whose application version matches `versions` and whose
    individual syncs fall inside the [earliest, latest] window are emitted.
    A failure of None marks a successful engine sync.
    """
    try:
        syncs = ping["payload"]["syncs"]
        if getPlatform(ping) != "desktop":
            return []
        if versions.match(ping["application"]["version"]) is None:
            return []
    except KeyError:
        return []
    emitted = []
    major = ping["application"]["version"].split(".")[0]
    for sync in syncs:
        try:
            day = datetime.date.fromtimestamp(sync["when"] / 1000.0)
            when = sync["when"] / 1000
        except (TypeError, ValueError, KeyError):
            continue # ignore bad timestamps
        if not (earliest <= when <= latest):
            continue
        for engine in sync.get("engines", []):
            if type(engine) != dict: # seeing a float here too in one ping!
                continue
            if "failureReason" in engine:
                failure = normalizeFailure(engine["failureReason"])
            elif "status" in engine:
                failure = "status/%s" % engine.get("status")
            else:
                failure = None
            # hrm - seeing records with no name, and apparently non-strings (a float!?) in others
            if type(engine.get("name")) == unicode:
                emitted.append(((day, major), { engine["name"]: { failure: 1 } }))
    return emitted
# Kick off the Spark job: fetch sync pings for the channel/date-range/sample
# configured above and flat-map each into per-engine failure records.
# (SparkContext is provided by the notebook runtime.)
sc = SparkContext.getOrCreate()
# NOTE(review): get_pings is deprecated upstream, and its s3-file based
# "sampling" is not representative (see the warning in the cell output below).
s = get_pings(sc, doc_type='sync', submission_date=(earliest_ping, latest_ping), channel=channel, fraction=fraction).flatMap(mapFailures)
Call to deprecated function get_pings. WARNING: THIS IS NOT A REPRESENTATIVE SAMPLE. This 'sampling' is based on s3 files and is highly susceptible to skew. Use only for quicker performance while prototyping. fetching 99485.05580MB in 1061452 files...
def reduceCounts(a, b):
    """Merge nested count-dict b into a copy of a and return the result.

    Leaf values are numbers and get summed; any other value is treated as a
    nested count dict and merged recursively. Neither input is mutated.
    """
    merged = a.copy()
    for key, val in b.iteritems():
        if type(val) in (int, long):
            merged[key] = merged.get(key, 0) + val
        else:
            merged[key] = reduceCounts(merged.get(key, {}), val)
    return merged
# Merge all records sharing a (date, version) key into one nested count dict,
# then collect into a regular list, sorted by date.
summaries = s.reduceByKey(reduceCounts)
ssummaries = sorted(summaries.collect(), key=lambda x: x[0])
# This really is quite a mess - but it's quite complicated what we are trying to do...
# Get the number of errors per engine by version, simply so we can guess at what the
# "top" errors are for that engine/version.
# successByEngineByVersion: {engine: {version: {date: {"total": n, "failures": n}}}}
#   ("All Versions" is a synthetic version holding the per-date sums).
# errorCountsByEngineByVersion: {engine: {version: {error-string: count}}}
successByEngineByVersion = {}
errorCountsByEngineByVersion = {}
for ((date, version), engineSummaries) in ssummaries:
    # NOTE(review): the loop variable `s` shadows the RDD `s` above; nothing
    # reads the RDD afterwards, but a rename would be safer.
    for name, s in engineSummaries.iteritems():
        engineEntry = successByEngineByVersion.setdefault(name, {})
        verEntry = engineEntry.setdefault(version, {})
        # BUG FIX: "total" used to be seeded with s.get(None, 0) - the success
        # bucket - and the loop below then added *every* bucket (including
        # None) again, double-counting successes and inflating the success
        # percentage. Seed with zero; the loop alone accumulates the true
        # total (successes + failures).
        this = verEntry[date] = {"total": 0, "failures": 0}
        for error, count in s.iteritems():
            this["total"] += count
            if error is not None:
                this["failures"] += count
        allEntry = engineEntry.setdefault("All Versions", {})
        thisAll = allEntry.setdefault(date, {})
        thisAll["total"] = thisAll.get("total", 0) + this["total"]
        thisAll["failures"] = thisAll.get("failures", 0) + this["failures"]
        engineEntry = errorCountsByEngineByVersion.setdefault(name, {})
        verEntry = engineEntry.setdefault(version, {})
        for error, count in s.iteritems():
            if error is not None:
                verEntry[error] = verEntry.get(error, 0) + count
# The data structures we use to graph.
def buildErrorCountsByVersionByDate():
    """Roll ssummaries up into {engine: {version: {date: total-error-count}}}.

    The success bucket (error is None) is excluded - only real failures count.
    """
    rolled = {}
    for ((day, ver), engineSummaries) in ssummaries:
        for engineName, summary in engineSummaries.iteritems():
            perDate = rolled.setdefault(engineName, {}).setdefault(ver, {})
            for error, n in summary.iteritems():
                if error is None:
                    continue
                perDate[day] = perDate.get(day, 0) + n
    return rolled
# Build a map of total error counts per day, so we can work out the perc of each.
errorCountsByEngineByVersionByDate = buildErrorCountsByVersionByDate()
# failuresByEngineByVersionByError: {engine: {version: {error: [(date, perc)]}}}
# where perc is that error's share (0-100) of all errors for the engine/version
# on that date, restricted to the version's overall top-10 errors (everything
# else is lumped into "other").
failuresByEngineByVersionByError = {}
for ((date, version), engineSummaries) in ssummaries:
    #date = date2num(date)
    for name, s in engineSummaries.iteritems():
        engineEntry = failuresByEngineByVersionByError.setdefault(name, {})
        verEntry = engineEntry.setdefault(version, {})
        # total error count for this engine/version/date.
        total = errorCountsByEngineByVersionByDate.get(name, {}).get(version, {}).get(date, 0)
        # Take the top-n errors for this engine in this version across all dates,
        # and put the rest in "other"
        countsDict = errorCountsByEngineByVersion[name][version]
        topCounts = set(map(lambda d: d[0], sorted(countsDict.items(), key=lambda d: -d[1])[:10]))
        ourDict = {"other": 0}
        for error, count in s.iteritems():
            if error is None:
                # Success bucket: carried along so it participates in the sort
                # below, but skipped when emitting (the `is not None` check).
                ourDict[None] = count
            else:
                if error in topCounts:
                    ourDict[error] = count
                else:
                    ourDict["other"] += count
        # Emit errors in descending-count order as (date, percentage) points.
        for error, count in sorted(ourDict.iteritems(), key=lambda d: -d[1]):
            if error is not None:
                perc = (100.0 * count / total) if total else 0
                errorEntries = verEntry.setdefault(error, [])
                errorEntries.append((date, perc))
# --- Plotting setup ----------------------------------------------------------
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.dates import date2num
from pylab import rcParams
import itertools
# IPython/Jupyter magic: render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
rcParams['figure.figsize'] = 12, 10
rcParams.update({'font.size': 6})
# magic to support many items, each with different
# styles, but same "key" is always the same style.
class StyleGetter:
    """Hands out matplotlib line-style/marker pairs, always returning the
    same pair for the same key."""
    colors=('k','y','m','c','b','g','r')
    linestyles=('-','--','-.',':')
    def __init__(self):
        # Every color is exhausted for one linestyle before moving to the next.
        combos = [color + linestyle for linestyle in self.linestyles for color in self.colors]
        self.styles = itertools.cycle(combos)
        self.markers = itertools.cycle(('+', 'o', '*', 'v', '^', '<', '>', 'x', 'D'))
        self.map = {}
    def get(self, key):
        """Return (and remember) the {"style", "marker"} dict for key."""
        if key not in self.map:
            self.map[key] = {"style": self.styles.next(),
                             "marker": self.markers.next()}
        return self.map[key]
styleGetter = StyleGetter();
# One figure per engine: left axis = percentage of successful syncs per
# version, right (twin) axis = total syncs per version in millions. Then one
# figure per engine/version showing each top error's share of failures.
for engineName in sorted(successByEngineByVersion.keys()):
    print engineName
    colorCycle = itertools.cycle(('k','y','m','c','r','g','b'))
    fig, ax = plt.subplots()
    # We use a second axis for total syncs for the version.
    ax2 = ax.twinx()
    byVersion = successByEngineByVersion[engineName]
    # NOTE(review): `totals` is accumulated below but apparently never read
    # afterwards (at least within this cell) - looks like dead code.
    totals = {}
    for version in sorted(byVersion.keys()):
        verTotals = {}
        ticks = []
        vals = []
        byDate = byVersion[version]
        for date in sorted(byDate.keys()):
            ticks.append(date2num(date))
            this = byDate[date]
            # Success percentage for this engine/version/date.
            nsuccess = this["total"] - this.get("failures", 0)
            perc = (100.0 * nsuccess / this["total"]) if this["total"] else 0
            vals.append(perc)
            verTotals[date] = byDate[date]["total"]
            totals[date] = totals.get(date, 0) + byDate[date]["total"]
        color = colorCycle.next()
        ax.plot_date(ticks,
                     vals,
                     color=color,
                     ls='-',
                     marker=None,
                     label=engineName + " - " + version + " - Percentage of Successful Syncs")
        ax.legend(loc='lower left')
        ax.get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(x, ".3f")))
        # Zoom right in - success rates cluster very close to 100%.
        ax.set_ylim([99.5, 100])
        # and plot the total syncs for this version.
        verTotals = sorted(verTotals.items())
        ax2.plot_date(map(lambda a: date2num(a[0]), verTotals), # ticks
                      map(lambda a: a[1], verTotals), # vals,
                      ls="--",
                      color=color,
                      fillstyle="full",
                      marker=None,
                      label="Total Syncs " + version + " (millions)")
        ax2.set_ylabel('Total Syncs')
        #ax2.tick_params('y', colors='r')
        ax2.legend(loc='lower right')
        # Label the right axis in millions (Python 2 integer division).
        ax2.get_yaxis().set_major_formatter(ticker.FuncFormatter(lambda x, p: format(int(x)/1000000, ',')))
    # One graph per version of failure rates for this engine.
    byVersion = failuresByEngineByVersionByError[engineName]
    for version in sorted(byVersion.keys()):
        fig, ax = plt.subplots()
        ax.set_title(engineName + " - " + version)
        byError = byVersion[version]
        for error, details in byError.items():
            # Same error always gets the same style/marker across figures.
            style = styleGetter.get(error)
            ax.plot_date(map(lambda d: d[0], details),
                         map(lambda d: d[1], details),
                         style["style"],
                         marker=style["marker"],
                         label=version + " - " + error)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, mode="expand", handlelength=10) # magic to get the legend outside the graph.
plt.show()
addons
addresses
bookmarks
bookmarks-buffered
clients
creditcards
extension-storage
forms
history
passwords
prefs
tabs