In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
from datetime import date, timedelta

%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [2]:
yesterday = (date.today() - timedelta(1)).strftime("%Y%m%d")
yesterday
Out[2]:
'20150819'
In [3]:
pings = get_pings(sc, app="Firefox", channel="nightly", submission_date=yesterday, fraction=0.1)
In [4]:
def remove_clientid(ping):
    # Drop the client identifier once deduplication per client is done.
    ping.pop("clientID")
    return ping

def get_keyed_aggregates(pings, histogram_name):
    pings = pings.map(lambda p: json.loads(p))
    
    # Collect the distinct keys observed for this keyed histogram, skipping keys
    # containing "/" since that would clash with the property-path syntax below.
    keys = pings.flatMap(lambda p: p.get("keyedHistograms", {}).get(histogram_name, {}).keys()).distinct().collect()
    keys = filter(lambda x: "/" not in x, keys)
    properties = map(lambda k: "keyedHistograms/{}/{}".format(histogram_name, k), keys) + ["clientID"]

    # Keep one ping per client, then drop the client id before aggregating.
    subset = get_one_ping_per_client(get_pings_properties(pings, properties)).map(remove_clientid)
    
    # Sum the histograms across clients, key by key.
    histograms = subset.flatMap(lambda p: list(p.iteritems()))
    aggregates = pd.Series(dict(histograms.reduceByKey(lambda x, y: x + y).collect()))
    
    summary_frame = pd.DataFrame({"histogram": aggregates,
                                  "max": aggregates.map(lambda agg: agg.index[agg > 0].max()),
                                  "count": aggregates.map(lambda agg: agg.sum())})

    summary_frame = summary_frame.sort(["max", "count"], ascending=False)
    
    # Plot up to 100 aggregated histograms, skipping keys that end with a
    # "_parent"/"_children" suffix.
    cnt = 0
    for key, histogram in summary_frame["histogram"].iteritems():
        if key.endswith("_parent") or key.endswith("_children"):
            continue
            
        if cnt >= 100:
            return
        
        print key
        histogram.plot(kind="bar")
        plt.ylabel("count")
        plt.show()
        cnt += 1
In [5]:
get_keyed_aggregates(pings, "MISBEHAVING_ADDONS_JANK_LEVEL")
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-5-030bf240003a> in <module>()
----> 1 get_keyed_aggregates(pings, "MISBEHAVING_ADDONS_JANK_LEVEL")

<ipython-input-4-953408991a2f> in get_keyed_aggregates(pings, histogram_name)
     12 
     13     histograms = subset.flatMap(lambda p: list(p.iteritems()))
---> 14     aggregates = pd.Series(dict(histograms.reduceByKey(lambda x, y: x + y).collect()))
     15 
     16     summary_frame = pd.DataFrame({"histogram": aggregates,

/home/hadoop/spark/python/pyspark/rdd.py in collect(self)
    711         """
    712         with SCCallSiteSync(self.context) as css:
--> 713             port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
    714         return list(_load_from_socket(port, self._jrdd_deserializer))
    715 

/home/hadoop/anaconda/lib/python2.7/site-packages/py4j/java_gateway.pyc in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538                 self.target_id, self.name)
    539 
    540         for temp_arg in temp_args:

/home/hadoop/anaconda/lib/python2.7/site-packages/py4j/protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
    298                 raise Py4JJavaError(
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:
    302                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 12 in stage 6.0 failed 1 times, most recent failure: Lost task 12.0 in stage 6.0 (TID 6069, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/hadoop/spark/python/pyspark/worker.py", line 101, in main
    process()
  File "/home/hadoop/spark/python/pyspark/worker.py", line 96, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/hadoop/spark/python/pyspark/rdd.py", line 2252, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/hadoop/spark/python/pyspark/rdd.py", line 2252, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/home/hadoop/spark/python/pyspark/rdd.py", line 282, in func
    return f(iterator)
  File "/home/hadoop/spark/python/pyspark/rdd.py", line 1705, in combineLocally
    merger.mergeValues(iterator)
  File "/home/hadoop/spark/python/pyspark/shuffle.py", line 253, in mergeValues
    d[k] = comb(d[k], v) if k in d else creator(v)
  File "<ipython-input-4-953408991a2f>", line 14, in <lambda>
TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

	at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:135)
	at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:176)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:311)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:64)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1204)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1193)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1192)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
In [ ]:
get_keyed_aggregates(pings, "MISBEHAVING_ADDONS_CPOW_TIME_MS")