import datetime as dt
import pandas as pd
import ujson as json
from pyspark.sql.types import *
from moztelemetry import get_pings, get_pings_properties
%pylab inline
Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
/home/hadoop/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment. warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
Populating the interactive namespace from numpy and matplotlib
Take the set of pings, keep only those that have an actual clientId, and remove duplicate pings (pings sharing the same documentId). We keep one ping per unique documentId.
def dedupe_pings(rdd):
    """Return the pings of ``rdd`` with a client id, one per document id.

    Pings whose "meta/clientId" is None are discarded. The remaining pings
    are keyed by "meta/documentId" and collapsed so that a single ping
    survives per key; the reducer simply keeps its first argument, so which
    duplicate survives depends on the order in which they are reduced.
    """
    has_client = rdd.filter(lambda ping: ping["meta/clientId"] is not None)
    by_document = has_client.map(lambda ping: (ping["meta/documentId"], ping))
    one_per_document = by_document.reduceByKey(lambda kept, _duplicate: kept)
    # Drop the document-id key again; callers only want the pings.
    return one_per_document.map(lambda pair: pair[1])
Transform and sanitize the pings into arrays.
def transform(ping):
    """Flatten a single ping into a row of sanitized column values.

    ``ping`` is a mapping indexed by property path (e.g. "meta/clientId").
    Optional fields that are missing (None) or malformed are emitted as
    None rather than raising, so one bad ping cannot abort the whole job.

    Returns a list, in this order:
        [clientId, profileDate, submissionDate, creationDate, appVersion,
         osVersion, locale, defaultSearch, device, arch]

    Raises whatever ``strptime`` raises if "meta/submissionDate" is missing
    or not in YYYYMMDD form; that field is expected to always be present.
    """
    # Should not be None since we filter those out.
    clientId = ping["meta/clientId"]

    # The profile creation date arrives as a day count since 1970-01-01.
    profileDate = None
    profileDaynum = ping["environment/profile/creationDate"]
    if profileDaynum is not None:
        try:
            # Bad data can be non-numeric (TypeError/ValueError) or so large
            # that timedelta/datetime overflow their supported range.
            profileDate = dt.datetime(1970, 1, 1) + dt.timedelta(int(profileDaynum))
        except (TypeError, ValueError, OverflowError):
            profileDate = None

    # Create date should already be in ISO format.
    creationDate = ping["creationDate"]
    if creationDate is not None:
        try:
            # This is only accurate because we know the creation date is
            # always in 'Z' (zulu) time.
            creationDate = dt.datetime.strptime(creationDate, "%Y-%m-%dT%H:%M:%S.%fZ")
        except (TypeError, ValueError):
            # Unparseable timestamp: treat it the same as a missing one.
            creationDate = None

    # Added via the ingestion process so should not be None.
    submissionDate = dt.datetime.strptime(ping["meta/submissionDate"], "%Y%m%d")

    appVersion = ping["application/version"]

    osVersion = ping["environment/system/os/version"]
    if osVersion is not None:
        try:
            osVersion = int(osVersion)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric OS version: keep the row, drop the value.
            osVersion = None

    locale = ping["environment/settings/locale"]

    # Truncate to 32 characters.
    defaultSearch = ping["environment/settings/defaultSearchEngine"]
    if defaultSearch is not None:
        defaultSearch = defaultSearch[0:32]

    # Build up the device string, truncating like we do in 'core' ping.
    # When only one of manufacturer/model is present, the manufacturer value
    # (possibly None) is passed through unchanged.
    device = ping["environment/system/device/manufacturer"]
    model = ping["environment/system/device/model"]
    if device is not None and model is not None:
        device = device[0:12] + "-" + model[0:19]

    # Anything that does not mention x86 in its ABI string is reported as arm.
    xpcomABI = ping["application/xpcomAbi"]
    arch = "arm"
    if xpcomABI is not None and "x86" in xpcomABI:
        arch = "x86"

    return [clientId, profileDate, submissionDate, creationDate, appVersion,
            osVersion, locale, defaultSearch, device, arch]
Create a set of pings from "saved-session" to build a set of core client data. Output the data to Parquet on S3.
This script loops over a range of days and, for each day, writes one output per channel. Use explicit date ranges for backfilling, or now() - 1 day for automated runs.
# Release channels to export; each (channel, day) pair is written separately.
channels = ["nightly", "aurora", "beta", "release"]

# Inclusive day range to process. By default both ends are "yesterday";
# replace with explicit datetimes to backfill.
start = dt.datetime.now() - dt.timedelta(1)
end = dt.datetime.now() - dt.timedelta(1)

# Ping properties pulled out of every raw ping before de-duplication.
ping_properties = [
    "meta/clientId",
    "meta/documentId",
    "meta/submissionDate",
    "creationDate",
    "application/version",
    "environment/system/os/version",
    "environment/profile/creationDate",
    "environment/settings/locale",
    "environment/settings/defaultSearchEngine",
    "environment/system/device/model",
    "environment/system/device/manufacturer",
    "application/xpcomAbi",
]

# Column layout of the output; field order matches the list built by
# transform(). The schema is identical for every channel and day.
schema = StructType([
    StructField("clientid", StringType(), False),
    StructField("profiledate", TimestampType(), True),
    StructField("submissiondate", TimestampType(), False),
    StructField("creationdate", TimestampType(), True),
    StructField("appversion", StringType(), True),
    StructField("osversion", IntegerType(), True),
    StructField("locale", StringType(), True),
    StructField("defaultsearch", StringType(), True),
    StructField("device", StringType(), True),
    StructField("arch", StringType(), True),
])

day = start
while day <= end:
    day_str = day.strftime("%Y%m%d")
    for channel in channels:
        print("\nchannel: " + channel + ", date: " + day_str)

        # Fetch every Fennec ping submitted on this day for this channel.
        pings = get_pings(
            sc,
            app="Fennec",
            channel=channel,
            submission_date=(day_str, day_str),
            build_id=("20100101000000", "99999999999999"),
            fraction=1,
        )

        subset = get_pings_properties(pings, ping_properties)
        subset = dedupe_pings(subset)
        print("\nDe-duped pings:")
        print(subset.first())

        transformed = subset.map(transform)
        print("\nTransformed pings:")
        print(transformed.first())

        # One output location per channel and submission day.
        s3_output = "s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/android_clients"
        s3_output += "/v1/channel={0}/submission={1}".format(channel, day_str)

        grouped = sqlContext.createDataFrame(transformed, schema)
        # coalesce(1) collapses the frame to a single partition before writing.
        grouped.coalesce(1).write.parquet(s3_output)

    day += dt.timedelta(1)
channel: nightly, date: 20161105 De-duped pings: {'environment/settings/locale': u'es-ES', 'meta/submissionDate': u'20161105', 'environment/system/os/version': 17, 'application/version': u'42.0a1', 'environment/profile/creationDate': None, 'environment/system/device/model': u'Desire HD', 'meta/clientId': u'0b847511-95ed-4f50-90b1-e44e917f2581', 'environment/system/device/manufacturer': u'HTC', 'creationDate': u'2016-11-05T20:38:39.992Z', 'environment/settings/defaultSearchEngine': u'google', 'application/xpcomAbi': u'arm-eabi-gcc3', 'meta/documentId': u'3c705724-99da-429a-aef6-7fcf5976c34e'} Transformed pings: [u'e6fd2744-29ee-4bfb-89e2-3147643be2fb', datetime.datetime(2016, 3, 27, 0, 0), datetime.datetime(2016, 11, 5, 0, 0), datetime.datetime(2016, 11, 5, 3, 32, 3, 704000), u'52.0a1', 23, u'en-US', u'bing', u'Xiaomi-MI 5', 'arm'] channel: aurora, date: 20161105 De-duped pings: {'environment/settings/locale': u'de', 'meta/submissionDate': u'20161105', 'environment/system/os/version': 18, 'application/version': u'44.0a2', 'environment/profile/creationDate': None, 'environment/system/device/model': u'GT-N7100', 'meta/clientId': u'ed673966-1473-4075-b6ca-6ca88fd9b4e8', 'environment/system/device/manufacturer': u'samsung', 'creationDate': u'2016-11-05T06:07:27.868Z', 'environment/settings/defaultSearchEngine': u'google', 'application/xpcomAbi': u'arm-eabi-gcc3', 'meta/documentId': u'a38d8f31-fb8d-4839-8cdc-891f7007d916'} Transformed pings: [u'ed673966-1473-4075-b6ca-6ca88fd9b4e8', None, datetime.datetime(2016, 11, 5, 0, 0), datetime.datetime(2016, 11, 5, 6, 7, 27, 868000), u'44.0a2', 18, u'de', u'google', u'samsung-GT-N7100', 'arm'] channel: beta, date: 20161105 De-duped pings: {'environment/settings/locale': u'en-US', 'meta/submissionDate': u'20161105', 'environment/system/os/version': 18, 'application/version': u'42.0', 'environment/profile/creationDate': None, 'environment/system/device/model': u'GT-I9300', 'meta/clientId': u'524f763c-7124-4dd9-b470-0995ef47fda1', 
'environment/system/device/manufacturer': u'samsung', 'creationDate': u'2016-11-04T23:30:28.321Z', 'environment/settings/defaultSearchEngine': u'google', 'application/xpcomAbi': u'arm-eabi-gcc3', 'meta/documentId': u'd0f10cec-1cc7-4c4d-aba3-ef0b07fe6e73'} Transformed pings: [u'92fdb9ff-0148-4f25-9e1a-32f301c84062', None, datetime.datetime(2016, 11, 5, 0, 0), datetime.datetime(2016, 11, 5, 7, 34, 11, 495000), u'44.0', 21, u'en-US', u'google', u'HUAWEI-ALE-L21', 'arm'] channel: release, date: 20161105 De-duped pings: {'environment/settings/locale': u'de', 'meta/submissionDate': u'20161105', 'environment/system/os/version': 23, 'application/version': u'49.0.2', 'environment/profile/creationDate': 16985, 'environment/system/device/model': u'SM-G935F', 'meta/clientId': u'1f1e6b6c-b26d-49f4-9b55-2c58c3516e37', 'environment/system/device/manufacturer': u'samsung', 'creationDate': u'2016-11-05T12:25:21.142Z', 'environment/settings/defaultSearchEngine': u'google', 'application/xpcomAbi': u'arm-eabi-gcc3', 'meta/documentId': u'019d1f2f-e3fd-4d78-b0ee-f60a1532f187'} Transformed pings: [u'1f1e6b6c-b26d-49f4-9b55-2c58c3516e37', datetime.datetime(2016, 7, 3, 0, 0), datetime.datetime(2016, 11, 5, 0, 0), datetime.datetime(2016, 11, 5, 12, 25, 21, 142000), u'49.0.2', 23, u'de', u'google', u'samsung-SM-G935F', 'arm']