The output of this notebook is used to fill in the installation counts shown on testpilot.firefox.com. The way we calculate these counts is: a hand-estimated baseline per experiment, plus counted "enabled" events, minus counted "disabled" events from the Test Pilot event pings.
Please note these numbers are calculated for the sole purpose of giving an approximation to end-users and should absolutely not be used for decision-making.
TODO: This notebook will eventually become slow (when we end up with a lot of TxP event pings) at which point we should start memoizing daily event counts instead of counting everything every time.
# Show the cluster's default task parallelism (sanity check on the Spark context)
sc.defaultParallelism
96
from moztelemetry import get_pings, get_pings_properties
from collections import defaultdict
import json
import time
import boto3
from boto3.s3.transfer import S3Transfer
# Fraction of pings to sample; 1 means process every ping
FRACTION = 1
Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
# First, grab ids for all the clients who have sent us new-style pings.
date_range = ('20160804', '29991231')  # oldest event-based ping dates from 2016-08-04
testpilot_pings = get_pings(
    sc,
    doc_type="testpilot",
    app="Firefox",
    submission_date=date_range,
    fraction=FRACTION,
)
testpilot_fields = [
    "payload/version",
    "clientId",
    "payload/tests",
    "payload/events",
]
testpilot_subset = get_pings_properties(testpilot_pings, testpilot_fields)
# Version-1 pings predate the event format; keep only the newer ones.
new_ping_subset = testpilot_subset.filter(lambda ping: ping['payload/version'] != 1)
active_clients = new_ping_subset.map(lambda ping: (ping["clientId"], True)).distinct()
active_clients.count()
53572
# Baseline installs: approximately how many installations each experiment had
# before the TxP event-based ping changes.
baseline_installations = dict([
    (u'@activity-streams', 22000),
    (u'@testpilot-addon', 30000),  # This number is *totally* made up, do not use for anything important
    (u'tabcentertest1@mozilla.com', 20000),
    (u'universal-search@mozilla.com', 33500),
    (u'wayback_machine@mozilla.org', 3400),
])
# Now we want to grab enabled/disabled events from the new-style pings.
new_style_events = new_ping_subset.flatMap(lambda ping: ping.get('payload/events', []))
# Count 'enabled' events per experiment id.
enable_events = (new_style_events
                 .filter(lambda evt: evt.get('event', None) == 'enabled')
                 .map(lambda evt: evt['object'])
                 .countByValue())
enable_events
defaultdict(int, {u'@activity-streams': 43428, u'@foo-bar': 2, u'@min-vid': 93, u'@testpilot-addon': 22402, u'@x16': 77, u'blok@mozilla.org': 134, u'jid1-NeEaf3sAHdKHPA@jetpack': 56, u'tabcentertest1@mozilla.com': 13137, u'universal-search@mozilla.com': 29086, u'wayback_machine@mozilla.org': 19217})
# Count 'disabled' events per experiment id.
disable_events = (new_style_events
                  .filter(lambda evt: evt.get('event', None) == 'disabled')
                  .map(lambda evt: evt['object'])
                  .countByValue())
# Negate the disable counts so they subtract when summed with the other dicts.
for addon_id in disable_events:
    disable_events[addon_id] = -disable_events[addon_id]
disable_events
defaultdict(int, {u'@activity-streams': -2371, u'@foo-bar': -2, u'@min-vid': -62, u'@testpilot-addon': -3919, u'@x16': -70, u'blok@mozilla.org': -90, u'jid1-NeEaf3sAHdKHPA@jetpack': -37, u'tabcentertest1@mozilla.com': -3911, u'universal-search@mozilla.com': -1583, u'wayback_machine@mozilla.org': -1276})
# Add everything together
def join_dicts(dicts):
    """Merge dictionaries into a mapping of key -> list of values.

    For each key appearing in any input dict, collect its values (in the
    order the dicts are given) into a list under that key.
    """
    merged = defaultdict(list)
    for source in dicts:
        for key in source:
            merged[key].append(source[key])
    return merged
# Stack baseline, enable, and disable counts per experiment id into lists
final_counts = join_dicts([baseline_installations, enable_events, disable_events])
final_counts
defaultdict(list, {u'@activity-streams': [22000, 43428, -2371], u'@foo-bar': [2, -2], u'@min-vid': [93, -62], u'@testpilot-addon': [30000, 22402, -3919], u'@x16': [77, -70], u'blok@mozilla.org': [134, -90], u'jid1-NeEaf3sAHdKHPA@jetpack': [56, -37], u'tabcentertest1@mozilla.com': [20000, 13137, -3911], u'universal-search@mozilla.com': [33500, 29086, -1583], u'wayback_machine@mozilla.org': [3400, 19217, -1276]})
# Collapse each per-experiment list (baseline, +enables, -disables) into one total.
for addon_id in final_counts:
    final_counts[addon_id] = sum(final_counts[addon_id])
final_counts
defaultdict(list, {u'@activity-streams': 63057, u'@foo-bar': 0, u'@min-vid': 31, u'@testpilot-addon': 48483, u'@x16': 7, u'blok@mozilla.org': 44, u'jid1-NeEaf3sAHdKHPA@jetpack': 19, u'tabcentertest1@mozilla.com': 29226, u'universal-search@mozilla.com': 61003, u'wayback_machine@mozilla.org': 21341})
# Output this to json to write to file
# (json.dumps serializes the defaultdict just like a plain dict)
counts_json = json.dumps(final_counts)
# Not really necessary, but we're saving historical output to a timestamped file
timestamp = int(time.time())
timestamp
1474954083
# Write the counts locally, then publish the same file to S3 twice:
# once under a timestamped key (history) and once as latest.json.
timestamped_filename = "{}.json".format(timestamp)
latest_filename = "latest.json"
bucket = "telemetry-public-analysis-2"
path = "testpilot/data/installation-counts/"
timestamped_s3_key = "{}{}".format(path, timestamped_filename)
latest_s3_key = "{}{}".format(path, latest_filename)
with open(latest_filename, 'w') as f:
    f.write(counts_json)
client = boto3.client('s3', 'us-west-2')
transfer = S3Transfer(client)
for s3_key in (timestamped_s3_key, latest_s3_key):
    transfer.upload_file(latest_filename, bucket, s3_key,
                         extra_args={'ContentType': 'application/json'})