Snapchat Leak

Snapchat let 4.6M usernames and phone numbers get out. This notebook takes that data and explores which usernames are most prevalent in each area code. Usernames with "love" in them, for example, are more common in California and Boston. "lynn" is more common in the South. "5280" is very common in Denver, which is an interesting one to investigate (it's an elevation).

Algorithm Shop

This notebook goes along with this algorithmshop.com post. Check out the post for animated visualizations of this data on a map of the US.

Getting Data

The leaked usernames and phone numbers came from here

The geocoding came from taking the area code names from that file and running them through this Geocode. You will have to geocode these places yourself, since there are licensing issues.

In [1]:
import pandas as pd

# Location table used for the map export at the end of the notebook
# (tab-separated file).
locations = pd.read_csv('locations.tsv', sep='\t')

# The leak file has no header row, so the three columns are named here and
# every row is indexed by its location name. The phone numbers are never
# used, so that column is dropped straight away.
all_users = pd.read_csv(
    'schat.csv',
    header=None,
    names=['numbers', 'username', 'location'],
    index_col='location',
).drop('numbers', axis=1)
In [2]:
# Peek at the first ten rows.
all_users.iloc[:10]
Out[2]:
username
location
Manhattan slthornton
Manhattan strict_daddy4u
Manhattan whoknew69
Manhattan testingtesting
Manhattan s.fullb13
Manhattan gavan_smith
Manhattan thismyusername
Manhattan erinspickles
Manhattan flyinghorses
Manhattan saraelizabeth98
In [3]:
from collections import defaultdict


def get_subs(s, n):
    """Lazily yield each length-n window of s, scanning left to right.

    Nothing is yielded when s is shorter than n.
    """
    last_start = len(s) - n
    start = 0
    while start <= last_start:
        yield s[start:start + n]
        start += 1
        

def get_len_n_dict(n, lower_bound):
    """Count every length-n substring across all usernames in `all_users`.

    A substring that appears several times inside one username is counted
    once per appearance. Returns a plain dict mapping substring -> count,
    keeping only substrings seen at least `lower_bound` times.
    """
    tally = defaultdict(int)
    for username in all_users.username:
        for piece in get_subs(username, n):
            tally[piece] += 1
    return {piece: count
            for piece, count in tally.items()
            if count >= lower_bound}
In [22]:
# Find substrings of a certain length that occur at least a minimum
# number of times across all usernames:

# Length, in characters, of the substrings being counted.
SUBSTRING_LENGTH = 4
# A substring must occur at least this many times in total to be kept.
MIN_OCCURENCE = 60
# Maps each retained substring to its total occurrence count.
substring_counts = get_len_n_dict(SUBSTRING_LENGTH, MIN_OCCURENCE)
In [24]:
# NOTE(review): '5820' looks like a transposition of '5280', the substring
# reported for Denver further down in this notebook -- confirm which one
# was meant to be checked here.
'5820' in substring_counts
Out[24]:
False
In [5]:
# The ten most frequent substrings as (substring, count) pairs,
# highest count first.
from heapq import nlargest

nlargest(
    10,
    substring_counts.items(),
    key=lambda pair: pair[1],
)
Out[5]:
[('love', 51034),
 ('mari', 48834),
 ('anna', 33225),
 ('elle', 31469),
 ('alex', 28967),
 ('chri', 28713),
 ('hris', 28330),
 ('arie', 26568),
 ('stin', 26260),
 ('chel', 26169)]
In [6]:
# Number of leaked usernames per location, held in a single column
# called 'users'.
counts_per_area = (
    all_users
    .groupby(level=0)
    .agg(len)
    .rename(columns={'username': 'users'})
)
counts_per_area[0:10]
Out[6]:
users
location
Arkansas 28940
Boston 41857
Boulder-Denver 139265
Bronx, Queens, Brooklyn 51086
Buffalo 144939
Canadian territories in the Arctic far north 31
Central Arizona 35631
Central Florida 3258
Central Georgia 1396
Central Texas 1542
In [7]:
# Now get counts of each substring by state
def counts_in_group(g, n, total_counts):
    """Tally the length-n substrings found in one group's usernames.

    g            -- object whose `username` attribute iterates over strings
    n            -- substring length
    total_counts -- container of substrings to keep; substrings that are
                    not in it are ignored

    Returns a one-row DataFrame with one column per retained substring,
    holding that substring's occurrence count within the group.
    """
    tally = defaultdict(int)
    for username in g.username:
        tracked = (piece for piece in get_subs(username, n)
                   if piece in total_counts)
        for piece in tracked:
            tally[piece] += 1
    return pd.DataFrame([tally])

def groupby_fn(x):
    """Adapter for groupby.apply: count the tracked substrings in one group."""
    return counts_in_group(x, SUBSTRING_LENGTH, substring_counts)


# Each group comes back as a one-row frame, so the applied result carries a
# second index level below the location; that inner level is dropped so the
# rows are indexed by location alone.
by_state = (
    all_users
    .groupby(level=0)
    .apply(groupby_fn)
    .reset_index(level=1, drop=True)
)
In [8]:
# Spot-check three substrings for the first three locations.
by_state[['.mar', 'love', 'zzzz']].iloc[:3]
Out[8]:
.mar love zzzz
location
Arkansas 11 286 4
Boston 18 670 17
Boulder-Denver 63 1092 74
In [9]:
# Locations with this many users or fewer are too small to give a
# meaningful ratio and are filtered out.
MIN_USERS_PER_AREA = 2000

# Turn raw counts into per-location ratios by dividing each location's row
# by that location's user count (aligned on the row index).
# A plain `by_state / counts_per_area` is not used because frame-by-frame
# division aligns on column labels (substrings vs. 'users') and would
# yield only NaN.
normalized = by_state.div(counts_per_area.users, axis='index')

# Filter out the small data, then transpose so that rows are substrings
# and columns are locations.
drop_small = normalized[counts_per_area.users > MIN_USERS_PER_AREA].T
In [10]:
# Regional substrings are the ones whose per-location ratio varies the most:
# compute each substring's variance across locations and keep only those
# above the chosen quantile of all variances.
VARIANCE_QUANTILE = .933

variances = drop_small.var(axis=1)
large_variance = drop_small.loc[
    variances > variances.quantile(VARIANCE_QUANTILE)
]
In [12]:
from random import randint


class StreamSampler(object):
    """Keep a fixed-size random sample of a stream of items.

    Items are offered one at a time through `present`. The first
    `num_samples` items are always kept; after that each new item replaces
    a randomly chosen kept item with probability
    num_samples / (items offered so far, including the new one).
    """

    def __init__(self, num_samples=1):
        """Create a sampler that retains at most `num_samples` items."""
        self.num_samples = num_samples
        self.saved = []      # items currently held in the sample
        self.num_seen = 0    # how many items have been offered so far

    def present(self, item):
        """Offer one item from the stream to the sampler."""
        if len(self.saved) < self.num_samples:
            # Still filling up: keep everything.
            self.saved.append(item)
        else:
            # Use the `randint` name this file imports (`random.randint`
            # was never in scope as a module attribute). Both bounds are
            # inclusive, so there are num_seen + 1 equally likely outcomes
            # and the item is kept when the draw lands on a slot index.
            v = randint(0, self.num_seen)
            if v < self.num_samples:
                self.saved[v] = item
        self.num_seen += 1

    def samples(self):
        """Return the list of currently retained items (not a copy)."""
        return self.saved
In [13]:
# For each substring, get 2 example usernames per area code.
EXAMPLES_PER_AREA = 2

desired_words = large_variance.index
# example_users[substring][area_code] is a StreamSampler holding up to
# EXAMPLES_PER_AREA usernames that contain that substring.
example_users = defaultdict(
    lambda: defaultdict(lambda: StreamSampler(EXAMPLES_PER_AREA)))

# Pair each index label with its username directly; iterrows() would build
# a Series for every row, which is needlessly slow on a large table.
for area_code, name in zip(all_users.index, all_users.username):
    # Use the same substring length the counts were built with.
    for s in get_subs(name, SUBSTRING_LENGTH):
        if s in desired_words:
            example_users[s][area_code].present(name)
In [14]:
def print_report(word):
    """Print the per-location ratio for `word`, highest first.

    Looks up the row labelled `word` in the module-level `large_variance`
    frame and prints it sorted in descending order. Raises KeyError when
    `word` is not one of the frame's row labels.
    """
    print('\nSUBSTRING {}'.format(word))
    # `.loc` replaces the removed `.ix` indexer. `sort_values` returns a new
    # sorted Series, so the frame is never modified and no defensive copy is
    # needed; it replaces the removed in-place `Series.sort`.
    c = large_variance.loc[word].sort_values(ascending=False)
    print(c)
In [15]:
# Print the per-location report for a handful of hand-picked substrings.
for s in ('love', 'girl', 'baby', 'lynn', 'ngel', '1234', '5280'):
    print_report(s)
SUBSTRING love
location
Southeastern California                  0.022592
Downtown Los Angeles                     0.022140
Eastern Los Angeles                      0.019226
Eastern San Francisco                    0.018228
Southern California                      0.017534
Boston                                   0.016007
Oakland                                  0.015008
San Fernando Valley                      0.014406
Denver-Boulder                           0.014058
Central Florida                          0.012584
Los Angeles                              0.012254
Southeastern Virginia                    0.011904
Florida                                  0.011734
New York City                            0.011320
Southeastern Colorado                    0.011114
Southeastern Michigan incl. Ann Arbor    0.010739
Central Arizona                          0.010553
Miami                                    0.010417
Fort Lauderdale                          0.010315
South Carolina                           0.010262
Pennsylvania                             0.010250
Chicago                                  0.010140
Arkansas                                 0.009883
Eastern part of Southern New Jersey      0.009638
Manhattan                                0.009591
Western and Northern Colorado            0.009569
Idaho                                    0.009468
Buffalo                                  0.009452
Northwestern Arkansas                    0.009041
San Francisco                            0.008973
Bronx, Queens, Brooklyn                  0.008965
Southern New York State                  0.008729
Eastern Ohio                             0.008557
Northern New York                        0.008301
Mountain View                            0.008281
Southwestern Wisconsin                   0.008119
Indianapolis                             0.008097
Chicago Suburbs                          0.008048
Southeastern Ohio                        0.007933
Boulder-Denver                           0.007841
Southwest Connecticut                    0.007752
Northeastern New York State              0.007708
Southern Illinois                        0.007680
Northern Louisiana                       0.007620
Seattle                                  0.007531
Champaign-Urbana                         0.006832
Westchester County, NY                   0.006559
Northern Chicago Suburbs                 0.006492
Southern Michigan                        0.006390
Maine                                    0.005827
Minnesota                                0.004887
Manitoba                                 0.002774
Name: love, Length: 52, dtype: float64

SUBSTRING girl
location
South Carolina                           0.008779
Northwestern Arkansas                    0.008767
Arkansas                                 0.008086
Southeastern Virginia                    0.007936
Eastern San Francisco                    0.007661
Central Florida                          0.007366
Florida                                  0.007192
Central Arizona                          0.007157
Southeastern Colorado                    0.007073
Western and Northern Colorado            0.007003
Idaho                                    0.006747
Maine                                    0.006617
Eastern Ohio                             0.006540
Southern Illinois                        0.006474
Southern California                      0.006425
Southeastern Ohio                        0.006381
Southwestern Wisconsin                   0.006315
Denver-Boulder                           0.006145
Champaign-Urbana                         0.006081
Chicago Suburbs                          0.005955
Pennsylvania                             0.005857
Northeastern New York State              0.005737
Northern Louisiana                       0.005487
Eastern Los Angeles                      0.005342
Northern New York                        0.005317
Eastern part of Southern New Jersey      0.005178
Seattle                                  0.005177
Southeastern California                  0.005153
Southeastern Michigan incl. Ann Arbor    0.005087
Boulder-Denver                           0.005055
Fort Lauderdale                          0.004992
Southern New York State                  0.004946
Minnesota                                0.004887
Buffalo                                  0.004754
Chicago                                  0.004384
Oakland                                  0.004374
Northern Chicago Suburbs                 0.004206
Boston                                   0.004205
Westchester County, NY                   0.004176
Indianapolis                             0.004171
Los Angeles                              0.004169
Southwest Connecticut                    0.004156
New York City                            0.004081
Downtown Los Angeles                     0.003945
San Fernando Valley                      0.003916
Bronx, Queens, Brooklyn                  0.003856
Mountain View                            0.003823
Manhattan                                0.003548
San Francisco                            0.003490
Miami                                    0.003302
Southern Michigan                        0.002925
Manitoba                                 0.001664
Name: girl, Length: 52, dtype: float64

SUBSTRING baby
location
Eastern San Francisco                    0.008718
Florida                                  0.007949
Oakland                                  0.007761
Central Florida                          0.007060
Southern California                      0.006170
Boston                                   0.006068
Southeastern Colorado                    0.005586
Southeastern California                  0.005549
Indianapolis                             0.005521
Eastern Los Angeles                      0.005355
Eastern Ohio                             0.005195
Denver-Boulder                           0.005194
Buffalo                                  0.005140
Northern New York                        0.005032
Fort Lauderdale                          0.005004
Southeastern Michigan incl. Ann Arbor    0.004946
South Carolina                           0.004934
Downtown Los Angeles                     0.004912
Chicago                                  0.004879
Northern Louisiana                       0.004877
Central Arizona                          0.004631
Pennsylvania                             0.004588
Southeastern Virginia                    0.004535
Northeastern New York State              0.004368
Idaho                                    0.004324
San Fernando Valley                      0.004320
Eastern part of Southern New Jersey      0.004315
Chicago Suburbs                          0.004256
Champaign-Urbana                         0.004152
Arkansas                                 0.004077
Southern New York State                  0.004073
Southern Illinois                        0.004048
Western and Northern Colorado            0.004048
Southeastern Ohio                        0.003880
Northwestern Arkansas                    0.003836
Miami                                    0.003792
New York City                            0.003758
San Francisco                            0.003720
Mountain View                            0.003622
Manhattan                                0.003558
Bronx, Queens, Brooklyn                  0.003445
Los Angeles                              0.003426
Boulder-Denver                           0.003325
Southwest Connecticut                    0.003282
Southwestern Wisconsin                   0.003157
Northern Chicago Suburbs                 0.002914
Southern Michigan                        0.002889
Minnesota                                0.002793
Westchester County, NY                   0.002727
Maine                                    0.002666
Seattle                                  0.002636
Manitoba                                 0.001664
Name: baby, Length: 52, dtype: float64

SUBSTRING lynn
location
Southern New York State                  0.010474
Northern Louisiana                       0.009144
Florida                                  0.009084
Maine                                    0.008888
Southeastern Ohio                        0.008364
Southeastern Michigan incl. Ann Arbor    0.007348
Northern New York                        0.007304
Southern Illinois                        0.007083
Arkansas                                 0.007049
Northwestern Arkansas                    0.006712
Buffalo                                  0.006327
Champaign-Urbana                         0.006184
Northeastern New York State              0.006165
Eastern Ohio                             0.005990
Idaho                                    0.005815
Minnesota                                0.005585
Southeastern Colorado                    0.005363
Eastern part of Southern New Jersey      0.005178
Chicago Suburbs                          0.005103
Southwestern Wisconsin                   0.004962
Western and Northern Colorado            0.004888
Manitoba                                 0.004576
Indianapolis                             0.004417
Pennsylvania                             0.004393
Central Arizona                          0.004322
Southern California                      0.004145
Southeastern Virginia                    0.003921
Eastern San Francisco                    0.003875
Southwest Connecticut                    0.003761
Central Florida                          0.003683
Denver-Boulder                           0.003643
South Carolina                           0.003633
Southeastern California                  0.003567
Boulder-Denver                           0.003547
Eastern Los Angeles                      0.002988
Fort Lauderdale                          0.002847
Northern Chicago Suburbs                 0.002777
Seattle                                  0.002730
Westchester County, NY                   0.002401
Bronx, Queens, Brooklyn                  0.002192
Southern Michigan                        0.001945
San Fernando Valley                      0.001868
New York City                            0.001758
Los Angeles                              0.001753
Chicago                                  0.001669
Oakland                                  0.001647
Boston                                   0.001625
Mountain View                            0.001525
Miami                                    0.001453
Manhattan                                0.001431
Downtown Los Angeles                     0.001376
San Francisco                            0.001212
Name: lynn, Length: 52, dtype: float64

SUBSTRING ngel
location
Eastern San Francisco                    0.005988
Southeastern California                  0.005945
Downtown Los Angeles                     0.004805
Central Florida                          0.004604
Eastern Los Angeles                      0.004355
Denver-Boulder                           0.004281
Southern California                      0.004215
Oakland                                  0.004007
Bronx, Queens, Brooklyn                  0.003739
New York City                            0.003702
Southwestern Wisconsin                   0.003608
Southeastern Colorado                    0.003517
San Fernando Valley                      0.003483
Florida                                  0.003407
Fort Lauderdale                          0.003342
Miami                                    0.003342
Central Arizona                          0.003340
Boston                                   0.003321
Eastern part of Southern New Jersey      0.003308
Los Angeles                              0.003206
Southern New York State                  0.003200
Manhattan                                0.003200
Mountain View                            0.003029
Southeastern Virginia                    0.003023
South Carolina                           0.002906
San Francisco                            0.002875
Minnesota                                0.002793
Chicago                                  0.002772
Chicago Suburbs                          0.002760
Western and Northern Colorado            0.002756
Pennsylvania                             0.002733
Buffalo                                  0.002732
Eastern Ohio                             0.002659
Idaho                                    0.002647
Manitoba                                 0.002635
Southeastern Michigan incl. Ann Arbor    0.002543
Arkansas                                 0.002522
Westchester County, NY                   0.002469
Northeastern New York State              0.002456
Northern New York                        0.002292
Boulder-Denver                           0.002269
Southwest Connecticut                    0.002260
Northern Chicago Suburbs                 0.002251
Southern Michigan                        0.002139
Southern Illinois                        0.002114
Indianapolis                             0.002086
Southeastern Ohio                        0.002070
Champaign-Urbana                         0.001958
Northwestern Arkansas                    0.001781
Northern Louisiana                       0.001422
Seattle                                  0.001412
Maine                                    0.001284
Name: ngel, Length: 52, dtype: float64

SUBSTRING 1234
location
South Carolina                           0.003421
Northwestern Arkansas                    0.003288
Southeastern Ohio                        0.003190
Southeastern Michigan incl. Ann Arbor    0.002967
Southern New York State                  0.002910
Western and Northern Colorado            0.002877
Eastern Ohio                             0.002567
Northern Louisiana                       0.002540
Arkansas                                 0.002522
Denver-Boulder                           0.002480
Northern New York                        0.002414
Minnesota                                0.002374
Maine                                    0.002370
Northeastern New York State              0.002325
Northern Chicago Suburbs                 0.002302
Boston                                   0.002246
Buffalo                                  0.002208
Southwest Connecticut                    0.002177
Seattle                                  0.002165
Eastern part of Southern New Jersey      0.002158
Westchester County, NY                   0.002143
Chicago Suburbs                          0.002139
Boulder-Denver                           0.002133
Southern Illinois                        0.002114
Southern Michigan                        0.002046
Southeastern Colorado                    0.002001
Idaho                                    0.001976
Central Arizona                          0.001965
Chicago                                  0.001938
Champaign-Urbana                         0.001929
Mountain View                            0.001874
Indianapolis                             0.001840
New York City                            0.001785
Pennsylvania                             0.001757
Fort Lauderdale                          0.001686
Eastern San Francisco                    0.001673
Miami                                    0.001637
Southeastern California                  0.001585
Central Florida                          0.001535
Southern California                      0.001515
Manhattan                                0.001511
San Fernando Valley                      0.001508
Bronx, Queens, Brooklyn                  0.001507
Southeastern Virginia                    0.001464
Los Angeles                              0.001425
Eastern Los Angeles                      0.001413
Manitoba                                 0.001387
San Francisco                            0.001286
Downtown Los Angeles                     0.001240
Oakland                                  0.001126
Florida                                  0.000757
Southwestern Wisconsin                   0.000451
Name: 1234, Length: 52, dtype: float64

SUBSTRING 5280
location
Denver-Boulder                           0.003272
Boulder-Denver                           0.002599
Western and Northern Colorado            0.000173
Southeastern Colorado                    0.000087
Southern Illinois                        0.000035
Arkansas                                 0.000035
Southwest Connecticut                    0.000033
Eastern Ohio                             0.000031
Southern Michigan                        0.000029
Champaign-Urbana                         0.000022
Westchester County, NY                   0.000017
Northern Chicago Suburbs                 0.000015
Los Angeles                              0.000014
Chicago                                  0.000014
Downtown Los Angeles                     0.000012
San Francisco                            0.000009
New York City                            0.000009
Oakland                                  0.000008
Northeastern New York State              0.000007
Buffalo                                  0.000007
Northern New York                        0.000007
Fort Lauderdale                          0.000006
Southern California                      0.000005
San Fernando Valley                      0.000005
Boston                                        NaN
Bronx, Queens, Brooklyn                       NaN
Central Arizona                               NaN
Central Florida                               NaN
Chicago Suburbs                               NaN
Eastern Los Angeles                           NaN
Eastern San Francisco                         NaN
Eastern part of Southern New Jersey           NaN
Florida                                       NaN
Idaho                                         NaN
Indianapolis                                  NaN
Maine                                         NaN
Manhattan                                     NaN
Manitoba                                      NaN
Miami                                         NaN
Minnesota                                     NaN
Mountain View                                 NaN
Northern Louisiana                            NaN
Northwestern Arkansas                         NaN
Pennsylvania                                  NaN
Seattle                                       NaN
South Carolina                                NaN
Southeastern California                       NaN
Southeastern Michigan incl. Ann Arbor         NaN
Southeastern Ohio                             NaN
Southeastern Virginia                         NaN
Southern New York State                       NaN
Southwestern Wisconsin                        NaN
Name: 5280, Length: 52, dtype: float64
In [16]:
# This is just for generating json for use in algorithmshop.com's visualization:
# http://algorithmshop.com/20140102-snapchat-leak.html

import json
import math
from random import shuffle

# Prefix prepended to each blob's relative path in the index file.
PATH_PREFIX = '/post-files/20140202-snapchat'

# Some canadian things sneak in...
BLACKLIST = {'Manitoba'}

# Write one JSON file per high-variance substring, plus an index of all of
# them. The 'blobs/' directory must already exist.
output_blobs = []
for (sub, r) in large_variance.iterrows():
    # Series.items() is used because Series.iteritems() was removed in
    # pandas 2.0. Locations with no value (NaN) or in BLACKLIST are skipped.
    location_data = [{'location': location,
                      'frequency': frequency,
                      'example_users': example_users[sub][location].samples()}
                     for (location, frequency) in r.items()
                     if (not math.isnan(frequency) and location not in BLACKLIST)]
    blob = {'substring': sub, 'location_data': location_data}

    path = 'blobs/blob-{}.json'.format(sub)
    with open(path, 'wt') as f:
        json.dump(blob, f)
    # NOTE(review): on Python 3 the hash() of a str changes between
    # interpreter runs unless PYTHONHASHSEED is fixed, so these fragment
    # values are not stable across runs -- confirm whether that matters
    # to the consumer of this file.
    output_blobs.append({'fragment': str(abs(hash(sub))),
                         'path': '{}/{}'.format(PATH_PREFIX, path)})

# Randomise the order of the index entries.
shuffle(output_blobs)
with open('blobs/all_blobs.json', 'wt') as f:
    json.dump(output_blobs, f)

# NOTE(review): locations[:-1] leaves out the last row of the locations
# table; the reason is not visible here -- confirm it is intentional.
all_locations = [{'location': r['name'],
                  'lat': r['latitude'],
                  'lon': r['longitude']}
                 for (_, r) in locations[:-1].iterrows()]
with open('blobs/locations.json', 'wt') as f:
    json.dump(all_locations, f)