import pandas as pd

locations = pd.read_csv('locations.tsv', delimiter='\t')
all_users = pd.read_csv('schat.csv',
                        names=['numbers', 'username', 'location'],
                        index_col='location', header=None)
# We only need the usernames and locations, not the phone numbers
del all_users['numbers']
all_users[0:10]

from collections import defaultdict

def get_subs(s, n):
    """Get all substrings of s of length n"""
    for i in range(len(s) - n + 1):
        yield s[i:i+n]

def get_len_n_dict(n, lower_bound):
    """Count every length-n substring across all usernames and keep the common ones."""
    sub_counts = defaultdict(int)
    for name in all_users.username:
        for s in get_subs(name, n):
            sub_counts[s] += 1
    return {k: v for (k, v) in sub_counts.items() if v >= lower_bound}

# Find substrings of a certain length that occur a minimum
# number of times:
SUBSTRING_LENGTH = 4
MIN_OCCURRENCE = 60
substring_counts = get_len_n_dict(SUBSTRING_LENGTH, MIN_OCCURRENCE)
'5280' in substring_counts

# Example of the most common substrings
from heapq import nlargest
nlargest(10, substring_counts.items(), key=lambda x: x[1])

# Number of users per area code / state
counts_per_area = all_users.groupby(level=0).agg(len)
counts_per_area.rename(columns={'username': 'users'}, inplace=True)
counts_per_area[0:10]

# Now get counts of each substring by state
def counts_in_group(g, n, total_counts):
    output_counts = defaultdict(int)
    for name in g.username:
        for s in get_subs(name, n):
            if s in total_counts:
                output_counts[s] += 1
    return pd.DataFrame([output_counts])

groupby_fn = lambda x: counts_in_group(x, SUBSTRING_LENGTH, substring_counts)
by_state = all_users.groupby(level=0).apply(groupby_fn)
# I always end up doing this with pandas "apply". Halp me. I don't want to do it like this...
by_state = by_state.reset_index(level=1, drop=True)
by_state[0:3][['.mar', 'love', 'zzzz']]

# Filter out the small data
by_state_normalized = by_state / counts_per_area  # naive elementwise attempt; the per-row division below is the one actually used
normalized = by_state.div(counts_per_area.users, axis='index')
drop_small = normalized[counts_per_area.users > 2000].T

# We get regional strings by looking for high variance
# in the ratio of users with a given substring for a state.
VARIANCE_QUANTILE = 0.933
variances = drop_small.var(axis=1)
large_variance = drop_small[variances > variances.quantile(q=VARIANCE_QUANTILE)]

from random import randint

class StreamSampler(object):
    """Reservoir sampler: keeps a fixed-size uniform random sample of a stream."""

    def __init__(self, num_samples=1):
        self.num_samples = num_samples
        self.saved = []
        self.num_seen = 0

    def present(self, item):
        if len(self.saved) < self.num_samples:
            self.saved.append(item)
            self.num_seen += 1
            return
        # Replace a saved item with probability num_samples / (num_seen + 1),
        # so every item presented is equally likely to end up in the sample.
        v = randint(0, self.num_seen)
        if v < self.num_samples:
            self.saved[v] = item
        self.num_seen += 1

    def samples(self):
        return self.saved

# For each substring, get 2 example usernames per area code.
desired_words = large_variance.index
example_users = defaultdict(lambda: defaultdict(lambda: StreamSampler(2)))
for (area_code, r) in all_users.iterrows():
    name = r['username']
    for s in get_subs(name, SUBSTRING_LENGTH):
        if s in desired_words:
            example_users[s][area_code].present(name)

def print_report(word):
    print('\nSUBSTRING {}'.format(word))
    c = large_variance.loc[word].sort_values(ascending=False)
    print(c)

for s in ('love', 'girl', 'baby', 'lynn', 'ngel', '1234', '5280'):
    print_report(s)
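# Quick sanity check (not part of the original analysis): StreamSampler above is
# a reservoir sampler, so after presenting a stream every item should end up in
# the saved sample with roughly equal probability. The letters and trial count
# below are arbitrary illustration values.
from collections import Counter

tally = Counter()
for _ in range(5000):
    sampler = StreamSampler(2)
    for letter in 'abcdefghij':
        sampler.present(letter)
    tally.update(sampler.samples())
tally.most_common(3)  # each letter should show up close to 1000 times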
# This is just for generating JSON for use in algorithmshop.com's visualization:
# http://algorithmshop.com/20140102-snapchat-leak.html
import json
import math
from random import shuffle

PATH_PREFIX = '/post-files/20140202-snapchat'

# Some Canadian things sneak in...
BLACKLIST = {'Manitoba'}

output_blobs = []
for (sub, r) in large_variance.iterrows():
    blob = {}
    blob['substring'] = sub
    blob['location_data'] = [
        {'location': location,
         'frequency': frequency,
         'example_users': example_users[sub][location].samples()}
        for (location, frequency) in r.items()
        if not math.isnan(frequency) and location not in BLACKLIST
    ]
    path = 'blobs/blob-{}.json'.format(sub)
    with open(path, 'wt') as f:
        json.dump(blob, f)
    output_blobs.append({'fragment': str(abs(hash(sub))),
                         'path': '{}/{}'.format(PATH_PREFIX, path)})

shuffle(output_blobs)
with open('blobs/all_blobs.json', 'wt') as f:
    json.dump(output_blobs, f)

with open('blobs/locations.json', 'wt') as f:
    all_locations = []
    for (_, r) in locations[:-1].iterrows():
        single_location = {'location': r['name'],
                           'lat': r['latitude'],
                           'lon': r['longitude']}
        all_locations.append(single_location)
    json.dump(all_locations, f)
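# Optional spot check, not in the original pipeline: read one blob back and
# confirm its shape. This assumes 'love' survived the variance filter above;
# substitute any substring from large_variance.index if it did not.
with open('blobs/blob-love.json') as f:
    sample_blob = json.load(f)
sample_blob['substring'], len(sample_blob['location_data'])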