Presentation¶

Naive¶

Reading data¶

In [5]:

from urllib.request import urlopen
from json import loads 

BASE = 'https://api.github.com/search'
_url1 = '{}/repositories?q={}'
q = 'data&per_page=100'
url1 = _url1.format(BASE, q)
f = urlopen(url1)
data = loads(f.read().decode('utf-8'))
repos = data['items']
repos[0]['description']

Out[5]:

'Data and code behind the stories and interactives at FiveThirtyEight'

In [6]:

repos[0]['full_name']

Out[6]:

'fivethirtyeight/data'

Processing data¶

In [7]:

def rate(repos):
    rated = []
    
    for repo in repos:
        rated.append(repo['watchers'] * 2)

    return rated

In [8]:

rate(repos)[:5]

Out[8]:

[11142, 5556, 396, 438, 128]

In [9]:

# Infinite data
from itertools import count

inf_repos = ({'watchers': c} for c in count())

# Don't actually run the below code since it will hang forever
# rate(inf_repos)

In [10]:

# Expensive data
from time import sleep

def exp_rate(repos):
    rated = []

    for repo in repos:
        sleep(1)
        rated.append(repo['watchers'] * 2)

    return rated

In [11]:

exp_rate(repos)[:5]

Out[11]:

[11142, 5556, 396, 438, 128]

Lazy evaluation¶

In [12]:

eager_list = list(range(5))
eager_list

Out[12]:

[0, 1, 2, 3, 4]

In [13]:

lazy_list = iter(eager_list)
lazy_list

Out[13]:

<list_iterator at 0x10f97bcc0>

In [14]:

next(lazy_list)

Out[14]:

In [15]:

list(lazy_list)

Out[15]:

[1, 2, 3, 4]

In [16]:

next(lazy_list)

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
<ipython-input-16-898b6387b693> in <module>()
----> 1 next(lazy_list)

StopIteration:

Reading data¶

In [17]:

from ijson import items

f = urlopen(url1)
repos = items(f, 'items.item')
repos

Out[17]:

<generator object items at 0x10f9677d8>

In [18]:

repo = next(repos)
repo['full_name']

Out[18]:

'fivethirtyeight/data'

Processing data¶

In [19]:

def gen_rates(repos):
    for repo in repos:
        yield repo['watchers'] * 2

In [20]:

gen_rates(repos)

Out[20]:

<generator object gen_rates at 0x10f98d048>

In [21]:

rates = gen_rates(repos)
next(rates)

Out[21]:

In [22]:

next(rates)

Out[22]:

In [23]:

# Infinite data
rates = gen_rates(inf_repos)
next(rates)

Out[23]:

In [25]:

# Expensive data
def gen_exp_rates(repos):
    for repo in repos:
        sleep(1)
        yield repo['watchers'] * 2

In [26]:

from itertools import islice

rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

Out[26]:

[438, 128, 684, 348, 1356]

In [27]:

next(rates)

Out[27]:

Grouping data¶

In [39]:

f = urlopen(url1)
repos = items(f, 'items.item')
repo = next(repos)
repo.keys()

Out[39]:

dict_keys(['id', 'name', 'full_name', 'owner', 'private', 'html_url', 'description', 'fork', 'url', 'forks_url', 'keys_url', 'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url', 'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url', 'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url', 'languages_url', 'stargazers_url', 'contributors_url', 'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url', 'comments_url', 'issue_comment_url', 'contents_url', 'compare_url', 'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url', 'milestones_url', 'notifications_url', 'labels_url', 'releases_url', 'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url', 'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size', 'stargazers_count', 'watchers_count', 'language', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 'forks_count', 'mirror_url', 'open_issues_count', 'forks', 'open_issues', 'watchers', 'default_branch', 'score'])

In [40]:

repo['has_issues']

Out[40]:

True

In [41]:

import itertools as it
from operator import itemgetter

keyfunc = itemgetter('has_issues')
sorted_repos = sorted(repos, key=keyfunc)
grouped = it.groupby(sorted_repos, keyfunc)
data = ((key, len(list(group))) for key, group in grouped)
next(data)

Out[41]:

(False, 3)

In [42]:

next(data)

Out[42]:

(True, 96)

Memoization¶

Processing data¶

In [43]:

def calc_rate(watchers):
    sleep(1)
    return watchers * 2

def gen_exp_rates(repos):
    for repo in repos:
        yield calc_rate(repo['watchers'])

In [45]:

repos = it.repeat({'watchers': 5})
rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

Out[45]:

[10, 10, 10, 10, 10]

In [48]:

from functools import lru_cache

def _calc_rate(watchers):
    sleep(1)
    return watchers * 2

cacher = lru_cache()
calc_rate = cacher(_calc_rate)

def gen_exp_rates(repos):
    for repo in repos:
        yield calc_rate(repo['watchers'])

In [51]:

repos = it.repeat({'watchers': 5})
rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

Out[51]:

[10, 10, 10, 10, 10]

In [52]:

@lru_cache()
def calc_rate(watchers):
    sleep(1)
    return watchers * 2

def gen_exp_rates(repos):
    for repo in repos:
        yield calc_rate(repo['watchers'])

In [53]:

repos = it.repeat({'watchers': 5})
rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

Out[53]:

[10, 10, 10, 10, 10]

Introducing meza¶

Reading data¶

In [61]:

from urllib.request import urlopen
from meza.io import read_json

url2 = '{}/repositories?q=data'.format(BASE) 
f = urlopen(url2)
records = read_json(f, path='items.item')
repo = next(records)
repo['full_name']

Out[61]:

'fivethirtyeight/data'

In [62]:

len(list(records))

Out[62]:

In [56]:

from io import StringIO
from meza.io import read_csv

f = StringIO('greeting,location\nhello,world\n')
next(read_csv(f))

Out[56]:

{'greeting': 'hello', 'location': 'world'}

In [57]:

from os import path as p
from meza.io import join

url3 = '{}&page=2'.format(url2)
files = map(urlopen, [url2, url3])
records = join(*files, ext='json', path='items.item')
repo = next(records) 
repo['full_name']

Out[57]:

'fivethirtyeight/data'

In [58]:

repo['language']

Out[58]:

'Jupyter Notebook'

In [59]:

len(list(records))

Out[59]:

Transforming data¶

In [63]:

from meza.process import merge

records = [{'a': 200}, {'b': 300}, {'c': 400}]
merge(records)

Out[63]:

{'a': 200, 'b': 300, 'c': 400}

In [64]:

from meza.process import group

records = [
    {'item': 'a', 'amount': 200},
    {'item': 'a', 'amount': 200},
    {'item': 'b', 'amount': 400}]

grouped = group(records, 'item')
key, _group = next(grouped)
key

Out[64]:

'a'

In [65]:

_group

Out[65]:

[{'amount': 200, 'item': 'a'}, {'amount': 200, 'item': 'a'}]

In [66]:

from meza import process as pr

f = urlopen(url2)
raw = read_json(f, path='items.item')
fields = ['full_name', 'language', 'watchers', 'score', 'has_wiki']
cut = pr.cut(raw, fields)
cut

Out[66]:

<generator object cut.<locals>.<genexpr> at 0x11020ae08>

In [67]:

cut, preview = pr.peek(cut)
cut

Out[67]:

<itertools.chain at 0x11025f4a8>

In [68]:

len(preview)

Out[68]:

In [69]:

preview[0]

Out[69]:

{'full_name': 'fivethirtyeight/data',
 'has_wiki': True,
 'language': 'Jupyter Notebook',
 'score': Decimal('120.396454'),
 'watchers': 5572}

In [70]:

filled = pr.fillempty(raw, value='', fields=['language'])
pivoted = pr.pivot(filled, 'score', 'language', rows=['has_wiki'], op=min)
next(pivoted)

Out[70]:

{'HTML': Decimal('73.19426'),
 'JavaScript': Decimal('54.46375'),
 'Python': Decimal('50.188396'),
 'has_wiki': False}

In [71]:

next(pivoted)

Out[71]:

{'': Decimal('44.635494'),
 'C#': Decimal('47.918125'),
 'HTML': Decimal('68.96914'),
 'JavaScript': Decimal('44.16988'),
 'PHP': Decimal('44.0172'),
 'Python': Decimal('44.73296'),
 'R': Decimal('45.959583'),
 'has_wiki': True}