from urllib.request import urlopen
from itertools import groupby
from operator import itemgetter
from ijson import items
# Total the watcher count of the GitHub search results, per language.
url2 = 'https://api.github.com/search/repositories?q=data'
keyfunc = itemgetter('language')

# The parsed items are consumed inside the `with` block (by `sorted`) so that
# the HTTP response can be closed as soon as the data has been read.
with urlopen(url2) as f:
    repos = items(f, 'items.item')
    # drop repos whose `language` is falsy (e.g. None)
    cleaned = filter(keyfunc, repos)
    # `groupby` only merges adjacent records, so sort by the same key first
    records = sorted(cleaned, key=keyfunc)

grouped = groupby(records, keyfunc)

for key, group in grouped:
    cnt = sum(g['watchers'] for g in group)
    print(key, cnt)
C# 35 C++ 64 HTML 352 JavaScript 4702 Jupyter Notebook 5573 PHP 129 Python 16235 R 18
owner_type
per has_pages
from urllib.request import urlopen
from operator import itemgetter
from functools import partial
from meza import process as pr, fntools as ft
from meza.io import read_json
# Search GitHub for "data" repositories, ordered by stars (descending).
url4 = 'https://api.github.com/search/repositories?q=data&sort=stars&order=desc'
# NOTE(review): `f` is never closed in this section. If `read_json` reads
# lazily, closing has to wait until the records have been consumed -- confirm
# before wrapping this in a `with` block.
f = urlopen(url4)
# read the entries found under the `items.item` path of the JSON response
records = read_json(f, path='items.item')
# repos without a language have a value of None, which meza doesn't like,
# so empty `language` values are replaced with ''
filled = pr.fillempty(records, value='', fields=['language'])
# `peek` hands back the records together with an indexable preview of them
filled, preview = pr.peek(filled)
# notebook-style display of the first previewed record
preview[0]
{'archive_url': 'https://api.github.com/repos/d3/d3/{archive_format}{/ref}', 'assignees_url': 'https://api.github.com/repos/d3/d3/assignees{/user}', 'blobs_url': 'https://api.github.com/repos/d3/d3/git/blobs{/sha}', 'branches_url': 'https://api.github.com/repos/d3/d3/branches{/branch}', 'clone_url': 'https://github.com/d3/d3.git', 'collaborators_url': 'https://api.github.com/repos/d3/d3/collaborators{/collaborator}', 'comments_url': 'https://api.github.com/repos/d3/d3/comments{/number}', 'commits_url': 'https://api.github.com/repos/d3/d3/commits{/sha}', 'compare_url': 'https://api.github.com/repos/d3/d3/compare/{base}...{head}', 'contents_url': 'https://api.github.com/repos/d3/d3/contents/{+path}', 'contributors_url': 'https://api.github.com/repos/d3/d3/contributors', 'created_at': '2010-09-27T17:22:42Z', 'default_branch': 'master', 'deployments_url': 'https://api.github.com/repos/d3/d3/deployments', 'description': 'Bring data to life with SVG, Canvas and HTML. :bar_chart::chart_with_upwards_trend::tada:', 'downloads_url': 'https://api.github.com/repos/d3/d3/downloads', 'events_url': 'https://api.github.com/repos/d3/d3/events', 'fork': False, 'forks': 16964, 'forks_count': 16964, 'forks_url': 'https://api.github.com/repos/d3/d3/forks', 'full_name': 'd3/d3', 'git_commits_url': 'https://api.github.com/repos/d3/d3/git/commits{/sha}', 'git_refs_url': 'https://api.github.com/repos/d3/d3/git/refs{/sha}', 'git_tags_url': 'https://api.github.com/repos/d3/d3/git/tags{/sha}', 'git_url': 'git://github.com/d3/d3.git', 'has_downloads': True, 'has_issues': True, 'has_pages': True, 'has_projects': False, 'has_wiki': True, 'homepage': 'https://d3js.org', 'hooks_url': 'https://api.github.com/repos/d3/d3/hooks', 'html_url': 'https://github.com/d3/d3', 'id': 943149, 'issue_comment_url': 'https://api.github.com/repos/d3/d3/issues/comments{/number}', 'issue_events_url': 'https://api.github.com/repos/d3/d3/issues/events{/number}', 'issues_url': 
'https://api.github.com/repos/d3/d3/issues{/number}', 'keys_url': 'https://api.github.com/repos/d3/d3/keys{/key_id}', 'labels_url': 'https://api.github.com/repos/d3/d3/labels{/name}', 'language': 'JavaScript', 'languages_url': 'https://api.github.com/repos/d3/d3/languages', 'merges_url': 'https://api.github.com/repos/d3/d3/merges', 'milestones_url': 'https://api.github.com/repos/d3/d3/milestones{/number}', 'mirror_url': None, 'name': 'd3', 'notifications_url': 'https://api.github.com/repos/d3/d3/notifications{?since,all,participating}', 'open_issues': 0, 'open_issues_count': 0, 'owner': {'avatar_url': 'https://avatars2.githubusercontent.com/u/1562726?v=3', 'events_url': 'https://api.github.com/users/d3/events{/privacy}', 'followers_url': 'https://api.github.com/users/d3/followers', 'following_url': 'https://api.github.com/users/d3/following{/other_user}', 'gists_url': 'https://api.github.com/users/d3/gists{/gist_id}', 'gravatar_id': '', 'html_url': 'https://github.com/d3', 'id': 1562726, 'login': 'd3', 'organizations_url': 'https://api.github.com/users/d3/orgs', 'received_events_url': 'https://api.github.com/users/d3/received_events', 'repos_url': 'https://api.github.com/users/d3/repos', 'site_admin': False, 'starred_url': 'https://api.github.com/users/d3/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/d3/subscriptions', 'type': 'Organization', 'url': 'https://api.github.com/users/d3'}, 'private': False, 'pulls_url': 'https://api.github.com/repos/d3/d3/pulls{/number}', 'pushed_at': '2017-05-16T14:50:33Z', 'releases_url': 'https://api.github.com/repos/d3/d3/releases{/id}', 'score': Decimal('7.8259406'), 'size': 41570, 'ssh_url': 'git@github.com:d3/d3.git', 'stargazers_count': 64706, 'stargazers_url': 'https://api.github.com/repos/d3/d3/stargazers', 'statuses_url': 'https://api.github.com/repos/d3/d3/statuses/{sha}', 'subscribers_url': 'https://api.github.com/repos/d3/d3/subscribers', 'subscription_url': 
'https://api.github.com/repos/d3/d3/subscription', 'svn_url': 'https://github.com/d3/d3', 'tags_url': 'https://api.github.com/repos/d3/d3/tags', 'teams_url': 'https://api.github.com/repos/d3/d3/teams', 'trees_url': 'https://api.github.com/repos/d3/d3/git/trees{/sha}', 'updated_at': '2017-05-25T21:34:57Z', 'url': 'https://api.github.com/repos/d3/d3', 'watchers': 64706, 'watchers_count': 64706}
# meza doesn't do well with nested dicts, so flatten every record (lazily,
# via a generator expression). Judging by the previews, nested keys are
# joined with `_`, e.g. the `type` key inside `owner` becomes `owner_type`.
flat = (dict(ft.flatten(r)) for r in filled)
flat, preview = pr.peek(flat)
# notebook-style display of the first flattened record
preview[0]
{'archive_url': 'https://api.github.com/repos/d3/d3/{archive_format}{/ref}', 'assignees_url': 'https://api.github.com/repos/d3/d3/assignees{/user}', 'blobs_url': 'https://api.github.com/repos/d3/d3/git/blobs{/sha}', 'branches_url': 'https://api.github.com/repos/d3/d3/branches{/branch}', 'clone_url': 'https://github.com/d3/d3.git', 'collaborators_url': 'https://api.github.com/repos/d3/d3/collaborators{/collaborator}', 'comments_url': 'https://api.github.com/repos/d3/d3/comments{/number}', 'commits_url': 'https://api.github.com/repos/d3/d3/commits{/sha}', 'compare_url': 'https://api.github.com/repos/d3/d3/compare/{base}...{head}', 'contents_url': 'https://api.github.com/repos/d3/d3/contents/{+path}', 'contributors_url': 'https://api.github.com/repos/d3/d3/contributors', 'created_at': '2010-09-27T17:22:42Z', 'default_branch': 'master', 'deployments_url': 'https://api.github.com/repos/d3/d3/deployments', 'description': 'Bring data to life with SVG, Canvas and HTML. :bar_chart::chart_with_upwards_trend::tada:', 'downloads_url': 'https://api.github.com/repos/d3/d3/downloads', 'events_url': 'https://api.github.com/repos/d3/d3/events', 'fork': False, 'forks': 16964, 'forks_count': 16964, 'forks_url': 'https://api.github.com/repos/d3/d3/forks', 'full_name': 'd3/d3', 'git_commits_url': 'https://api.github.com/repos/d3/d3/git/commits{/sha}', 'git_refs_url': 'https://api.github.com/repos/d3/d3/git/refs{/sha}', 'git_tags_url': 'https://api.github.com/repos/d3/d3/git/tags{/sha}', 'git_url': 'git://github.com/d3/d3.git', 'has_downloads': True, 'has_issues': True, 'has_pages': True, 'has_projects': False, 'has_wiki': True, 'homepage': 'https://d3js.org', 'hooks_url': 'https://api.github.com/repos/d3/d3/hooks', 'html_url': 'https://github.com/d3/d3', 'id': 943149, 'issue_comment_url': 'https://api.github.com/repos/d3/d3/issues/comments{/number}', 'issue_events_url': 'https://api.github.com/repos/d3/d3/issues/events{/number}', 'issues_url': 
'https://api.github.com/repos/d3/d3/issues{/number}', 'keys_url': 'https://api.github.com/repos/d3/d3/keys{/key_id}', 'labels_url': 'https://api.github.com/repos/d3/d3/labels{/name}', 'language': 'JavaScript', 'languages_url': 'https://api.github.com/repos/d3/d3/languages', 'merges_url': 'https://api.github.com/repos/d3/d3/merges', 'milestones_url': 'https://api.github.com/repos/d3/d3/milestones{/number}', 'mirror_url': None, 'name': 'd3', 'notifications_url': 'https://api.github.com/repos/d3/d3/notifications{?since,all,participating}', 'open_issues': 0, 'open_issues_count': 0, 'owner_avatar_url': 'https://avatars2.githubusercontent.com/u/1562726?v=3', 'owner_events_url': 'https://api.github.com/users/d3/events{/privacy}', 'owner_followers_url': 'https://api.github.com/users/d3/followers', 'owner_following_url': 'https://api.github.com/users/d3/following{/other_user}', 'owner_gists_url': 'https://api.github.com/users/d3/gists{/gist_id}', 'owner_gravatar_id': '', 'owner_html_url': 'https://github.com/d3', 'owner_id': 1562726, 'owner_login': 'd3', 'owner_organizations_url': 'https://api.github.com/users/d3/orgs', 'owner_received_events_url': 'https://api.github.com/users/d3/received_events', 'owner_repos_url': 'https://api.github.com/users/d3/repos', 'owner_site_admin': False, 'owner_starred_url': 'https://api.github.com/users/d3/starred{/owner}{/repo}', 'owner_subscriptions_url': 'https://api.github.com/users/d3/subscriptions', 'owner_type': 'Organization', 'owner_url': 'https://api.github.com/users/d3', 'private': False, 'pulls_url': 'https://api.github.com/repos/d3/d3/pulls{/number}', 'pushed_at': '2017-05-16T14:50:33Z', 'releases_url': 'https://api.github.com/repos/d3/d3/releases{/id}', 'score': Decimal('7.8259406'), 'size': 41570, 'ssh_url': 'git@github.com:d3/d3.git', 'stargazers_count': 64706, 'stargazers_url': 'https://api.github.com/repos/d3/d3/stargazers', 'statuses_url': 'https://api.github.com/repos/d3/d3/statuses/{sha}', 'subscribers_url': 
'https://api.github.com/repos/d3/d3/subscribers', 'subscription_url': 'https://api.github.com/repos/d3/d3/subscription', 'svn_url': 'https://github.com/d3/d3', 'tags_url': 'https://api.github.com/repos/d3/d3/tags', 'teams_url': 'https://api.github.com/repos/d3/d3/teams', 'trees_url': 'https://api.github.com/repos/d3/d3/git/trees{/sha}', 'updated_at': '2017-05-25T21:34:57Z', 'url': 'https://api.github.com/repos/d3/d3', 'watchers': 64706, 'watchers_count': 64706}
# `watchers` is the pivot field to aggregate by
# `language` is the pivot field to group by
args = ('watchers', 'language')
# the pivot fields we want to include in each row
rows = ['has_pages', 'owner_type']
# Pivot with `sum` as the aggregation operator. Per the preview, each language
# becomes its own key holding a watcher total, next to the `rows` fields.
pivotted = pr.pivot(flat, *args, rows=rows, op=sum)
pivotted, preview = pr.peek(pivotted)
# notebook-style display of the first pivoted record
preview[0]
{'C#': 7772, 'C++': 58473, 'Go': 13510, 'Objective-C': 10702, 'Ruby': 7504, 'Swift': 27142, 'has_pages': False, 'owner_type': 'Organization'}
# `rows` are the fields we don't want to normalize (since `invert` is True)
kwargs = {'rows': rows, 'invert': True}
# `watchers` is the field to use for the normalized values
# `language` is the field to use for the normalized key
# Per the preview, this turns the per-language keys of a pivoted record back
# into separate records carrying `language` and `watchers` fields.
normal = pr.normalize(pivotted, *args, **kwargs)
normal, preview = pr.peek(normal)
# notebook-style display of the first normalized record
preview[0]
{'has_pages': False, 'language': 'Objective-C', 'owner_type': 'Organization', 'watchers': 10702}
# aggregate by `watchers`
agg_keyfunc = itemgetter('watchers')


# group by `has_pages` and `owner_type`
def group_keyfunc(x):
    """Return the values of the `rows` fields of record `x` as a tuple."""
    return tuple(x[r] for r in rows)


# picks the item with the largest `watchers` value from the iterable it is
# given; handed to `pr.group` as its `aggregator`
aggregator = partial(max, key=agg_keyfunc)

# Only emit the groups, not the group key (since `tupled` is False)
kwargs = {'tupled': False, 'aggregator': aggregator}
grouped = pr.group(normal, group_keyfunc, **kwargs)
grouped, preview = pr.peek(grouped)
# notebook-style display of the first aggregated record
preview[0]
{'has_pages': False, 'language': 'C++', 'owner_type': 'Organization', 'watchers': 58473}
from pprint import pprint

# Show the aggregated records, largest `watchers` value first.
sgrouped = sorted(grouped, key=agg_keyfunc, reverse=True)

for record in sgrouped:
    pprint(record)
{'has_pages': True, 'language': 'JavaScript', 'owner_type': 'Organization', 'watchers': 130152} {'has_pages': False, 'language': 'C++', 'owner_type': 'Organization', 'watchers': 58473} {'has_pages': False, 'language': 'Python', 'owner_type': 'User', 'watchers': 48543} {'has_pages': True, 'language': 'JavaScript', 'owner_type': 'User', 'watchers': 10285}