Marvel API: Boring Data!

Portland Data Science Meetup, November 2014

In [6]:
import md5
import os
import time
import urlparse

import bokeh.charts as bc
import matplotlib.pyplot as plt
import pandas as pd
import requests
import qgrid
import seaborn as sns

%matplotlib inline
qgrid.nbinstall()

# Quick and Dirty API wrapper

class Marvelous(object):
    """A microwrapper for the Marvel API.

    Every request is signed with the current timestamp and an MD5 hash of
    ``ts + private_key + public_key``; these are sent as the ``ts``,
    ``apikey`` and ``hash`` query parameters.
    """

    base_url = "http://gateway.marvel.com"

    def __init__(self, private_key=None, public_key=None):
        """Store the API credentials.

        :param private_key: Marvel private key. When omitted (or empty) the
            ``MARVEL_PRIVATE_KEY`` environment variable is used.
        :param public_key: Marvel public key. When omitted (or empty) the
            ``MARVEL_PUBLIC_KEY`` environment variable is used.
        """
        # Each key falls back to the environment on its own, so supplying
        # only one of them no longer discards it.
        self.private_key = private_key or os.getenv('MARVEL_PRIVATE_KEY')
        self.public_key = public_key or os.getenv('MARVEL_PUBLIC_KEY')

    def request(self, endpoint, **kwargs):
        """Make a signed GET request against the Marvel API.

        :param endpoint: Path joined onto ``base_url``,
            e.g. ``'/v1/public/characters'``.
        :param kwargs: Extra query parameters. They are merged in after the
            signing parameters, so they can override ``ts``/``apikey``/``hash``.
        :returns: The decoded JSON body of the response.
        :raises requests.exceptions.HTTPError: On a 4xx/5xx response.
        :raises TypeError: If either key is unset (``None`` cannot be joined
            into the string that gets hashed).
        """
        ts = str(int(time.time()))
        concat = ''.join([ts, self.private_key, self.public_key])
        query_params = {
            'ts': ts,
            'apikey': self.public_key,
            'hash': md5.new(concat).hexdigest()}
        query_params.update(kwargs)
        url = urlparse.urljoin(self.base_url, endpoint)
        resp = requests.get(url, params=query_params)
        # raise_for_status() is a no-op for successful responses, so there is
        # no need to special-case 200 (which used to return None for any
        # other non-error status).
        resp.raise_for_status()
        return resp.json()

    def get_character(self, character_name):
        """Get the summary data for a given character name.

        :param character_name: Value sent as the ``name`` query parameter.
        :returns: dict with ``name``, ``description``, ``id`` and the
            ``comic_count`` / ``story_count`` / ``series_count`` /
            ``event_count`` totals of the first result.
        :raises IndexError: If the API returns no results for that name.
        """
        resp = self.request('/v1/public/characters', name=character_name)
        results = resp['data']['results']
        if not results:
            # Same exception type as indexing an empty list, but with a
            # message that says what actually went wrong.
            raise IndexError(
                'No Marvel character found with name %r' % (character_name,))
        result = results[0]
        return {
            'name': result['name'],
            'description': result['description'],
            'id': result['id'],
            'comic_count': result['comics']['available'],
            'story_count': result['stories']['available'],
            'series_count': result['series']['available'],
            'event_count': result['events']['available']
        }
In [7]:
# No keys are passed, so the wrapper reads MARVEL_PRIVATE_KEY / MARVEL_PUBLIC_KEY from the environment
api = Marvelous()
In [8]:
# Look up one character by name; the returned summary dict is shown as the cell output
api.get_character('Captain America')
Out[8]:
{'comic_count': 1422,
 'description': u"Vowing to serve his country any way he could, young Steve Rogers took the super soldier serum to become America's one-man army. Fighting for the red, white and blue for over 60 years, Captain America is the living, breathing symbol of freedom and liberty.",
 'event_count': 19,
 'id': 1009220,
 'name': u'Captain America',
 'series_count': 375,
 'story_count': 2285}
In [9]:
# 1009220 is the Captain America id returned by get_character above
cap_comics = api.request(
    '/v1/public/characters/1009220/comics',
    dateRange='2010-01-01,2014-11-01',
    limit=100,
)
In [10]:
# Collect the distinct names of every character listed on any of these comics
comic_list = cap_comics['data']['results']
character_set = set(
    member['name']
    for issue in comic_list
    for member in issue['characters']['items']
)
In [11]:
# Keep only the comics that list more than MIN_CHARACTERS associated characters
MIN_CHARACTERS = 10
# Marvel's API will only accept up to 10 comic ids; this cell sends just the
# first MAX_COMIC_IDS of them
MAX_COMIC_IDS = 5

comic_ids = [
    str(com['id'])
    for com in comic_list
    if com['characters']['available'] > MIN_CHARACTERS
]
joined_ids = ','.join(comic_ids[:MAX_COMIC_IDS])
joined_ids
Out[11]:
'41278,35241,41191,42118,42117'
In [12]:
# Query the characters endpoint, filtered by the comic ids selected above (comics=...), asking for at most 50 results
all_characters = api.request('/v1/public/characters', comics=joined_ids, limit=50)
In [13]:
# Flatten each API character result into one flat record (same fields as
# Marvelous.get_character returns) so the list can be fed to a DataFrame
character_list = all_characters['data']['results']
records = [
    {
        'name': hero['name'],
        'description': hero['description'],
        'id': hero['id'],
        'comic_count': hero['comics']['available'],
        'story_count': hero['stories']['available'],
        'series_count': hero['series']['available'],
        'event_count': hero['events']['available'],
    }
    for hero in character_list
]
In [14]:
# One DataFrame row per character record, displayed with qgrid
hero_df = pd.DataFrame(records)
qgrid.show_grid(hero_df, remote_js=True)
In [30]:
# For quick and dirty exploratory analysis, I usually start with Pandas native plotting functionality
# Here: a bar chart of the comic_count column, one bar per row of hero_df
hero_df['comic_count'].plot(kind='bar')
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x10dc8bb50>
In [42]:
# Rank characters by comic_count (descending), then keep the name and
# comic_count columns of the first 10 rows.
# NOTE(review): DataFrame.sort is the older pandas spelling; newer releases
# provide sort_values instead — confirm against the installed pandas version.
top_10 = hero_df.sort('comic_count', ascending=False)[['name', 'comic_count']][:10]
qgrid.show_grid(top_10, remote_js=True)
In [32]:
# Bar chart of the top 10 only, using the name column for the x axis
top_10.plot(kind='bar', x='name')
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e087550>
In [33]:
# Ok, time to create something a bit nicer:
# the same top-10 data (name vs. comic_count) drawn as a seaborn bar factorplot
sns.factorplot('name', 'comic_count', data=top_10, kind='bar', aspect=2.0)
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x10f2922d0>
In [34]:
# Seaborn is very customizable: switch to the "whitegrid" style and the
# "muted" palette. x_order pins the bars to top_10's row order, i.e. the
# descending comic_count ranking computed earlier.
sns.set_style("whitegrid")
sns.factorplot('name', 'comic_count', data=top_10, kind='bar', aspect=2.5, 
               palette="muted", size=7, x_order=top_10['name'].tolist())
Out[34]:
<seaborn.axisgrid.FacetGrid at 0x10f1b8050>
In [35]:
# vincent is imported here, right before its first use, rather than in the import cell at the top
import vincent
vincent.initialize_notebook()
In [36]:
# Vincent bar chart of comic_count keyed on the name column, with explicit
# axis titles and a single bar colour; each chained call configures the chart
# returned by the previous one.
bar = (vincent.Bar(top_10, columns=['comic_count'], key_on='name')
              .common_axis_properties(title_size=15)
              .axis_titles(x='Character Name', y='Total Comic Book Count')
              .colors(range_=['#6a9fb5'])
              .x_axis_properties(title_offset=10)
              .y_axis_properties(title_offset=-30))
# Bare expression so the chart is rendered as the cell output
bar
Out[36]:
In [1]:
from IPython.core.display import HTML

# Load the notebook's custom stylesheet and inject it into the page.
# When running locally, read "styles/custom.css" instead;
# "custom.css" is the path used for nbviewer.
# The with-block closes the file handle as soon as the contents are read.
with open("custom.css", "r") as css_file:
    styles = css_file.read()

# Last expression of the cell: rendered as raw HTML by the notebook
HTML(styles)
Out[1]: