Metafiction

Data Wrangling

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# %matplotlib inline
In [2]:
import json

# Parse the dataset: every line of metafiction.dat is decoded as one JSON
# document. The context manager closes the file handle once parsing is done
# instead of leaving it open for the lifetime of the kernel.
with open("metafiction.dat") as data_file:
    metafiction = [json.loads(line) for line in data_file]

len(metafiction)
Out[2]:
100

authors

In [3]:
# Seed the author rows with the profiled users themselves: each record
# contributes its own id (stored under "author") together with its name.
author_list = [
    {"author": profile["id"], "name": profile["name"]}
    for profile in metafiction
]

len(author_list)
Out[3]:
100
In [4]:
# Append a row for every author the records refer to: first the author of
# each favourite story, then each favourite author. These rows carry only
# the "author" key (no "name").
for record in metafiction:
    author_list.extend(
        {"author": fav_story["author"]} for fav_story in record["favourite-stories"]
    )
    author_list.extend(
        {"author": fav_author} for fav_author in record["favourite-authors"]
    )

len(author_list)
Out[4]:
10694
In [5]:
# Reduce the raw rows to a single row per author id and index the table by
# that id. Built as one chain so re-running the cell always starts from
# author_list rather than from a frame that was already modified.
authors = (
    DataFrame(author_list)
    .drop_duplicates(["author"])
    .set_index(["author"])
)

len(authors)
Out[5]:
5170
In [6]:
# Show the first row by position. `.iloc` is explicit about positional
# access; the old `.ix` indexer was ambiguous and has been removed from pandas.
authors.iloc[[0]]
Out[6]:
name
author
5111102 EagleJarl

stories

In [7]:
# Pool the story entries of every record: the record's own stories first,
# followed by its favourite stories.
story_list = []

for record in metafiction:
    for key in ("author-stories", "favourite-stories"):
        story_list += record[key]

len(story_list)
Out[7]:
11052
In [8]:
stories = DataFrame(story_list)

# The renames below are positional, so pin the column order first. The
# displayed output of this notebook shows the columns in alphabetical order
# (older pandas sorted dict keys); newer pandas keeps key insertion order,
# which would make positions 3/4/5/9 point at different columns.
stories = stories[sorted(stories.columns)]

## rename columns
# Work on a list copy: assigning into `stories.columns.values` would mutate
# the Index's backing array in place, which pandas does not support.
columns = stories.columns.tolist()
columns[3] = u"is_complete"
columns[4] = u"submitted"
columns[5] = u"updated"
columns[9] = u"story"
stories.columns = columns

# One row per story id, indexed by that id.
stories = stories.drop_duplicates(["story"]).set_index("story")

# Spell out the unit: a bare "datetime64" dtype is rejected by recent pandas.
stories["submitted"] = stories["submitted"].astype("datetime64[ns]")
stories["updated"] = stories["updated"].astype("datetime64[ns]")

len(stories)
Out[8]:
9089
In [9]:
# Show the first row by position. `.iloc` is explicit about positional
# access; the old `.ix` indexer was ambiguous and has been removed from pandas.
stories.iloc[[0]]
Out[9]:
author categories chapters is_complete submitted updated favourites follows genres language rating reviews title word-count
story
9669819 5111102 [Dungeons and Dragons] 76 False 2013-09-08 11:03:42 2014-12-06 17:56:42 425 483 [Adventure, Fantasy] English T 773 The Two Year Emperor 309723

favourites

In [10]:
# Build (record id, favourite) pairs: one list for favourite authors and one
# for favourite stories, each entry keyed by the id of the record that
# listed the favourite.
favourite_author_list = []
favourite_story_list = []

for record in metafiction:
    favourite_author_list.extend(
        {"author": record["id"], "favourite_author": author}
        for author in record["favourite-authors"]
    )
    favourite_story_list.extend(
        {"author": record["id"], "favourite_story": story["id"]}
        for story in record["favourite-stories"]
    )

(len(favourite_author_list), len(favourite_story_list))
Out[10]:
(1211, 9383)
In [11]:
# Convert each pair list into a Series: the index is the "author" id of the
# record that listed the favourite, the value is the favourite's id.
favourite_authors = (
    DataFrame(favourite_author_list)
    .set_index("author")
    ["favourite_author"]
)

favourite_stories = (
    DataFrame(favourite_story_list)
    .set_index("author")
    ["favourite_story"]
)
In [12]:
# Show the first entry by position. `.iloc` is explicit about positional
# access; the old `.ix` indexer was ambiguous and has been removed from pandas.
favourite_authors.iloc[[0]]
Out[12]:
author
5111102    4976703
Name: favourite_author, dtype: object
In [13]:
# Show the first entry by position. `.iloc` is explicit about positional
# access; the old `.ix` indexer was ambiguous and has been removed from pandas.
favourite_stories.iloc[[0]]
Out[13]:
author
5111102    8096183
Name: favourite_story, dtype: object

genres and categories

In [14]:
# Build one indicator table per tag type: rows are stories (same index as
# `stories`), columns are the sorted distinct genres / categories found
# across all stories, and cells start at 0.
# `set().union(...)` also works when there are no stories at all, whereas
# `set.union(*[])` raises TypeError.
genre_list = sorted(set().union(*stories["genres"]))
genres = DataFrame(data=np.zeros((len(stories), len(genre_list))), columns=genre_list, index=stories.index)

category_list = sorted(set().union(*stories["categories"]))
categories = DataFrame(data=np.zeros((len(stories), len(category_list))), columns=category_list, index=stories.index)

# Mark each story's own genres and categories with 1. Access is by story
# label, so use `.loc`; the old `.ix` indexer has been removed from pandas.
for story in stories.index:
    genres.loc[story, stories.loc[story, "genres"]] = 1
    categories.loc[story, stories.loc[story, "categories"]] = 1
In [15]:
# Show the first row by position. `.iloc` is explicit about positional
# access; the old `.ix` indexer was ambiguous and has been removed from pandas.
genres.iloc[[0]]
Out[15]:
Adventure Angst Comfort Crime Drama Family Fantasy Friendship Horror Humor ... Mystery Parody Poetry Romance Sci-Fi Spiritual Supernatural Suspense Tragedy Western
story
9669819 1 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 21 columns

In [16]:
# Show the first row by position. `.iloc` is explicit about positional
# access; the old `.ix` indexer was ambiguous and has been removed from pandas.
categories.iloc[[0]]
Out[16]:
.hack/SIGN 10th Kingdom 1984 24 30 Rock A Certain Scientific Railgun/とある科学の超電磁砲 A song of Ice and Fire A-Team Addams Family Advance Wars ... Yami no Matsuei Young Justice Young Wizards Yu Yu Hakusho Yu-Gi-Oh Zatch Bell Zoids iCarly the X-Men xxxHOLiC
story
9669819 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 541 columns