import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# %matplotlib inline
import json
metafiction = [json.loads(x) for x in open("metafiction.dat")]
len(metafiction)
author_list = [{"author": rec["id"], "name": rec["name"]} for rec in metafiction]
len(author_list)
for record in metafiction:
for story in record["favourite-stories"]:
author_list.append({"author": story["author"]})
for author in record["favourite-authors"]:
author_list.append({"author": author})
len(author_list)
authors = DataFrame(author_list)
authors.drop_duplicates(["author"], inplace=True)
authors.set_index(["author"], inplace=True)
len(authors)
authors.ix[[0]]
story_list = []
for record in metafiction:
story_list.extend(record["author-stories"])
story_list.extend(record["favourite-stories"])
len(story_list)
stories = DataFrame(story_list)
## rename columns
columns = stories.columns.values
columns[3] = u"is_complete"
columns[4] = u"submitted"
columns[5] = u"updated"
columns[9] = u"story"
stories.columns = columns
stories.drop_duplicates(["story"], inplace=True)
stories.set_index("story", inplace=True)
stories["submitted"] = stories["submitted"].astype("datetime64")
stories["updated"] = stories["updated"].astype("datetime64")
len(stories)
stories.ix[[0]]
favourite_author_list = []
favourite_story_list = []
for record in metafiction:
for author in record["favourite-authors"]:
favourite_author_list.append({"author": record["id"],
"favourite_author": author})
for story in record["favourite-stories"]:
favourite_story_list.append({"author": record["id"],
"favourite_story": story["id"]})
(len(favourite_author_list), len(favourite_story_list))
favourite_authors = DataFrame(favourite_author_list)
favourite_authors.set_index("author", inplace=True)
favourite_authors = favourite_authors["favourite_author"]
favourite_stories = DataFrame(favourite_story_list)
favourite_stories.set_index("author", inplace=True)
favourite_stories = favourite_stories["favourite_story"]
favourite_authors.ix[[0]]
favourite_stories.ix[[0]]
genre_list = sorted(set.union(*[set(g) for g in stories["genres"]]))
genres = DataFrame(data=np.zeros((len(stories), len(genre_list))), columns=genre_list, index=stories.index)
category_list = sorted(set.union(*[set(c) for c in stories["categories"]]))
categories = DataFrame(data=np.zeros((len(stories), len(category_list))), columns=category_list, index=stories.index)
for story in stories.index:
genres.ix[story, stories.ix[story, "genres"]] = 1
categories.ix[story, stories.ix[story, "categories"]] = 1
genres.ix[[0]]
categories.ix[[0]]