Metafiction

Data Analysis

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# %matplotlib inline
In [2]:
from wrangling import authors, stories, favourite_authors, favourite_stories, genres, categories

Similarity

I need to calculate a similarity measure between two authors/users.

Since I care about story preference, I'll use the Jaccard Index on favourite story sets.

In [3]:
def jaccard_index(set1, set2):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two collections.

    Parameters
    ----------
    set1, set2 : iterable of hashable items
        Sets are used as-is; any other iterable (list, tuple, pandas Series,
        ...) is converted to a set first, so duplicates are ignored.

    Returns
    -------
    float
        A value in [0.0, 1.0]. Two empty collections have an undefined ratio
        (0/0); 0.0 is returned for that case.
    """
    first, second = set(set1), set(set2)
    shared = len(first & second)
    # |A ∪ B| = |A| + |B| - |A ∩ B|, which avoids building the union set.
    combined = len(first) + len(second) - shared
    if combined == 0:
        return 0.0
    # float() keeps true division even under Python 2 integer semantics.
    return shared / float(combined)

Scoring Authors

Comparing the stories each author has written or favourited with my own favourites

In [4]:
def author_similarity(author, my_stories):
    """Score how much an author's stories overlap with ``my_stories``.

    The author's story set is the union of the stories they wrote (rows of
    ``stories`` whose "author" column equals ``author.name``) and the stories
    they favourited (looked up in ``favourite_stories``).

    Parameters
    ----------
    author : pandas.Series
        A row of ``authors`` as passed by ``DataFrame.apply(..., axis=1)``.
        Only ``author.name`` (the row's index label) is read.
    my_stories : iterable
        Story ids to compare against.

    Returns
    -------
    The Jaccard index between the author's story set and ``my_stories``.
    """
    author_id = author.name
    written = set(stories[stories["author"] == author_id].index)

    if author_id in favourite_stories:
        # `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
        # NOTE(review): if this lookup can return a single scalar id (an
        # author with exactly one favourite), set() would split a string id
        # into characters -- confirm the shape of `favourite_stories`.
        favourited = set(favourite_stories.loc[author_id])
    else:
        favourited = set()

    return jaccard_index(written.union(favourited), set(my_stories))
In [5]:
# Ids (as strings) of my own favourite stories; every author is scored
# against this list.
my_fav_stories = [
    "8096183",  # Harry Potter and the Natural 20
    "9794740",  # Pokemon: The Origin of Species
    "9311012",  # Lighting up the Dark
    "5782108",  # Harry Potter and the Methods of Rationality
    "7354757",  # The Game of Champions
    "5193644",  # Time Braid
    "3695087",  # Larceny, Lechery and Luna Lovegood
    "9669819",  # The Two Year Emperor
]
In [6]:
# Score every author row against my favourites and store the result in a
# "similarity" column on `authors`.
authors["similarity"] = authors.apply(
    lambda author_row: author_similarity(author_row, my_fav_stories),
    axis=1,
)
In [7]:
# The five authors most similar to me. `DataFrame.sort` was removed in
# pandas 0.20; `sort_values` is its replacement.
authors.sort_values("similarity", ascending=False).head(5)
Out[7]:
name similarity
author
4976703 alexanderwales 0.384615
5118664 daystar721 0.333333
3989854 Sir Poley 0.222222
4767519 Scientist's Thesis 0.187500
3344060 Velorien 0.166667

Scoring Stories

Scoring each story by the average similarity of its writer and of every author who favourited it

In [8]:
# sim_total: the writer's similarity plus the similarity of every author who
# favourited the story. Seed it with the writer's similarity.
# `.reindex` replaces the `.ix` indexer (removed in pandas 1.0) and yields NaN
# for a writer that is missing from `authors` instead of raising.
stories["sim_total"] = authors["similarity"].reindex(stories["author"]).values

# sim_count: 1 for the writer, plus 1 for each author who favourited the story.
stories["sim_count"] = 1

for author_id, author_row in authors.iterrows():
    author_favs = favourite_stories.get(author_id)
    if author_favs is None:
        # No favourites recorded for this author, so nothing to add.
        continue
    stories.loc[author_favs, "sim_total"] += author_row["similarity"]
    stories.loc[author_favs, "sim_count"] += 1

# sim_score: mean similarity over the writer and all favouriting authors.
stories["sim_score"] = stories["sim_total"].div(stories["sim_count"])

Stories by average score

In [9]:
# Top five stories by mean similarity. `DataFrame.sort` was removed in
# pandas 0.20; `sort_values` is its replacement.
stories.sort_values("sim_score", ascending=False)[
    ["title", "sim_total", "sim_count", "sim_score"]
].head(5)
Out[9]:
title sim_total sim_count sim_score
story
10327510 A Bluer Shade of White 0.384615 1 0.384615
10023949 Harry Potter and the Philosopher\'s Zombie 0.717949 2 0.358974
9676374 Daystar\'s Remix of Rationality 0.333333 1 0.333333
9794740 Pokemon: The Origin of Species 0.967949 4 0.241987
9658524 Branches on the Tree of Time 0.469361 2 0.234681

Stories by total similarity

In [10]:
# Top five stories by summed similarity. `DataFrame.sort` was removed in
# pandas 0.20; `sort_values` is its replacement.
stories.sort_values("sim_total", ascending=False)[
    ["title", "sim_total", "sim_count", "sim_score"]
].head(5)
Out[10]:
title sim_total sim_count sim_score
story
8096183 Harry Potter and the Natural 20 1.573355 18 0.087409
5782108 Harry Potter and the Methods of Rationality 1.486540 18 0.082586
9794740 Pokemon: The Origin of Species 0.967949 4 0.241987
10023949 Harry Potter and the Philosopher\'s Zombie 0.717949 2 0.358974
10360716 The Metropolitan Man 0.705458 5 0.141092

Stories by times favourited

In [11]:
# Top five stories by sim_count (favourites plus 1 for the writer).
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is its replacement.
stories.sort_values("sim_count", ascending=False)[
    ["title", "sim_total", "sim_count", "sim_score"]
].head(5)
Out[11]:
title sim_total sim_count sim_score
story
8096183 Harry Potter and the Natural 20 1.573355 18 0.087409
5782108 Harry Potter and the Methods of Rationality 1.486540 18 0.082586
2731239 Team 8 0.193825 14 0.013845
5193644 Time Braid 0.683962 13 0.052612
5409165 It\'s For a Good Cause, I Swear! 0.058134 11 0.005285