import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# %matplotlib inline
from wrangling import authors, stories, favourite_authors, favourite_stories, genres, categories
I'm not interested in all stories, just certain genres and categories.
It would be nice to be able to focus only on those.
def jaccard_index(set1, set2):
    """Return the Jaccard index of two collections.

    The index is the size of the intersection divided by the size of the
    union. Duplicate items in either argument are counted once.

    Args:
        set1: First collection; any iterable of hashable items.
        set2: Second collection; any iterable of hashable items.

    Returns:
        The ratio as a float, or 0 when both collections are empty (the
        union is empty, so the ratio would otherwise be undefined).
    """
    # Normalise to sets so callers may pass lists, tuples or index objects.
    first, second = set(set1), set(set2)
    union_size = len(first | second)
    if union_size == 0:
        return 0
    # float() keeps this a true division under Python 2 as well.
    return len(first & second) / float(union_size)
from collections import namedtuple
# Record with one slot per dataset imported from `wrangling`, so that a
# filtered view of all six can be passed around as a single value.
Metafiction = namedtuple(
    "Metafiction",
    "authors stories favourite_authors favourite_stories genres categories",
)
def story_subset(genres_list=None, categories_list=None):
    """Return the ids of stories matching the given genres and categories.

    A story matches a list when the sum of its values over the listed
    columns is positive, i.e. it carries at least one of the listed labels
    (presumably `genres` and `categories` hold one indicator column per
    label, indexed by story id -- confirm against `wrangling`).

    Args:
        genres_list: List of column names of the module-level `genres`
            frame, or None for no genre filter.
        categories_list: List of column names of the module-level
            `categories` frame, or None for no category filter.

    Returns:
        The set of index labels that pass both filters.
    """
    def _matching_ids(flags, wanted):
        # `is None` rather than `== None`: the latter is evaluated
        # element-wise (and has no single truth value) for array-likes.
        if wanted is None:
            return set(flags.index)
        has_any = flags[wanted].sum(axis=1) > 0
        return set(flags[has_any].index)

    genre_ids = _matching_ids(genres, genres_list)
    category_ids = _matching_ids(categories, categories_list)
    return genre_ids.intersection(category_ids)
def metafiction_subset(ids=None):
    """Return a Metafiction whose datasets only refer to the given story ids.

    Args:
        ids: Iterable of story ids (index labels of the module-level
            `stories`, `genres` and `categories` frames). A set, such as
            the result of `story_subset`, is accepted. None (the default)
            is treated as an empty selection.

    Returns:
        A Metafiction with:
        - stories: the rows of `stories` for `ids`;
        - authors: the rows of `authors` for the authors of those stories;
        - favourite_stories: the entries whose value is one of `ids`;
        - favourite_authors: the entries whose value is a selected author;
        - genres / categories: the rows for `ids`, keeping only the columns
          whose sum over those rows is positive.

    Raises:
        KeyError: with current pandas, if an id is missing from an index
            it is looked up in.
    """
    # Materialise as a list: avoids a shared mutable default argument and
    # current pandas refuses sets as indexers.
    ids = [] if ids is None else list(ids)

    # `.loc` replaces the removed `.ix`; every lookup here is by label.
    f_stories = stories.loc[ids]
    f_authors = authors.loc[f_stories["author"].drop_duplicates()]

    f_fav_stories = favourite_stories[favourite_stories.isin(ids)]
    f_fav_authors = favourite_authors[favourite_authors.isin(f_authors.index)]

    # Slice the rows once, then drop the columns no selected story uses.
    f_genres = genres.loc[ids]
    f_genres = f_genres.loc[:, f_genres.sum() > 0]
    f_categories = categories.loc[ids]
    f_categories = f_categories.loc[:, f_categories.sum() > 0]

    return Metafiction(authors=f_authors, stories=f_stories,
                       favourite_authors=f_fav_authors,
                       favourite_stories=f_fav_stories,
                       genres=f_genres, categories=f_categories)
An `author_similarity` function
that accepts a Metafiction object as an argument
def author_similarity(author, my_stories, mfobj):
    """Return the Jaccard index between an author's stories and `my_stories`.

    The author's stories are the union of the stories they wrote and the
    stories they marked as favourites, both taken from `mfobj`.

    Args:
        author: A row of the authors frame; only `author.name` (the row's
            index label, i.e. the author id) is used.
        my_stories: Set of story ids to compare against.
        mfobj: Metafiction object supplying `stories` and
            `favourite_stories` (the latter presumably a Series of story
            ids indexed by author id -- confirm against `wrangling`).

    Returns:
        The value of `jaccard_index` for the two sets of story ids.
    """
    author_id = author.name
    wrote = mfobj.stories["author"] == author_id
    author_stories = set(mfobj.stories[wrote].index)

    favourites = mfobj.favourite_stories
    if author_id in favourites:
        # List indexer: always yields a Series, even for a single
        # favourite. A scalar lookup would return one id string, which
        # set() would split into characters.
        author_favourites = set(favourites.loc[[author_id]])
    else:
        author_favourites = set()

    all_stories = author_stories.union(author_favourites)
    return jaccard_index(all_stories, my_stories)
Stories about Harry Potter, Naruto and Pokemon
# Select the stories carrying at least one of these categories (no genre
# filter), then build the Metafiction view restricted to those stories.
story_ids = story_subset(categories_list=[u"Harry Potter", u"Naruto", u"Pokémon"])
mfobj = metafiction_subset(story_ids)
# Story ids of my own favourites: the reference set for the similarity scores.
my_fav_stories = ["8096183", # Harry Potter and the Natural 20
"9794740", # Pokemon, The Origin of Species
"9311012", # Lighting up the Dark
"5782108", # Harry Potter and the Methods of Rationality
"7354757", # The Game of Champions
"5193644", # Time Braid
"3695087", # Larceny, Lechery and Luna Lovegood
"9669819", # The Two Year Emperor
]
# Keep only the favourites that are part of the selected stories.
filtered_favs = set(my_fav_stories).intersection(story_ids)
# Bare expression: shows the resulting set as the notebook cell's output.
filtered_favs
{'3695087', '5193644', '5782108', '7354757', '8096183', '9311012', '9794740'}
# Score every author in the subset against my filtered favourites; with
# axis=1 each row of the authors frame is passed to author_similarity.
auth_sim_series = mfobj.authors.apply(author_similarity, axis=1, args=(filtered_favs, mfobj))
# Name the Series so that join() adds it as a "similarity" column.
auth_sim_series.name = "similarity"
auth_sim = mfobj.authors.join(auth_sim_series)
# Five most similar authors. DataFrame.sort() has been removed from pandas;
# sort_values() is its replacement.
auth_sim.sort_values("similarity", ascending=False)[:5]
| author | name | similarity |
|---|---|---|
| 4976703 | alexanderwales | 0.500000 |
| 5118664 | daystar721 | 0.400000 |
| 2269863 | Less Wrong | 0.185185 |
| 5111102 | EagleJarl | 0.176471 |
| 3344060 | Velorien | 0.166667 |
# Per-story accumulators: the summed similarity of everyone linked to the
# story, how many people were summed, and the resulting mean.
story_sim_values = DataFrame(index=mfobj.stories.index, columns=["sim_total", "sim_count", "similarity"])
# the writer's similarity + count
# story_sim_values shares its index with mfobj.stories, so the "author"
# column is already aligned row for row; `.loc` replaces the removed `.ix`.
story_sim_values["sim_total"] = auth_sim.loc[mfobj.stories["author"].values, "similarity"].values
story_sim_values["sim_count"] = 1
# Add the similarity of every author who marked the story as a favourite.
for author_id, author_row in auth_sim.iterrows():
    # Explicit dtype: a bare Series() triggers a pandas warning about the
    # default dtype of empty Series.
    author_favs = mfobj.favourite_stories.get(author_id, Series(dtype=object))
    story_sim_values.loc[author_favs, "sim_total"] += author_row["similarity"]
    story_sim_values.loc[author_favs, "sim_count"] += 1
# Mean similarity over the writer and all favouriting authors.
story_sim_values["similarity"] = story_sim_values["sim_total"].div(story_sim_values["sim_count"])
story_sim = mfobj.stories.join(story_sim_values)
# Top five stories by mean similarity (sort_values replaces the removed sort).
story_sim.sort_values("similarity", ascending=False)[:5][["title", "sim_total", "sim_count", "similarity"]]
| story | title | sim_total | sim_count | similarity |
|---|---|---|---|---|
| 10023949 | Harry Potter and the Philosopher's Zombie | 0.900000 | 2 | 0.450000 |
| 9676374 | Daystar's Remix of Rationality | 0.400000 | 1 | 0.400000 |
| 9794740 | Pokemon: The Origin of Species | 0.953571 | 3 | 0.317857 |
| 5300280 | The Natural History of Pokemon | 0.400000 | 2 | 0.200000 |
| 10069991 | A Wizard Named Harry in 505 Words | 0.176471 | 1 | 0.176471 |
# Top five stories by summed similarity. DataFrame.sort() has been removed
# from pandas; sort_values() is its replacement.
story_sim.sort_values("sim_total", ascending=False)[:5][["title", "sim_total", "sim_count", "similarity"]]
| story | title | sim_total | sim_count | similarity |
|---|---|---|---|---|
| 8096183 | Harry Potter and the Natural 20 | 1.920222 | 17 | 0.112954 |
| 5782108 | Harry Potter and the Methods of Rationality | 1.576022 | 16 | 0.098501 |
| 9794740 | Pokemon: The Origin of Species | 0.953571 | 3 | 0.317857 |
| 10023949 | Harry Potter and the Philosopher's Zombie | 0.900000 | 2 | 0.450000 |
| 5193644 | Time Braid | 0.899144 | 10 | 0.089914 |
# Top five stories by number of contributing authors. DataFrame.sort() has
# been removed from pandas; sort_values() is its replacement.
story_sim.sort_values("sim_count", ascending=False)[:5][["title", "sim_total", "sim_count", "similarity"]]
| story | title | sim_total | sim_count | similarity |
|---|---|---|---|---|
| 8096183 | Harry Potter and the Natural 20 | 1.920222 | 17 | 0.112954 |
| 5782108 | Harry Potter and the Methods of Rationality | 1.576022 | 16 | 0.098501 |
| 2731239 | Team 8 | 0.345915 | 12 | 0.028826 |
| 5409165 | It's For a Good Cause, I Swear! | 0.098002 | 11 | 0.008909 |
| 5453054 | His Own Man | 0.091417 | 10 | 0.009142 |