#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Ask the notebook to render matplotlib figures inline with the cell output.
# NOTE(review): get_ipython is neither defined nor imported in this file; it is
# presumably injected by the IPython/Jupyter kernel, so this line is expected
# to fail under a plain `python` interpreter - confirm before running as a script.
get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd

# Load the five CSV tables (paths are relative to the working directory).
# Short names used throughout the rest of the notebook:
#   r  - ratings            tr - "to read" marks      b - books
#   t  - tags               bt - tag applications (book/tag pairs)
r = pd.read_csv('ratings.csv')
tr = pd.read_csv('to_read.csv')
b = pd.read_csv('books.csv')

t = pd.read_csv('tags.csv')
bt = pd.read_csv('book_tags.csv')


# ### Ratings

# In[2]:


# Preview the first rows of the ratings table.
r.head()


# Ratings are sorted chronologically, oldest first.

# In[3]:


# Number of rows in the ratings table.
r.shape[0]


# In[4]:


# Histogram of the rating values, drawn with five bins.
r['rating'].hist(bins=5)


# It appears that 4 is the most popular rating. There are relatively few ones and twos.

# ### To read
# These are the books users marked "to read".
# 

# In[5]:


# Preview the first rows of the to-read table.
tr.head()


# In[6]:


# Number of rows in the to-read table.
tr.shape[0]


# Most books have been marked to read by somebody. The majority of users have some books marked to read.

# In[7]:


# Number of distinct books appearing in the to-read table.
# dropna=False keeps the count identical to len(Series.unique()).
tr['book_id'].nunique(dropna=False)


# In[8]:


# Number of distinct users appearing in the to-read table.
tr['user_id'].nunique(dropna=False)


# ### Books

# In[9]:


# Preview the first rows of the books table.
b.head()


# Books are sorted by their popularity, as measured by number of ratings (overall, not in this dataset).

# In[10]:


# Number of rows in the books table.
b.shape[0]


# ### User and book IDs

# In[12]:


# Largest user ID present in the ratings table.
r['user_id'].max()


# In[14]:


# Largest book ID present in the ratings table.
r['book_id'].max()


# The IDs are contiguous.

# In[47]:


# Sanity-check that the IDs in the ratings table form the contiguous range
# 1..max. For integer IDs this holds exactly when the smallest ID is 1 and
# the number of distinct IDs equals the largest ID; the distinct-count check
# alone does not pin the lower bound.
# `assert` is written as a statement (no call-style parentheses) so that the
# failure message is not accidentally folded into an always-true tuple.
assert r.user_id.min() == 1, 'user IDs do not start at 1'
assert r.user_id.nunique() == r.user_id.max(), 'user IDs are not contiguous'
assert r.book_id.min() == 1, 'book IDs do not start at 1'
assert r.book_id.nunique() == r.book_id.max(), 'book IDs are not contiguous'


# ### Reviews per book

# Most books have a few hundred reviews, but some have as few as eight.

# In[15]:


# Number of ratings per book, as a Series indexed by book_id.
# GroupBy.size() counts the rows of each group natively; it replaces the
# per-group Python callback apply(lambda x: len(x)) and yields the same
# values, index and Series name.
reviews_per_book = r.groupby('book_id').book_id.size()
reviews_per_book.describe()


# In[16]:


# The ten books with the fewest ratings.
reviews_per_book.sort_values().head(10)


# ### Reviews per user

# All users have at least 19 reviews.

# In[17]:


# Number of ratings per user, as a Series indexed by user_id.
# GroupBy.size() counts the rows of each group natively; it replaces the
# per-group Python callback apply(lambda x: len(x)) and yields the same
# values, index and Series name.
reviews_per_user = r.groupby('user_id').user_id.size()
reviews_per_user.describe()


# In[18]:


# The ten users with the fewest ratings.
reviews_per_user.sort_values().head(10)


# ### Tags

# In[19]:


# Preview the first rows of the tags table.
t.head()


# In[20]:


# Number of rows in the tags table.
t.shape[0]


# In[21]:


# Preview the first rows of the tag-applications table.
bt.head()


# Tag applications are sorted by goodreads_book_id ASC and by count DESC.

# In[22]:


# Number of rows in the tag-applications table.
bt.shape[0]


# Let us merge tag names into tag applications.

# In[ ]:


# Attach the columns of the tags table to every tag application via tag_id.
# how='inner' is the pandas default, spelled out here: applications whose
# tag_id has no match in `t` are dropped.
# NOTE: re-running this cell merges again onto the already merged frame.
bt = bt.merge(t, how='inner', on='tag_id')


# Let's also merge in the book titles for good measure.

# In[31]:


# Attach each book's title to its tag applications via goodreads_book_id.
# Only the key and the title are taken from `b`; how='inner' is the pandas
# default, spelled out here, so applications without a matching book are dropped.
bt = bt.merge(
    b.loc[:, ['goodreads_book_id', 'title']],
    how='inner',
    on='goodreads_book_id',
)


# It appears that there are some negative tag counts. Let's fix this.

# In[40]:


# Summary statistics of the tag-application counts.
# ('count' needs bracket access: bt.count is the DataFrame.count method.)
bt['count'].describe()


# In[41]:


# Raise every negative count to zero; non-negative values are left untouched.
bt['count'] = bt['count'].clip(lower=0)


# Sample some popular book tags.

# In[42]:


# Draw ten random tag applications, using the 'count' column as sampling
# weights so that heavily used book/tag pairs are more likely to be picked.
# No random_state is set, so the rows differ from run to run.
bt.sample(n=10, weights='count')


# These are the most popular tags:

# In[28]:


# For every tag name, count the rows of `bt` carrying it (i.e. how many tag
# applications use the tag - not the sum of the 'count' column), then order
# the tags from most to least frequently applied.
tag_counts = bt.groupby('tag_name')['tag_name'].count()
tag_counts = tag_counts.sort_values(ascending=False)
tag_counts.head(20)


# In[ ]: