#!/usr/bin/env python
# coding: utf-8

# Exploratory look at a book-ratings dataset. Ratings, to-read lists, books,
# tags and tag applications are loaded from CSV files in the working directory.
#
# This file is a notebook export: bare expressions such as `r.head()` are cell
# outputs and only display something when run cell-by-cell in Jupyter/IPython.

# In[1]:


import pandas as pd

# The inline-plot magic only exists under IPython/Jupyter. When this export is
# run as a plain script `get_ipython` is undefined, so skip the magic instead
# of crashing before any data is loaded.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

r = pd.read_csv('ratings.csv')
tr = pd.read_csv('to_read.csv')
b = pd.read_csv('books.csv')
t = pd.read_csv('tags.csv')
bt = pd.read_csv('book_tags.csv')


# ### Ratings

# In[2]:


r.head()


# Ratings are sorted chronologically, oldest first.

# In[3]:


len(r)


# In[4]:


r.rating.hist(bins=5)


# It appears that 4 is the most popular rating. There are relatively few ones
# and twos.

# ### To read

# These are the books users marked "to read".

# In[5]:


tr.head()


# In[6]:


len(tr)


# Most books have been marked to read by somebody. The majority of users have
# some books marked to read.

# In[7]:


len(tr.book_id.unique())


# In[8]:


len(tr.user_id.unique())


# ### Books

# In[9]:


b.head()


# Books are sorted by their popularity, as measured by number of ratings
# (overall, not in this dataset).

# In[10]:


len(b)


# ### User and book IDs

# In[12]:


r.user_id.max()


# In[14]:


r.book_id.max()


# The IDs are contiguous.

# In[47]:


# The number of distinct IDs equals the largest ID only when there are no gaps
# (presumably IDs are positive integers starting at 1 -- confirm against data).
assert len(r.user_id.unique()) == r.user_id.max(), 'user IDs are not contiguous'
assert len(r.book_id.unique()) == r.book_id.max(), 'book IDs are not contiguous'


# ### Reviews per book

# Most books have a few hundred reviews, but some have as few as eight.

# In[15]:


# size() counts the rows in each group, which is what the original
# apply(lambda x: len(x)) computed, without a Python-level call per group.
reviews_per_book = r.groupby('book_id').book_id.size()
reviews_per_book.describe()


# In[16]:


reviews_per_book.sort_values().head(10)


# ### Reviews per user

# All users have at least 19 reviews.

# In[17]:


reviews_per_user = r.groupby('user_id').user_id.size()
reviews_per_user.describe()


# In[18]:


reviews_per_user.sort_values().head(10)


# ### Tags

# In[19]:


t.head()


# In[20]:


len(t)


# In[21]:


bt.head()


# Tag applications are sorted by goodreads_book_id ASC and by count DESC.

# In[22]:


len(bt)


# Let us merge tag names into tag applications.

# In[ ]:


# merge() defaults to an inner join, so tag applications whose tag_id has no
# match in `t` are dropped.
bt = bt.merge(t, on='tag_id')


# Why don't we merge book titles for good measure?

# In[31]:


bt = bt.merge(b[['goodreads_book_id', 'title']], on='goodreads_book_id')


# It appears that there are some negative tag counts. Let's fix this.

# In[40]:


bt['count'].describe()


# In[41]:


# Clamp negative counts to zero; the weighted sample below also rejects
# negative weights.
bt.loc[bt['count'] < 0, 'count'] = 0


# Sample some popular book tags.

# In[42]:


bt.sample(10, weights='count')


# These are the most popular tags:

# In[28]:


# Popularity here is the number of tag-application rows per tag name, not the
# sum of the `count` column.
tag_counts = bt.groupby('tag_name').tag_name.count().sort_values(ascending=False)
tag_counts.head(20)


# In[ ]: