from IPython.core.display import Image Image(filename='screencaps/pandas_goals.png') # From Python for Data Analysis (Wes McKinney), page 111: import numpy as np import pandas as pd import os pd.set_printoptions(notebook_repr_html=True) # your choice- will format prettily for html in nb. # Defining a trivial data frame, by hand, from an example by @fonnesbeck df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6, 'B' : ['A', 'B', 'C'] * 8, 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, 'D' : np.random.randn(24), 'E' : np.random.randn(24), 'F' : pd.DateRange(start='4/1/2012', periods=24)}) df base_dir = 'moviedata' unames = ['user_id', 'gender', 'age', 'occupation', 'zip'] users = pd.read_table(os.path.join(base_dir, 'users.dat'), sep='::', header=None, names=unames) users from IPython.core.display import Image Image(filename='screencaps/read_csv.png') # From Python for Data Analysis (Wes McKinney), chapter 6: users[:12] users['age']#[:10] # the [:10] is a slice - a short primer on slice syntax in python: http://stackoverflow.com/questions/509211/good-primer-for-python-slice-notation users.get_value(5,'age') #5th item, column age users['age']<10 #users[users['age'] < 10] users['age'].values #list(users['age'].values) # to convert to a simple list again. users.describe() users[users['gender']=='F']['age'].mean() # try changing it to 'M' too? users.boxplot(column="age", by="gender") users[users['gender']=='F']['age'].hist() # do this with =='M' instead to see the subtle contrast. # If you want, insert a new cell below this one and do it there. users.groupby('gender').count() # users.groupby('gender')['age'].count() from IPython.core.display import Image Image(filename='screencaps/desc_stats.png') # From Python for Data Analysis (Wes McKinney), page 139: from IPython.core.display import Image Image(filename='screencaps/merge.png') # From Python for Data Analysis (Wes McKinney), page 139: rnames = ['user_id', 'movie_id', 'rating', 'timestamp'] ratings = pd.read_table(os.path.join(base_dir, 'ratings.dat'), sep='::', header=None, names=rnames) mnames = ['movie_id', 'title', 'genres'] movies = pd.read_table(os.path.join(base_dir, 'movies.dat'), sep='::', header=None, names=mnames) ratings ratings.head() # or .tail() movies[:5] ratings data = pd.merge(pd.merge(ratings, users), movies) data data.ix[0] # change the 0 to another number to see another record... is this someone rating for their kid?! data['rating'].hist(bins=5) # the ratings are whole numbers, it seems mean_ratings = data.pivot_table('rating', rows='title', cols='gender', aggfunc='mean') mean_ratings[:5] users.groupby('gender').count() # users.groupby('gender')['age'].count() from IPython.core.display import Image Image(filename='screencaps/group_by.png') # From Python for Data Analysis (Wes McKinney): ratings_by_genre = data.groupby('genres') ratings_by_genre.mean().sort_index(by='rating',ascending=True)['rating'].plot(kind="barh") # try changing the ascending argument to False; and then try just first 10... # ratings_by_genre.mean().sort_index(by='rating',ascending=True)['age'].plot(kind="barh") # why does this look this way? #ratings_by_genre.mean().sort_index(by='rating',ascending=True)[:10]['rating'].plot(kind="barh") pd.crosstab(data.genres, data.gender, margins=True, colnames=['gender'])[:10] pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=len)[:10] #pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=mean)[:10] #pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=mean).plot() #pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=len).unstack()[:10] ratings_by_title = data.groupby('title').size() ratings_by_title[:10] def top(df, n=5, column='rating'): return df.sort_index(by=column)[-n:] top(ratings, n=6) data.groupby('gender').apply(top) active_titles = ratings_by_title.index[ratings_by_title >= 250] active_titles[:15] # select only the ones with active titles mean_ratings = mean_ratings.ix[active_titles] mean_ratings ratings_by_title[:12] top_female_ratings = mean_ratings.sort_index(by='F', ascending=False) top_female_ratings[:12] pd.set_printoptions(max_columns=20) # needed because otherwise the male ratings won't print, due to long title top_male_ratings = mean_ratings.sort_index(by='M', ascending=False) top_male_ratings[:10] #.plot(kind='barh') # add a column mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F'] pd.scatter_matrix(mean_ratings,alpha=0.2, figsize=(8, 8)) # ,diagonal='kde' mean_ratings['F'].median() #mean_ratings['M'].median() mean_ratings.describe() # or mean_ratings['M'].describe() etc sorted_by_diff = mean_ratings.sort_index(by='diff') # default is ascending=True sorted_by_diff[:15] sorted_by_diff[:20].plot(kind='barh') sorted_by_diff['diff'][:20].plot(kind='barh') # fix the orientation by using a numpy absolute value function np.abs(sorted_by_diff['diff'][:20]).plot(kind='barh') sorted_by_diff[::-1][:15] # the ones that men liked and women didn't - [::-1] reverses a list in python! sorted_by_diff[::-1][:20].plot(kind='barh') # a dumb plot first... but shows some movies have low ratings overall - look at Barb Wire. # let's add a simple avg of the 2 ratings as a new column mean_ratings['avg'] = (mean_ratings['M'] + mean_ratings['F']) / 2 mean_ratings[:5] # you should see the new col, avg sorted_by_diff = mean_ratings.sort_index(by='diff', ascending=False) # meaning, the biggest diff is the TOP # you could make a sorted_by_avg too. try it? sorted_by_diff[:10] plt.scatter(sorted_by_diff['diff'], sorted_by_diff['avg'],alpha=.6) xlabel("Diff") ylabel("Avg") # Remember how to look at a command's options in the notebook: (Click the bar to close the section again.) plt.scatter? sorted_by_diff.describe() sorted_by_diff[sorted_by_diff['diff']< -.83] sorted_by_diff # Since the data frame is indexed by the movie names, we can do this: sorted_by_diff.index[0] # since it was already sorted, we can look at the first item's name this way. # sorted_by_diff.index # try this too... # sorted_by_diff.ix[0] # gives the whole record at that index. rating_std_by_title = data.groupby('title')['rating'].std() rating_std_by_title = rating_std_by_title.ix[active_titles] rating_std_by_title.order(ascending=False)[:10] mean_ratings_noidx = mean_ratings.reset_index() import re year_re = r'.* \(([\d]+)\)' year = mean_ratings_noidx.title.str.match(year_re).str[0] movie_decade = pd.Series(year.astype(int).values // 10 * 10, index=mean_ratings_noidx.title) movie_decade from IPython.core.display import Image Image(filename='screencaps/vec_string.png') # From Python for Data Analysis (Wes McKinney), chapter 7: data_sub = data[data.title.isin(active_titles)] #isin is cool! data_sub by_decade = data_sub.groupby(data_sub.title.map(movie_decade)) by_decade.size() by_decade.size().plot(kind='bar') decade = data_sub.title.map(movie_decade) decade.name = 'decade' by_decade_gender = data_sub.groupby([decade, 'gender']) by_decade_gender.rating.mean() by_decade_gender.rating.mean().unstack() # make them columns by_decade_gender.rating.mean().unstack mrat_decade = by_decade_gender.rating.mean() #mrat_decade.plot(kind='bar') # uncommment this line, and see what it looks like if you forget to unstack. mrat_decade.unstack().plot(kind='bar') by_decade_gender.rating.mean().to_csv("my file2.csv") ls