from IPython.core.display import Image
Image(filename='screencaps/pandas_goals.png')
# From Python for Data Analysis (Wes McKinney), page 111:

import numpy as np
import pandas as pd
import os
pd.set_printoptions(notebook_repr_html=True) # your choice- will format prettily for html in nb.

# Defining a trivial data frame, by hand, from an example by @fonnesbeck

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6,
                'B' : ['A', 'B', 'C'] * 8,
                'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
                'D' : np.random.randn(24),
                'E' : np.random.randn(24),
                'F' : pd.DateRange(start='4/1/2012', periods=24)})

df

base_dir = 'moviedata'

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(os.path.join(base_dir, 'users.dat'), sep='::', 
                      header=None, names=unames)

users

from IPython.core.display import Image
Image(filename='screencaps/read_csv.png')
# From Python for Data Analysis (Wes McKinney), chapter 6:

users[:12]

users['age']#[:10]
# the [:10] is a slice - a short primer on slice syntax in python: http://stackoverflow.com/questions/509211/good-primer-for-python-slice-notation

users.get_value(5,'age')  #5th item, column age

users['age']<10
#users[users['age'] < 10]

users['age'].values
#list(users['age'].values) # to convert to a simple list again.

users.describe()

users[users['gender']=='F']['age'].mean() # try changing it to 'M' too?

users.boxplot(column="age", by="gender")

users[users['gender']=='F']['age'].hist()
# do this with =='M' instead to see the subtle contrast. 
# If you want, insert a new cell below this one and do it there.

users.groupby('gender').count()
# users.groupby('gender')['age'].count()

from IPython.core.display import Image
Image(filename='screencaps/desc_stats.png')
# From Python for Data Analysis (Wes McKinney), page 139:

from IPython.core.display import Image
Image(filename='screencaps/merge.png')
# From Python for Data Analysis (Wes McKinney), page 139:

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(os.path.join(base_dir, 'ratings.dat'), sep='::', 
                         header=None, names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(os.path.join(base_dir, 'movies.dat'), sep='::', 
                       header=None, names=mnames)

ratings

ratings.head() # or .tail()

movies[:5]


ratings

data = pd.merge(pd.merge(ratings, users), movies)

data

data.ix[0]
# change the 0 to another number to see another record... is this someone rating for their kid?!

data['rating'].hist(bins=5) # the ratings are whole numbers, it seems

mean_ratings = data.pivot_table('rating', rows='title',
                                cols='gender', aggfunc='mean')

mean_ratings[:5]

users.groupby('gender').count()
# users.groupby('gender')['age'].count()

from IPython.core.display import Image
Image(filename='screencaps/group_by.png')
# From Python for Data Analysis (Wes McKinney):

ratings_by_genre = data.groupby('genres')

ratings_by_genre.mean().sort_index(by='rating',ascending=True)['rating'].plot(kind="barh")
# try changing the ascending argument to False; and then try just first 10...
# ratings_by_genre.mean().sort_index(by='rating',ascending=True)['age'].plot(kind="barh") # why does this look this way?
#ratings_by_genre.mean().sort_index(by='rating',ascending=True)[:10]['rating'].plot(kind="barh")

pd.crosstab(data.genres, data.gender, margins=True, colnames=['gender'])[:10]

pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=len)[:10]
#pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=mean)[:10]
#pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=mean).plot()
#pd.pivot_table(data, values=['rating'], rows=['genres','gender'], aggfunc=len).unstack()[:10]

ratings_by_title = data.groupby('title').size()
ratings_by_title[:10]

def top(df, n=5, column='rating'):
    return df.sort_index(by=column)[-n:]

top(ratings, n=6)

data.groupby('gender').apply(top)

active_titles = ratings_by_title.index[ratings_by_title >= 250]

active_titles[:15]

# select only the ones with active titles
mean_ratings = mean_ratings.ix[active_titles]

mean_ratings

ratings_by_title[:12]

top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)

top_female_ratings[:12]

pd.set_printoptions(max_columns=20)
# needed because otherwise the male ratings won't print, due to long title

top_male_ratings = mean_ratings.sort_index(by='M', ascending=False)

top_male_ratings[:10] #.plot(kind='barh')

# add a column
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

pd.scatter_matrix(mean_ratings,alpha=0.2, figsize=(8, 8)) # ,diagonal='kde'

mean_ratings['F'].median()
#mean_ratings['M'].median()

mean_ratings.describe()
# or mean_ratings['M'].describe() etc

sorted_by_diff = mean_ratings.sort_index(by='diff')  # default is ascending=True

sorted_by_diff[:15]

sorted_by_diff[:20].plot(kind='barh')

sorted_by_diff['diff'][:20].plot(kind='barh')

# fix the orientation by using a numpy absolute value function
np.abs(sorted_by_diff['diff'][:20]).plot(kind='barh')

sorted_by_diff[::-1][:15]  
# the ones that men liked and women didn't - [::-1] reverses a list in python!

sorted_by_diff[::-1][:20].plot(kind='barh')
# a dumb plot first... but shows some movies have low ratings overall - look at Barb Wire.

# let's add a simple avg of the 2 ratings as a new column
mean_ratings['avg'] = (mean_ratings['M'] + mean_ratings['F']) / 2
                                               

mean_ratings[:5]  # you should see the new col, avg

sorted_by_diff = mean_ratings.sort_index(by='diff', ascending=False)  # meaning, the biggest diff is the TOP
# you could make a sorted_by_avg too.  try it?

sorted_by_diff[:10]

plt.scatter(sorted_by_diff['diff'], sorted_by_diff['avg'],alpha=.6)
xlabel("Diff")
ylabel("Avg")

# Remember how to look at a command's options in the notebook: (Click the bar to close the section again.)
plt.scatter?

sorted_by_diff.describe()

sorted_by_diff[sorted_by_diff['diff']< -.83]

sorted_by_diff

# Since the data frame is indexed by the movie names, we can do this:
sorted_by_diff.index[0]
# since it was already sorted, we can look at the first item's name this way.
# sorted_by_diff.index # try this too... 
# sorted_by_diff.ix[0] # gives the whole record at that index.

rating_std_by_title = data.groupby('title')['rating'].std()

rating_std_by_title = rating_std_by_title.ix[active_titles]

rating_std_by_title.order(ascending=False)[:10]

mean_ratings_noidx = mean_ratings.reset_index()

import re
year_re = r'.* \(([\d]+)\)'

year = mean_ratings_noidx.title.str.match(year_re).str[0]

movie_decade = pd.Series(year.astype(int).values // 10 * 10, 
                         index=mean_ratings_noidx.title)
movie_decade

from IPython.core.display import Image
Image(filename='screencaps/vec_string.png')
# From Python for Data Analysis (Wes McKinney), chapter 7:

data_sub = data[data.title.isin(active_titles)]  #isin is cool!
data_sub

by_decade = data_sub.groupby(data_sub.title.map(movie_decade))
by_decade.size()

by_decade.size().plot(kind='bar')

decade = data_sub.title.map(movie_decade)
decade.name = 'decade'
by_decade_gender = data_sub.groupby([decade, 'gender'])
by_decade_gender.rating.mean()

by_decade_gender.rating.mean().unstack()
# make them columns

by_decade_gender.rating.mean().unstack

mrat_decade = by_decade_gender.rating.mean()
#mrat_decade.plot(kind='bar')  # uncommment this line, and see what it looks like if you forget to unstack.
mrat_decade.unstack().plot(kind='bar')

by_decade_gender.rating.mean().to_csv("my file2.csv")

ls