#!/usr/bin/env python
# coding: utf-8

# - 이예영

# ## Ch. 2 Introductory Examples

# ### 1. usa.gov data from bit.ly

# - 웹에 저장되어 있는 txt 파일을 url로 접근해 가져오기 위한 urllib 모듈 활용

# In[2]:


# Raw text file in the pydata-book repository that this section downloads.
pathUrl2 = 'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'


# In[3]:


import urllib
from contextlib import closing

# urlopen lives in ``urllib`` on Python 2 and in ``urllib.request`` on
# Python 3; resolve it once so this cell runs on either interpreter.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen


# In[4]:


# closing() releases the connection even if reading fails; the Python 2
# response object cannot be used in a ``with`` statement on its own.
with closing(urlopen(pathUrl2)) as response:

    # In[5]:

    # One list element per line of the downloaded file.
    responseLines = response.readlines()


# In[6]:


responseLines[0]


# In[8]:


import json


# In[11]:


# Parse every downloaded line as its own JSON document.
records = [json.loads(line) for line in responseLines]


# In[12]:


records[0]


# In[13]:


records[0]['tz']


# In[14]:


# Function-call form: prints the same text on Python 2 (single argument)
# and, unlike the print statement, is valid syntax on Python 3.
print(records[0]['tz'])


# 1) 순수 파이썬으로 표준시간대 세어보기 (Time zone 카운팅)

# In[15]:


# Collect the 'tz' value of every record that has one.  An unguarded
# ``rec['tz']`` lookup raises KeyError as soon as a record lacks the key,
# which would abort the script, so only the guarded form is kept.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]


# In[17]:


time_zones[:10]


# - Count with the standard library's collections.Counter class

# In[23]:


from collections import Counter


# In[24]:


counts = Counter(time_zones)


# In[26]:


# The ten most frequent time zones with their counts.
counts.most_common(10)


# 
# 2) pandas로 표준시간대 세어보기

# In[27]:


import numpy as np
import pandas as pd
from pandas import DataFrame, Series


# In[29]:


pd.__version__


# In[30]:


# One row per parsed record, one column per key seen in the records.
frame = pd.DataFrame(records)


# In[31]:


frame


# In[32]:


frame['tz'].head(10)


# In[33]:


# Occurrences of each distinct time zone value.
tz_counts = frame['tz'].value_counts()


# In[34]:


tz_counts.head(10)


# - If plots do not show up in the IPython notebook page, run the magic below

# In[35]:


get_ipython().run_line_magic('matplotlib', 'inline')


# In[36]:


# Horizontal bar chart of the first ten entries of tz_counts.
tz_counts.head(10).plot(kind='barh', rot=10)


# - url 축약하는데 다음과같은 정보가 담은 필드 존재

# In[37]:


# Look at a few raw values of the 'a' column.
frame.a[1]


# In[38]:


frame.a[50]


# In[39]:


frame.a[51]


# In[40]:


# Keep the first whitespace-separated token of every non-null 'a' value.
result = Series([agent.split()[0] for agent in frame['a'].dropna()])


# In[41]:


result.head(5)


# In[42]:


# The eight most common leading tokens.
result.value_counts().head(8)


# In[43]:


# Rows whose 'a' value is present.
cframe = frame[frame['a'].notnull()]


# In[44]:


# Label each row by whether its 'a' string contains 'Windows'.
operating_system = np.where(
    cframe['a'].str.contains('Windows'),
    'Windows',
    'Not Windows',
)


# In[45]:


operating_system[:5]


# In[46]:


# Group by time zone together with the Windows / Not Windows label.
by_tz_os = cframe.groupby(by=['tz', operating_system])


# - Per-group totals are computed with size()

# In[47]:


by_tz_os.size()


# In[48]:


# Reshape to one row per time zone and one column per label;
# combinations that never occur become 0.
agg_counts = by_tz_os.size().unstack().fillna(0)


# In[49]:


agg_counts.head(10)


# In[50]:


# Positions that would sort the rows by their total count, ascending.
indexer = agg_counts.sum(axis=1).argsort()


# In[51]:


indexer.head(10)


# In[52]:


# take() picks rows in the sorted order; the last ten rows are then kept.
count_subset = agg_counts.take(indexer).tail(10)


# In[53]:


count_subset


# In[55]:


count_subset.plot(kind='barh', stacked=True)


# In[58]:


# Divide each row by its own total so every row sums to 1.
nored_subset = count_subset.div(count_subset.sum(axis=1), axis=0)


# In[59]:


nored_subset.plot(kind='barh', stacked=True)


# ## 2. MovieLens의 영화평점 데이터

# - 웹의 url을 사용하여 파일읽기

# In[60]:


# Raw users.dat file in the pydata-book repository.
pathUrl3 = 'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/movielens/users.dat'


# In[69]:


import urllib
from contextlib import closing

# urlopen lives in ``urllib`` on Python 2 and in ``urllib.request`` on
# Python 3; resolve it once so this cell runs on either interpreter.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen


# In[70]:


# closing() releases the connection even if reading fails; the Python 2
# response object cannot be used in a ``with`` statement on its own.
with closing(urlopen(pathUrl3)) as response:

    # In[71]:

    # Read the file line by line into a list.
    responseLines = response.readlines()


# In[72]:


responseLines


# 1) pandas.read_table 이용해 DataFrame 불러오기

# In[73]:


import pandas as pd

# Common prefix of the three MovieLens files read below (raw GitHub URLs).
MOVIELENS_URL = 'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/movielens/'


# In[74]:


unames = ['user_id', 'gender', 'age', 'occupation', 'zip']


# - Copy the *raw* URL of the file from GitHub and use it as follows

# In[76]:


# '::' is a multi-character separator, which the default C parser does not
# support; naming the python engine explicitly avoids the fallback warning.
users = pd.read_table(MOVIELENS_URL + 'users.dat',
                      sep='::', header=None, names=unames, engine='python')


# In[77]:


rnames = ['user_id', 'movie_id', 'rating', 'timestamp']


# In[78]:


ratings = pd.read_table(MOVIELENS_URL + 'ratings.dat',
                        sep='::', header=None, names=rnames, engine='python')


# In[79]:


mnames = ['movie_id', 'title', 'genres']


# In[81]:


movies = pd.read_table(MOVIELENS_URL + 'movies.dat',
                       sep='::', header=None, names=mnames, engine='python')


# In[82]:


users[:]


# In[83]:


ratings[:]


# In[84]:


movies[:]


# In[85]:


type(movies)


# 2) pandas의 merge()로 병합하기

# In[87]:


# Combine the three tables into one.  No keys are given, so each merge joins
# on the column names the two frames have in common.
data = pd.merge(pd.merge(ratings, users), movies)


# In[88]:


data[:10]


# In[90]:


# First row of the merged table.  .iloc selects by position explicitly;
# the .ix indexer used before is deprecated and absent from newer pandas.
data.iloc[0]


# 3) 성별에 따른 영화의 평균 평점 구하기 (pivot_table 사용)

# In[91]:


# Mean rating per title, with one column per gender value.
# (The book's rows= / cols= keywords are spelled index= / columns= here;
# aggfunc decides how the values of each cell are combined.)
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')


# In[92]:


mean_ratings[:5]


# 4) Keep only the movies that have at least 250 ratings (using size())

# In[93]:


# Number of rating rows per title.
ratings_by_title = data.groupby('title').size()


# In[94]:


ratings_by_title[:10]


# - The index of titles with at least 250 ratings is used to select rows
#   from mean_ratings

# In[95]:


active_titles = ratings_by_title.index[ratings_by_title >= 250]


# In[96]:


active_titles


# In[97]:


# Label-based row selection.  .loc replaces the deprecated .ix indexer,
# which is absent from newer pandas releases.
mean_ratings = mean_ratings.loc[active_titles]


# In[98]:


mean_ratings[:10]


# - Movies rated highest by female viewers

# In[99]:


# sort_values(by=...) is the supported way to sort rows by a column;
# sort_index(by=...) is deprecated.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)


# In[100]:


top_female_ratings[:]


# 5) 평점 차이 구하기

# In[102]:


# Store the male mean minus the female mean in a new 'diff' column.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']


# In[103]:


# Ascending sort on 'diff': titles with the most negative M - F come first.
# sort_values(by=...) replaces the deprecated sort_index(by=...).
sorted_by_diff = mean_ratings.sort_values(by='diff')


# In[104]:


sorted_by_diff[:10]


# In[106]:


# Reverse the row order and take ten rows: the largest M - F values.
sorted_by_diff[::-1][:10]


# - 호불호가 극명하게 나뉘는 영화 찾기

# In[107]:


# Standard deviation of the ratings, per title
rating_std_by_title = data.groupby('title')['rating'].std()


# In[108]:


# Keep only the titles in active_titles.  .loc replaces the deprecated .ix.
rating_std_by_title = rating_std_by_title.loc[active_titles]


# In[109]:


# Ten largest standard deviations.  Series.sort_values replaces the
# deprecated Series.order.
rating_std_by_title.sort_values(ascending=False)[:10]


# In[111]:


# Ten smallest standard deviations.
rating_std_by_title.sort_values(ascending=True)[:10]


# ## 3.신생아 이름

# In[112]:


import pandas as pd


# In[114]:


# The file is read as comma-separated text; the three column names are
# supplied here rather than taken from the file.
names1880 = pd.read_csv(
    'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/names/yob1880.txt',
    names=['name', 'sex', 'births'],
)


# In[115]:


names1880[:]


# In[116]:


# Sum of births per sex value (total births recorded in this file).
names1880.groupby('sex')['births'].sum()
#성별에 따라 그룹화하여 birth값을 구함 (해당연도의 전체 출생수)


# 1) 연도별로 나누어진 데이터를 DataFrame으로 취함 (pandas.concat 사용)

# In[117]:


# Column names shared by every per-year file.
columns = ['name', 'sex', 'births']

# Per-year frames are collected here before being concatenated.
pieces = []

# range() excludes its end point, so 2010 is the last year read.
years = range(1880, 2011)


# In[120]:


for year in years:
    path = 'https://raw.githubusercontent.com/pydata/pydata-book/master/ch02/names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)

    # Tag every row with the year of the file it came from.
    frame['year'] = year
    pieces.append(frame)


# In[121]:


# Stack all per-year frames into one DataFrame.  The row labels assigned
# by read_csv are not needed, so a fresh index is generated instead.
names = pd.concat(pieces, ignore_index=True)


# In[122]:


names


# - With this table, groupby and pivot_table can aggregate by year or sex

# In[125]:


# Sum of births for each (year, sex) combination: years as rows, sexes as columns.
total_births = names.pivot_table(
    'births',
    index='year',
    columns='sex',
    aggfunc=sum,
)


# In[126]:


total_births.tail(5)


# In[127]:


total_births.plot(title='Total births by sex and year')


# 2) prop열을 추가해서 전체 출생수에서 차지하는 비율 계산

# In[131]:


#함수 정의
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` holds each row's ``births`` divided by the sum of ``births``
    over the whole group, i.e. the row's share of the group total.

    Parameters
    ----------
    group : DataFrame
        Must contain a numeric ``births`` column.

    Returns
    -------
    DataFrame
        A new frame with the same rows plus ``prop``; ``group`` itself is
        left unmodified.
    """
    # Cast to float first so the division cannot be an integer (floor) division.
    births = group['births'].astype(float)

    # Assign on a copy: the object handed over by groupby.apply should not
    # be mutated, and callers only rely on the returned frame.
    result = group.copy()
    result['prop'] = births / births.sum()
    return result


# In[133]:


import numpy as np

# Run add_prop once per (year, sex) group; the result carries the new column.
names = names.groupby(by=['year', 'sex']).apply(add_prop)


# In[134]:


names


# 3) Sanity check: the prop column should sum to 1 within every group
#    (np.allclose tolerates floating-point rounding)

# In[136]:


np.allclose(names.groupby(by=['year', 'sex'])['prop'].sum(), 1)


# 4) 연도별, 성별에 따른 빈도수가 가장 높은 이름 100개 추출 (그룹연산 사용)

# In[138]:


#함수정의
def get_top100(group):
    """Return the rows of ``group`` with the largest ``births`` values.

    Parameters
    ----------
    group : DataFrame
        Must contain a ``births`` column.

    Returns
    -------
    DataFrame
        At most 100 rows, ordered by ``births`` descending.  Groups with
        fewer than 100 rows are returned whole (sorted).
    """
    # sort_values(by=...) is the supported way to sort rows by a column;
    # sort_index(by=...) is deprecated and rejected by newer pandas.
    return group.sort_values(by='births', ascending=False)[:100]
    
# Group by year and sex so get_top100 runs once per (year, sex) pair.
grouped = names.groupby(by=['year', 'sex'])


# In[139]:


top100 = grouped.apply(get_top100)


# In[140]:


top100


# ## 4.이름유행 분석

# 1) top100 데이터를 남자와 여자로 분리

# In[141]:


# Rows of top100 whose sex is 'M'.
boys = top100[top100.sex == 'M']


# In[142]:


# Rows of top100 whose sex is 'F'.
girls = top100[top100.sex == 'F']


# - Pivot table of total births by year and name

# In[144]:


# NOTE: this rebinds total_births, which earlier held births by year and sex.
total_births = top100.pivot_table('births', index='year', columns='name', aggfunc=sum)


# In[145]:


total_births[:10]


# In[146]:


# Four names to plot, one column each.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]


# In[147]:


get_ipython().run_line_magic('matplotlib', 'inline')
# One subplot per name.  The title previously read 'Nuber' (typo).
subset.plot(subplots=True, figsize=(12,10), grid=False, title='Number of births per year')


# 2) 다양한 이름을 사용하는 경향 파악하기

# 위의 표를 통해서 부모가 아이 이름을 지을 때 흔한 이름은 기피하는것으로 해석할 수 있다.

# In[148]:


# Sum of prop over the rows of top100, per year (rows) and sex (columns).
table = top100.pivot_table('prop', index='year', columns='sex', aggfunc=sum)


# In[149]:


table[:10]


# In[152]:


# The title names the frame that was actually aggregated (top100); it
# previously referred to a 'table1000' that does not exist in this script.
table.plot(title='Sum of top100.prop by year and sex', yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))


# - 위 그래프를 통해 실제로 이름의 다양성이 높아지고 있음을 보인다. (비율의 총합이 시간이 흐를수록 감소하고 있음을 보인다.)

# In[153]:


# Boys' rows for the year 2010.
df = boys[boys.year == 2010]


# In[154]:


df


# 3) How many names does it take to reach 50% of all births? (NumPy)

# In[156]:


# Running total of prop with the largest shares first; the position where it
# crosses a threshold is looked up below.  sort_values(by=...) replaces the
# deprecated sort_index(by=...).
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()


# In[157]:


prop_cumsum


# In[158]:


type(prop_cumsum)


# In[159]:


prop_cumsum.searchsorted(0.03)


# In[160]:


# Searching the underlying ndarray returns a plain integer position for a
# scalar query.  Series.searchsorted returns an array in some pandas versions
# and a scalar in others, so indexing its result with [0] is not portable.
prop_cumsum.values.searchsorted(0.03)


# In[161]:


# Positions are zero-based, so add 1 to turn the position into a count.
# NOTE(review): if the cumulative sum never reaches 0.5 (possible when only
# the top 100 names are kept), searchsorted returns len(prop_cumsum) -- confirm
# the result is meaningful for this data.
prop_cumsum.values.searchsorted(0.5) + 1


# In[162]:


# Same computation for the year 1900.
df = boys[boys.year == 1900]


# In[164]:


in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
in1900.values.searchsorted(0.5) + 1


# - 각 연도와 성별 조합에 적용할 수 있다. 
# 연도와 성별을 Groupby로 묶고 각 그룹에 apply를 사용하여 연산을 적용한다.

# In[166]:


#함수 선언
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    The rows are ordered by ``prop`` descending and accumulated; the result
    is the 1-based position of the first running total that is >= ``q``.

    Parameters
    ----------
    group : DataFrame
        Must contain a numeric ``prop`` column.
    q : float, optional
        Cumulative share to reach (default 0.5).

    Returns
    -------
    int
        Number of rows needed.  If the running total never reaches ``q``
        the result is ``len(group) + 1``.
    """
    # sort_values(by=...) replaces the deprecated sort_index(by=...).
    ranked = group.sort_values(by='prop', ascending=False)
    cumulative = ranked['prop'].cumsum()
    # Search the underlying ndarray: for a scalar query it returns a single
    # integer position, whereas Series.searchsorted returns an array in some
    # pandas versions and a scalar in others (so ``[0]`` is not portable).
    # Positions are zero-based, hence the + 1.
    return int(cumulative.values.searchsorted(q)) + 1

# Apply get_quantile_count to every (year, sex) group, then move the 'sex'
# level into the columns so that each sex becomes its own series over years.
diversity = (
    top100.groupby(by=['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)


# - The resulting diversity DataFrame holds two time series indexed by year,
#   one per sex.

# In[167]:


diversity.head(5)  # same result as diversity[:5]


# In[168]:


diversity.plot(title = "Number of popular names in top 50%")
# NOTE(review): the original remark here said the searchsorted result is an
# array position rather than an int -- confirm the dtype if plotting fails.


# ## 5. '마지막 글자'의 변환

# 1) 연도와 성별, 이름의 마지막 글자를 수집해서 확인

# In[169]:


def get_last_letter(x):
    """Return the final element of ``x`` (the last character of a name)."""
    return x[-1]


# Extract the last letter of every value in the name column.
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'

# Sum of births with last letters as rows and (sex, year) pairs as columns.
table = names.pivot_table(
    'births',
    index=last_letters,
    columns=['sex', 'year'],
    aggfunc=sum,
)


# In[170]:


# Keep only three years from the 'year' level of the columns.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head(5)  # same effect as subtable[:5]


# 2) 전체 출생수에서 성별로 각각의 마지막 글자가 차지하는 비율 계산
# 

# In[171]:


# Column totals of subtable.
subtable.sum()


# In[172]:


# Divide every column by its own total, turning counts into proportions.
letter_prop = subtable / subtable.sum().astype(float)


# In[173]:


import matplotlib.pyplot as plt

# Two stacked axes: the 'M' columns on top, the 'F' columns below.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)


# 그래프에서 보듯이 남자는 'n'으로 끝나는 이름이 1960이후 증가

# 3) 성별로 정규화, 남자아이 이름에서 몇 글자를 선택하여 이름을 열로 하는 시계열 데이터로 변환

# In[174]:


# Divide every column of the full table by its own total.
letter_prop = table / table.sum().astype(float)
# Rows 'd', 'n' and 'y' of the 'M' columns, transposed so that each letter
# becomes a column.  .loc replaces the deprecated .ix indexer for this
# label-based selection.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()


# In[175]:


dny_ts.plot()


# ## 6. 남자 이름과 여자 이름이 바뀐 경우

# 1) Lesley 또는 Leslie 라는 이름이 그러한 경우, top100을 이용하여 'lesl'을 포함하는 이름의 리스트 생성

# In[178]:


# Distinct names that occur in top100.
all_names = top100['name'].unique()
# True for every name that contains 'lesl', ignoring case.
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])


# In[179]:


lesley_like = all_names[mask]
lesley_like


# 2) Keep only those names, total the births per name, then look at the
#    relative frequency per sex

# In[180]:


filtered = top100[top100['name'].isin(lesley_like)]


# In[181]:


filtered.groupby('name')['births'].sum()


# In[182]:


# Sum of births per year (rows) and sex (columns) for the selected names.
table = filtered.pivot_table(
    'births',
    index='year',
    columns='sex',
    aggfunc=sum,
)


# In[183]:


# Divide each row by its own total so the sexes sum to 1 per year.
table = table.div(table.sum(axis=1), axis=0)


# In[184]:


table.tail(5)


# In[186]:


# 'M' is drawn as a solid black line, 'F' as a dashed black line.
table.plot(style={'M': 'k-', 'F': 'k--'})


# In[ ]: