#!/usr/bin/env python
# coding: utf-8

# # US Baby Names 1880-2010
# 
# United States Social Security Administration(SSA, 미국 사회보장국)에서 1880년부터 지금까지의 출생자 이름 빈도 데이터를 제공하고 있다. 
# 
# 데이터를 제공하는 URL은 http://www.ssa.gov/oact/babynames/limits.html 이다. names.zip 파일을 내려받아 압축을 풀면 다음과 같은 파일 목록을 볼 수 있다.
# 
# ```
# $ ls
# NationalReadMe.pdf yob1912.txt        yob1945.txt        yob1978.txt
# yob1880.txt        yob1913.txt        yob1946.txt        yob1979.txt
# yob1881.txt        yob1914.txt        yob1947.txt        yob1980.txt
# yob1882.txt        yob1915.txt        yob1948.txt        yob1981.txt
# yob1883.txt        yob1916.txt        yob1949.txt        yob1982.txt
# yob1884.txt        yob1917.txt        yob1950.txt        yob1983.txt
# yob1885.txt        yob1918.txt        yob1951.txt        yob1984.txt
# yob1886.txt        yob1919.txt        yob1952.txt        yob1985.txt
# yob1887.txt        yob1920.txt        yob1953.txt        yob1986.txt
# yob1888.txt        yob1921.txt        yob1954.txt        yob1987.txt
# yob1889.txt        yob1922.txt        yob1955.txt        yob1988.txt
# yob1890.txt        yob1923.txt        yob1956.txt        yob1989.txt
# yob1891.txt        yob1924.txt        yob1957.txt        yob1990.txt
# yob1892.txt        yob1925.txt        yob1958.txt        yob1991.txt
# yob1893.txt        yob1926.txt        yob1959.txt        yob1992.txt
# yob1894.txt        yob1927.txt        yob1960.txt        yob1993.txt
# yob1895.txt        yob1928.txt        yob1961.txt        yob1994.txt
# yob1896.txt        yob1929.txt        yob1962.txt        yob1995.txt
# yob1897.txt        yob1930.txt        yob1963.txt        yob1996.txt
# yob1898.txt        yob1931.txt        yob1964.txt        yob1997.txt
# yob1899.txt        yob1932.txt        yob1965.txt        yob1998.txt
# yob1900.txt        yob1933.txt        yob1966.txt        yob1999.txt
# yob1901.txt        yob1934.txt        yob1967.txt        yob2000.txt
# yob1902.txt        yob1935.txt        yob1968.txt        yob2001.txt
# yob1903.txt        yob1936.txt        yob1969.txt        yob2002.txt
# yob1904.txt        yob1937.txt        yob1970.txt        yob2003.txt
# yob1905.txt        yob1938.txt        yob1971.txt        yob2004.txt
# yob1906.txt        yob1939.txt        yob1972.txt        yob2005.txt
# yob1907.txt        yob1940.txt        yob1973.txt        yob2006.txt
# yob1908.txt        yob1941.txt        yob1974.txt        yob2007.txt
# yob1909.txt        yob1942.txt        yob1975.txt        yob2008.txt
# yob1910.txt        yob1943.txt        yob1976.txt        yob2009.txt
# yob1911.txt        yob1944.txt        yob1977.txt        yob2010.txt
# 
# $ head yob1880.txt
# Mary,F,7065
# Anna,F,2604
# Emma,F,2003
# Elizabeth,F,1939
# Minnie,F,1746
# Margaret,F,1578
# Ida,F,1472
# Alice,F,1414
# Bertha,F,1320
# Sarah,F,1288
# ```
# 
# 이 데이터로 어떤 분석들이 가능할까?
# 
# * 특정 이름의 빈도 변화를 그래프로 표시
# * 각 이름의 상대적인 등수 결정
# * 매년 가장 인기있던 이름과 가장 큰 증감률을 보인 이름
# * 이름 유행 분석 등
# 
# 그럼 지금부터 pandas로 이 데이터분석을 시작해보자.

# ## 데이터 로드

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd


# In[2]:


# Read the 1880 file; the column labels are supplied explicitly via `names`.
names1880 = pd.read_csv(
    '/Users/yong27/study/pydata/pydata-book/ch02/names/yob1880.txt',
    names=['name', 'sex', 'births'],
)


# In[3]:


names1880


# 1880년생 이름들이 잘 로드되었다 (상위 2000개 이름들만 제공함). 성별별로 몇명이나 있는지 계산해보자.

# In[4]:


# Total number of births per sex in the 1880 data.
names1880.groupby('sex')['births'].sum()


# 1880년생뿐 아니라 모든 연도 출생자들을 다 로드해보자.

# In[5]:


# Read every yearly file (1880 through 2010), tag each frame with its year,
# and stack the frames into one DataFrame with a fresh 0..n-1 index.
columns = ['name', 'sex', 'births']
years = range(1880, 2011)
pieces = [
    pd.read_csv(
        '/Users/yong27/study/pydata/pydata-book/ch02/names/yob{}.txt'.format(year),
        names=columns,
    ).assign(year=year)
    for year in years
]

names = pd.concat(pieces, ignore_index=True)


# In[6]:


names


# 모든 연도의 데이터가 다 로드되어 **names**라는 하나의 Dataframe에 저장되었다. 총 1,690,784개의 레코드가 있다. 

# In[8]:


# Sum births into a table with one row per year and one column per sex.
total_births = names.pivot_table(
    values='births',
    index='year',
    columns='sex',
    aggfunc='sum',
)


# In[9]:


total_births


# DataFrame.pivot_table 메서드로 births를 year(행)와 sex(열) 기준으로 합산했다. 이 데이터를 가지고, 매년 남녀 출생수(births)를 차트로 표시해보자.

# In[10]:


# Chart the yearly birth totals held in total_births (one series per column).
total_births.plot(title='Total births by sex and year')


# 매년 남녀 출생수 현황을 한눈에 알 수 있다. 1930년 전에는 여자가 많았는데, 그 이후에 남자가 많아지는 점이 이채롭다.
# 
# 이제, 각 레코드에 "prop" 컬럼을 만들고, 해당 이름의 빈도를 추가해보자. 

# In[11]:


def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column appended.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group.

    The frame passed in is not modified: the result is built with
    ``DataFrame.assign`` rather than by writing a column into the object
    that ``groupby.apply`` hands to this function.
    """
    return group.assign(prop=group.births / group.births.sum())

# Share of each name within its (year, sex) group.
# A grouped transform is used instead of ``groupby(...).apply(add_prop)``:
# the index of a frame-returning ``apply`` differs between pandas versions
# (newer releases prepend the group keys), which leaves 'year' and 'sex' as
# both index levels and columns and breaks the later
# ``groupby(['year', 'sex'])`` calls.  ``transform('sum')`` comes back
# aligned to the original rows, so the index of ``names`` is unchanged.
group_totals = names.groupby(['year', 'sex'])['births'].transform('sum')
names = names.assign(prop=names['births'] / group_totals)


# In[12]:


names


# 해당 연도, 성별을 그룹으로 하고, 그 그룹에서의 이름 빈도가 추가되었다. 이 빈도가 정확하게 계산되었는지 궁금하다. 한번 확인해보자.

# In[13]:


import numpy as np

# Sanity check: the prop values of every (year, sex) group should add up to 1.
np.allclose(names.groupby(['year', 'sex'])['prop'].sum(), 1)


# 오케이. 이제부터 매년, 각 성별별로 특정 이름의 빈도 변화를 살펴보자. 매년, 각 성별별(그룹별)로 데이터수가 다르니, 그룹별 top1000만 쓰기로 하자.

# In[14]:


def get_top1000(group):
    """Return at most 1000 rows of ``group``, ordered by descending ``births``."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:1000]

# Keep the 1000 most frequent names of every (year, sex) group.
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
# apply() stacks the group keys on top of the row index, so 'year' and 'sex'
# would exist both as index levels and as columns; recent pandas rejects that
# ambiguity in the later groupby/pivot_table calls.  The keys are still
# available as columns, so the index is replaced by a plain 0..n-1 range.
top1000 = top1000.reset_index(drop=True)


# In[15]:


top1000


# 이로써 top1000 변수에 그룹별 1000개의 이름이 정리됐다. 
# 
# ## 이름 유행 분석
# 
# 이름 유행 분석을 해보자. 먼저 top1000을 boys, girls로 분리해보자.

# In[17]:


# Split the top-1000 table by the value of its 'sex' column.
boys = top1000.loc[top1000['sex'] == 'M']
girls = top1000.loc[top1000['sex'] == 'F']


# 그리고, top1000을 피벗테이블로 펼쳐보자.

# In[18]:


# Births per year (rows) for every name (columns) in the top-1000 table.
total_births = top1000.pivot_table(
    values='births',
    index='year',
    columns='name',
    aggfunc='sum',
)


# In[19]:


total_births


# 이름이 너무 많으니, 관심있는 이름 John, Harry, Mary, Marilyn만 확인해보자.

# In[20]:


# Select the columns for the four names of interest.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]


# 이 4명을 차트로 표시해보면,

# In[21]:


# One subplot per selected name, all in a single figure.
subset.plot(
    title='Number of births per year',
    subplots=True,
    grid=False,
    figsize=(12, 10),
)


# 각 이름별로 매년 어떻게 변화하는지 한눈에 확인이 가능하다.
# 
# ## 이름 다양성 증가 확인
# 
# 위 차트의 설명 가운데 하나는 부모들이 점점 일반적인 이름을 쓰지 않기 때문일 수 있다. 정말 그러한지 확인해보자. 

# In[22]:


# Sum of prop over the top-1000 rows, per year (rows) and sex (columns).
table = top1000.pivot_table(
    values='prop',
    index='year',
    columns='sex',
    aggfunc='sum',
)


# In[24]:


table


# In[25]:


table.plot(
    title='Sum of table1000.prop by year and sex',
    xticks=range(1880, 2020, 10),
    yticks=np.linspace(0, 1.2, 13),
)


# 그룹별 1000개 레코드의 이름빈도의 합계가 시간이 지날수록 1에서 멀어지는 걸 보아, 다양성이 증가하고 있음을 유추할 수 있다.
# 
# 또 다른 계산방법으로, 이름을 빈도가 높은 순으로 정렬했을 때 누적 빈도가 50%에 도달하기까지 몇 개의 이름이 필요한지 확인할 수 있다. 많을수록 다양하다고 유추할 수 있다. 예를 들어 2010년 출생 남자들은,

# In[26]:


# Rows of the boys table whose year is 2010.
df = boys[boys['year'] == 2010]


# In[27]:


df


# In[28]:


# Running total of prop, going from the largest prop to the smallest.
prop_cumsum = df.sort_values(by='prop', ascending=False)['prop'].cumsum()


# In[30]:


prop_cumsum


# cumsum 메쏘드는 누적합을 의미한다. 위 데이터에서 0.5에 도달할 때의 index는 searchsorted 메쏘드로 확인한다. 

# In[31]:


# Position in the sorted cumulative sums at which 0.5 would be inserted.
prop_cumsum.searchsorted(0.5)


# 파이썬이 0부터 카운팅하므로, 117개의 이름이 50%를 차지함을 알 수 있다. 반면에, 

# In[32]:


# Same computation for the 1900 rows; adding 1 turns the 0-based position
# into a count of names.
df = boys[boys['year'] == 1900]
in1900 = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
1 + in1900.searchsorted(0.5)


# 1900년생은 25개의 이름이 50%를 차지한다. 확실히 다양성이 증가했다. 
# 
# 이를 종합하여 매년 어떻게 변화하는지 살펴보자.

# In[33]:


def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    The rows of ``group`` are ordered by descending ``prop`` and their
    running total is searched for ``q``.

    Parameters
    ----------
    group : DataFrame with a numeric ``prop`` column.
    q : cumulative share to reach (default 0.5).

    Returns
    -------
    int
        The 0-based insertion position of ``q`` in the running total, plus 1.
    """
    ordered = group.sort_values(by='prop', ascending=False)
    # Search the underlying ndarray: with a scalar ``q`` this yields a scalar
    # on every pandas version, whereas ``Series.searchsorted`` returned a
    # one-element array on old releases and a scalar on newer ones, so the
    # previous ``[0]`` indexing only worked on the old ones.
    running_total = ordered['prop'].cumsum().values
    return int(running_total.searchsorted(q)) + 1

# One count per (year, sex) group, then move 'sex' from the index to columns.
diversity = (
    top1000
    .groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)


# In[34]:


diversity


# In[35]:


diversity.plot(title="Number of popular names in top 50%")


# 남자이름 보단 여자이름이 더 다양한 것으로 나옴. 1980년대 이후 특히 증가함을 알 수 있다.
# 
# ## The "Last letter" Revolution
# 
# 2007년에 출생아 이름 연구가 Laura Wattenberg는 남자이름의 마지막 글자 분포가 지난 100년에 걸쳐 유의하게 변해왔다고 지적했다. 확인해보자.

# In[36]:


def get_last_letter(x):
    """Return the final element of ``x`` (the last character of a string)."""
    return x[-1]
# Series with the last letter of every name, labelled 'last_letter'.
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'
last_letters


# In[37]:


# Births summed by last letter (rows) and by (sex, year) (columns).
table = names.pivot_table(
    values='births',
    index=last_letters,
    columns=['sex', 'year'],
    aggfunc='sum',
)
table


# 연도가 너무 많으니 대표적인 3개의 연도만 확인해보기로 하자.

# In[38]:


# Keep only the 1910, 1960 and 2010 columns on the 'year' level.
subtable = table.reindex(level='year', columns=[1910, 1960, 2010])


# In[39]:


subtable.sum()


# In[40]:


# Divide each column by its own total.
letter_prop = subtable.div(subtable.sum(), axis='columns')


# In[41]:


letter_prop


# 이를 차트로 표시해보면,

# In[42]:


import matplotlib.pyplot as plt

# Two stacked bar charts: the 'M' columns on top, the 'F' columns below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
letter_prop['M'].plot.bar(ax=axes[0], rot=0, title='Male')
letter_prop['F'].plot.bar(ax=axes[1], rot=0, title='Female', legend=False)


# 남자의 경우, 1960년대 이후, "n"으로 끝나는 이름이 현저히 늘었다. 이를 전체 그룹에 대해 확장해보자.

# In[43]:


# Divide every (sex, year) column by its own total.
letter_prop = table / table.sum()
# Rows 'd', 'n' and 'y' of the 'M' columns, transposed so that each year
# becomes a row.  ``.loc`` replaces ``.ix``, which was deprecated in
# pandas 0.20 and removed in pandas 1.0; both selectors here are labels, so
# the selection is the same.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts


# 이를 차트로 표시하면, 

# In[44]:


# Plot the transposed table built above (columns 'd', 'n', 'y').
dny_ts.plot()


# 전체 연도로 패턴을 보니, 확실히 "n"으로 끝나는 빈도가 증가하고 있다.
# 
# ## 성별이 바뀌는 이름
# 
# 성별이 바뀌는 이름들이 있다. Lesley, Leslie 이름이 대표적인 예. 이를 구체적으로 확인해보자. 

# In[45]:


# Distinct names in the top-1000 table whose lower-cased form contains 'lesl'.
all_names = top1000['name'].unique()
mask = np.array(['lesl' in name.lower() for name in all_names])
lesley_like = all_names[mask]
lesley_like


# In[46]:


# Rows for those names only, and the total births recorded for each of them.
filtered = top1000[top1000['name'].isin(lesley_like)]
filtered.groupby('name')['births'].sum()


# In[47]:


# Births of the selected names per year (rows) and sex (columns).
table = filtered.pivot_table(
    values='births',
    index='year',
    columns='sex',
    aggfunc='sum',
)
table


# In[48]:


# Divide each row by its row total, turning counts into per-year shares.
table = table.div(table.sum(axis=1), axis=0)
table


# In[49]:


table.plot(
    style={'M': 'k-', 'F': 'k--'},
)


# "Lesl~"로 시작하는 이름은 1940년대에는 남자이름이었다가 그후 빠르게 여자이름으로 바뀜. 

# In[ ]: