import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Left table: seven rows whose 'key' values repeat, with 'data1' running 0..6.
df1 = DataFrame({
    'key': list('bbacaab'),
    'data1': range(7),
})

# Right table: one row per key, with 'data2' running 0..2.
df2 = DataFrame({
    'key': list('abd'),
    'data2': range(3),
})

# Notebook-style display of both inputs.
df1
df2

# Merge first without naming a join column, then with the column spelled out.
df1.merge(df2)
df1.merge(df2, on='key')

# Two tables whose join column has a different name on each side
# ('lkey' on the left, 'rkey' on the right).
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})

df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})

df3

df4

# left_on and right_on have to be given together.
# What happens if only one of them is given? Trying it is the quickest answer.
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

# Author's note from the original session: this call failed with a TypeError.
# The pandas source compares right_on with left_on; right_on was not passed,
# so it arrives as None. The failure actually happens even earlier, when
# len() is applied to that None.
# NOTE(review): the exception type/message may differ between pandas
# versions, and in a plain script this uncaught error stops execution here.
pd.merge(df3, df4, left_on='lkey')

# None has no length, so this raises a TypeError (deliberate demonstration).
len(None)

pd.merge(df1, df2, how='outer')

# The 'c' rows, which exist only in the left table, are included.
pd.merge(df1, df2, how='left')

# The 'd' row, which exists only in the right table, is included.
pd.merge(df1, df2, how='right')

# Rebuild df1/df2 so that keys 'a' and 'b' repeat on BOTH sides
# (a many-to-many merge).
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})

df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})

df1

df2

pd.merge(df1, df2, on='key', how='left')

# how parameter default is inner
pd.merge(df1, df2, how='inner')

pd.merge(df1, df2)

# IPython-only syntax: a trailing '?' displays the help for pd.merge.
pd.merge?

# Two frames sharing the column pair ('key1', 'key2'); each carries its own
# value column ('lval' / 'rval').
left = DataFrame({
    'key1': ['foo'] * 2 + ['bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})

right = DataFrame({
    'key1': ['foo'] * 2 + ['bar'] * 2,
    'key2': ['one'] * 3 + ['two'],
    'rval': [4, 5, 6, 7],
})

# Notebook-style display of both inputs.
left
right

# Outer merge on both key columns.
left.merge(right, on=['key1', 'key2'], how='outer')

# Merge on 'key1' only: 'key2' exists on both sides, first with the default
# suffixes and then with explicit ones.
left.merge(right, on='key1')
left.merge(right, on='key1', suffixes=('_left', '_right'))

# left1 keeps its join key in a column; right1 keeps it in the index.
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                   'value': range(6)})

right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

left1

right1

# left_on names the column used as the join key on the left side;
# right_index=True says the right side is joined on its index.
pd.merge(left1, right1, left_on='key', right_index=True)

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

# right_on (or right_index) is not given, so this raises an error
# (deliberate demonstration; in a plain script it stops execution here).
pd.merge(left1, right1, left_on='key')

# Author's note: with right_on='group_val' no values intersect, so the result
# showed no rows.
# NOTE(review): 'key' holds strings and 'group_val' holds floats; newer pandas
# versions may reject merging such columns instead - confirm.
pd.merge(left1, right1, left_on='key', right_on='group_val')

pd.merge(left1, right1, left_on='key', right_on='group_val', how='outer')

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

# lefth has its two join keys as columns; righth has a two-level index
# (state name, year) built from the two lists passed as `index`.
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})

righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])

lefth

righth

# Works: the two left_on columns line up with the two levels of righth's index.
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

# key1 alone does not line up with the two-level right index (nothing is
# supplied for the year level), so this raises an error
# (deliberate demonstration; in a plain script it stops execution here).
pd.merge(lefth, righth, left_on=['key1'], right_index=True)

pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, how='outer')

# Two float-valued frames that are joined purely on their index labels.
left2 = DataFrame({'Ohio': [1., 3., 5.], 'Nevada': [2., 4., 6.]},
                  index=list('ace'),
                  columns=['Ohio', 'Nevada'])

right2 = DataFrame({'Missouri': [7., 9., 11., 13.],
                    'Alabama': [8., 10., 12., 14.]},
                   index=list('bcde'),
                   columns=['Missouri', 'Alabama'])

# Notebook-style display of both inputs.
left2
right2

# Index-on-index merge: outer first, then the default (inner).
left2.merge(right2, how='outer', left_index=True, right_index=True)
left2.merge(right2, left_index=True, right_index=True)

# Same result as the outer merge above, but join() reads much more cleanly.
left2.join(right2, how='outer')

# join() is the tool to reach for when merging on the index.
left2.join(right2, how='inner')

# Notebook-style display of the column-keyed / index-keyed pair.
left1
right1

# join() is a left join by default, so the 'c' row of left1 is kept.
left1.join(right1, on='key')

# A third index-labelled frame, used to join several frames in one call.
another = DataFrame({'New York': [7., 9., 11., 16.],
                     'Oregon': [8., 10., 12., 17.]},
                    index=list('acef'),
                    columns=['New York', 'Oregon'])

left2
right2
another

# Passing a list joins all of the frames on their indexes at once.
left2.join([right2, another])

left2
right2
another

left2.join([right2, another], how='outer')

# A 3x4 integer array used to show concatenation along each axis.
arr = np.arange(12).reshape(3, 4)

arr
type(arr)

# axis=1 glues the copies side by side (more columns) ...
np.concatenate((arr, arr), axis=1)

# ... while axis=0 stacks them vertically (more rows).
np.concatenate((arr, arr), axis=0)

# Three Series with disjoint index labels, used to show pd.concat.
s1 = Series([0, 1], index=['a', 'b'])

s2 = Series([2, 3, 4], index=['c', 'd', 'e'])

s3 = Series([5, 6], index=['f', 'g'])

s1

s2

s3

# Default axis=0: the pieces are glued end to end.
pd.concat([s1, s2, s3])

# The result of concatenating along axis=0 is a Series.
type(pd.concat([s1, s2, s3]))

pd.concat([s1, s2, s3], axis=1)

# Concatenating along axis=1 produces a DataFrame instead.
type(pd.concat([s1, s2, s3], axis=1))

# s4 overlaps s1 on the labels 'a' and 'b'.
s4 = pd.concat([s1 * 5, s3])

s1

s1 * 5

s3

s4

pd.concat([s1, s4], axis=1)

# join='inner' keeps only the labels present in both inputs.
pd.concat([s1, s4], axis=1, join='inner')

# The `join_axes` argument has been removed from pd.concat; reindexing the
# concatenated result selects the same row labels in the same order.
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])

result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])

s1

s2

s3

# keys label the pieces in order: 'one' -> s1, 'two' -> s1 (again),
# 'three' -> s3. Note that s2 is not part of `result`.
result

# unstack() is covered in more detail later in this file.
result.unstack()

pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

# Two frames with different columns and partly overlapping row labels, used to
# show how `keys` adds an outer column level when concatenating along axis=1.
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])

df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])

df1

df2

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

# Passing a dict: the dict keys play the role of the `keys` argument.
pd.concat({'level1': df1, 'level2': df2}, axis=1)

# `names` labels the column levels created by the concatenation.
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])

# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes. Can also add a layer of hierarchical indexing on the
# concatenation axis, which may be useful if the labels are the same (or
# overlapping) on the passed axis number
# (IPython-only syntax: a trailing '?' displays the help for pd.concat.)
pd.concat?

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper'])

# Author's note: for these frames `names` accepts at most two entries.
# NOTE(review): presumably the three-name call below raises because the result
# has only two column levels - confirm; in a plain script an uncaught error
# would stop execution here.
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower', 'test'])

# Random frames whose row labels carry no meaning; df2 has a subset of df1's
# columns, listed in a different order.
df1 = DataFrame(np.random.randn(3, 4), columns=list('abcd'))
df2 = DataFrame(np.random.randn(2, 3), columns=list('bda'))

df1
df2

# ignore_index=True replaces the original row labels with one fresh index.
pd.concat([df1, df2], ignore_index=True)

# Without it, the original row labels of each piece are kept.
pd.concat([df1, df2])

# Two Series sharing the same labels; `a` has gaps (NaN) that `b` can fill.
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])

b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])

# Blank out the last element BY POSITION. The index holds string labels, so
# the position is spelled with .iloc rather than relying on b[-1] falling
# back to positional access.
b.iloc[-1] = np.nan

a

np.arange(len(a))

b

# np.where works like an element-wise conditional expression:
# pd.isnull(a) is True where `a` is missing; at those positions the value
# from `b` is taken, everywhere else the value from `a` is kept.
np.where(pd.isnull(a), b, a)

pd.isnull(a)

# Positional slices: all but the last two elements of b / a from the third on.
b.iloc[:-2]

a.iloc[2:]

b.iloc[:-2].combine_first(a.iloc[2:])

# Bare attribute access (no call): in a notebook this displays the bound method.
b.iloc[:-2].combine_first

b.iloc[:-2]

b.combine_first(a)

# combine_first gives priority to the caller: values of `a` are used, and
# `b` is consulted only where `a` is missing.
a.combine_first(b)

a

# df1 has gaps in columns 'a' and 'b'; df2 has one extra row and no 'c' column.
df1 = DataFrame({
    'a': [1., np.nan, 5., np.nan],
    'b': [np.nan, 2., np.nan, 6.],
    'c': range(2, 18, 4),
})

df2 = DataFrame({
    'a': [5., 4., np.nan, 3., 7.],
    'b': [np.nan, 3., 4., 6., 8.],
})

df1
df2

# Column by column: keep df1's values and fall back to df2 where df1 is missing.
df1.combine_first(df2)

# A small DataFrame whose row and column indexes are string labels, with both
# axes named ('state' for the rows, 'number' for the columns).
data = DataFrame(np.arange(6).reshape(2, 3),
                 index=['Ohio', 'Colorado'],
                 columns=['one', 'two', 'three'])
data.index.name = 'state'
data.columns.name = 'number'

data

# Same values, but only the row index is named: a plain list is enough for
# `columns` when no axis name is needed.
data2 = DataFrame(np.arange(6).reshape(2, 3),
                  index=['Ohio', 'Colorado'],
                  columns=['one', 'two', 'three'])
data2.index.name = 'state'

data2

# stack() pivots the columns into the innermost level of the row index.
result = data.stack()

result
type(result)

# By default unstack() pulls the innermost level (one/two/three) back out
# into columns.
result.unstack()
type(result.unstack())

# A level can be chosen by position or by name.
result.unstack(0)
result.unstack('state')
result.unstack(1)
result.unstack('number')

# Two Series with partly overlapping labels ('c' and 'd' occur in both).
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])

s2 = Series([4, 5, 6], index=['c', 'd', 'e'])

# keys adds an outer index level that tags each piece ('one' / 'two').
data2 = pd.concat([s1, s2], keys=['one', 'two'])

s1

s2

# Plain concatenation, without the extra outer level, for comparison.
data3 = pd.concat([s1, s2])

data3

# In the index, s1 is tagged 'one' and s2 is tagged 'two'.
data2

data2.unstack()

# Round trip: unstack, then stack back. Author's note: the point is that the
# missing values are not included in the result.
# NOTE(review): whether stack() drops missing values by default depends on the
# pandas version - confirm.
data2.unstack().stack()

data2

# dropna=False asks stack() to keep the missing values.
data2.unstack().stack(dropna=False)

# Builds a two-column frame from `result` (assigned earlier in this file);
# the columns axis is named 'side'.
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))

df

result

# When unstacking, the moved level becomes the lowest column level in the
# result, i.e. 'state' ends up beneath 'side'.
df.unstack('state')

# Stacking 'side' back moves it into the rows, beneath 'number'.
df.unstack('state').stack('side')

%%writefile ch07/pivot.csv
date, item, value
1959-03-31 00:00:00, realgdp, 2710.349
1959-03-31 00:00:00, infl, 0.000
1959-03-31 00:00:00, unemp, 5.800
1959-06-30 00:00:00, realgdp, 2778.801
1959-06-30 00:00:00, infl, 2.340
1959-06-30 00:00:00, unemp, 5.100
1959-09-30 00:00:00, realgdp, 2775.488
1959-09-30 00:00:00, infl, 2.740
1959-09-30 00:00:00, unemp, 5.300
1959-12-31 00:00:00, realgdp, 2785.204        

# The header line of ch07/pivot.csv is written as "date, item, value": every
# comma is followed by a space. Read as-is, the second and third columns are
# therefore named ' item' and ' value' (with a leading space), which is why
# pivoting on 'item' / 'value' failed in the original session. The earlier
# workaround was to skip the header row and pass names=[...] by hand;
# skipinitialspace=True removes the cause instead, and also strips the
# leading space from the values of the 'item' column.
ldata = pd.read_csv('ch07/pivot.csv', skipinitialspace=True)

ldata[:10]

type(ldata)

# index:   column whose values become the row index
# columns: column whose values become the column index
# values:  column that supplies the cell values
# (passed by keyword: recent pandas no longer accepts these positionally)
pivoted = ldata.pivot(index='date', columns='item', values='value')

pivoted.head()

# Add a second value column so there is more than one candidate for `values`.
ldata['value2'] = np.random.randn(len(ldata))

ldata[:10]

# Leaving `values` out yields hierarchical columns (value column x item).
pivoted = ldata.pivot(index='date', columns='item')

pivoted[:5]

# Narrow the hierarchical columns down to the 'value' group and show the
# first five rows.
pivoted['value'][:5]

# The same reshaping, spelled as set_index + unstack.
unstacked = ldata.set_index(['date', 'item']).unstack('item')

unstacked[:7]

# Without the unstack, 'item' stays in the row index ...
unstacked = ldata.set_index(['date', 'item'])

# ... so date and item label the rows, and value / value2 are the columns.
unstacked[:7]

# set_index decides what goes into the row index; every column that is not
# named there stays a regular column.
unstacked = ldata.set_index(['date'])

unstacked

# Seven rows in which some (k1, k2) pairs repeat.
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})

data

# If this construction is unclear, type it in: the interpreter shows the
# resulting list.
['one'] * 3 + ['two'] * 4

data.duplicated()

data2 = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                   'k2': [1, 1, 2, 3, 3, 3, 4]})

# A row only counts as a duplicate when BOTH columns match an earlier row.
data2.duplicated()

# Returns the rows for which duplicated() is False.
data.drop_duplicates()

data['v1'] = range(7)

data

# Only the k1 column is considered when looking for duplicates, so just one
# 'one' row and one 'two' row can remain.
data.drop_duplicates(['k1'])

# `take_last=True/False` was replaced by `keep='last'/'first'` in pandas.
data.drop_duplicates(['k1'], keep='last')

data.drop_duplicates(['k1'], keep='first')

data.drop_duplicates(['k1', 'k2'])

# For the repeated ('two', 4) pair, keep='first' retains row 5 and
# keep='last' retains row 6.
data.drop_duplicates(['k1', 'k2'], keep='last')

# IPython-only syntax: a trailing '?' displays the help for drop_duplicates.
data.drop_duplicates?

# Food names in mixed capitalisation, with a weight for each.
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami',
                           'honey ham', 'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

data

# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = {
                  'bacon': 'pig',
                  'pulled pork': 'pig',
                  'pastrami': 'cow',
                  'corned beef': 'cow',
                  'honey ham': 'pig',
                  'nova lox': 'salmon'
                  }

# First check what data['food'] contains.
data['food']

# IPython-only syntax: displays the help for Series.map.
data['food'].map?

# Bare attribute access (no call): in a notebook this displays the bound method.
data['food'].map

# Apply map to data['food'] with a function that lower-cases every value.
data['food'].map(str.lower)

# Then map again, this time passing the meat_to_animal dictionary:
# bacon -> pig, pastrami -> cow, and so on.
data['food'].map(str.lower).map(meat_to_animal)

# Store the mapped values in a newly created 'animal' column.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)

# The capitalisation in the 'food' column itself is unchanged: the
# lower-casing was only used to compute the 'animal' column.
data

# The same mapping expressed as a single function.
data['food'].map(lambda x: meat_to_animal[x.lower()])

# A Series in which -999 and -1000 act as sentinel values.
data = Series([1., -999., 2., -999., -1000., 3.])

data

# One sentinel replaced by NaN.
data.replace(to_replace=-999, value=np.nan)

# Very intuitive: to replace more than one value, pass them as a list and
# every one of them is replaced.
data.replace(to_replace=[-999, -1000], value=np.nan)

# The replacement side can be a list as well; old and new values are then
# paired up in order.
data.replace(to_replace=[-999, -1000], value=[999, 1000])

data.replace(to_replace=[-999, -1000], value=[np.nan, 0])

# A dict of {old: new} works too, which is very convenient.
data.replace(to_replace={-999: np.nan, -1000: 0})

# A 3x4 frame with string labels on both axes, used to show label renaming.
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

data

# Index.map applies a function to every label and returns the new labels ...
data.index.map(str.upper)

# ... which can be assigned back to change the frame's index in place.
data.index = data.index.map(str.upper)

data

# rename() builds a transformed copy instead.
data.rename(index=str.title, columns=str.upper)

# The index was upper-cased by the assignment above; the rename() call left
# the original untouched.
data

# Dicts rename only the listed labels.
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

data

# inplace=True modifies `data` itself.
# NOTE(review): the original note says the call always returns a reference to
# the DataFrame; the return value is bound to `_` and never used.
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

# OHIO has now become INDIANA.
data

ages = [20, 22, 25, 27, 21, 23, 27, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]

# 1st argument: the values to bin
# 2nd argument: the bin edges
cats = pd.cut(ages, bins)

# (In IPython, `pd.cut?` displays the help for pd.cut; that syntax is not
# valid in a plain Python script, so it is kept here only as a note.)

cats

# Integer bin number for every age (formerly exposed as `cats.labels`).
cats.codes

# The bins themselves (formerly exposed as `cats.levels`).
cats.categories

# How many ages fall into each bin.
pd.Series(cats).value_counts()

# right=False closes the intervals on the left side instead of the right.
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

bins

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

# Custom names for the bins instead of interval labels.
pd.cut(ages, bins, labels=group_names)

data = np.random.randn(20)

data

# Arguments: the data, the number of equal-width bins, and `precision`
# (the number of decimals used when displaying the bin edges).
pd.cut(data, 4, precision=2)

data = np.random.randn(1000)  # Normally distributed

cats = pd.qcut(data, 4)  # Cut into quartiles

cats

# Quantile-based bins hold the same number of points each.
pd.Series(cats).value_counts()

data2 = pd.cut(data, 4, precision=2)

# For comparison with qcut: equal-width bins hold very different counts.
pd.Series(data2).value_counts()

# Explicit quantiles (numbers between 0 and 1) instead of a bin count.
cats2 = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

cats2

pd.Series(cats2).value_counts()

# In IPython, `np.random.seed?` displays the help shown below. The pasted
# output is not Python code, so it is kept here as a comment:
#
#   Type:        builtin_function_or_method
#   String form: <built-in method seed of mtrand.RandomState object at 0x100540c78>
#   Docstring:
#   seed(seed=None)
#
#   Seed the generator.
#
#   This method is called when `RandomState` is initialized. It can be
#   called again to re-seed the generator. For details, see `RandomState`.
#
#   Parameters
#   ----------
#   seed : int or array_like, optional
#       Seed for `RandomState`.
#
#   See Also
#   --------
#   RandomState
np.random.seed(12345)

data = DataFrame(np.random.randn(1000, 4))

data.describe()

# Values in one column whose absolute value exceeds 3.
col = data[3]

col[np.abs(col) > 3]

# Indexing the frame with a boolean frame.
data[np.abs(data) > 3]

data[(np.abs(data) > 3)]

# To select every ROW that contains at least one value exceeding 3 in
# absolute value, reduce the boolean frame with any() across the columns.
# The axis is passed by keyword: recent pandas no longer accepts it
# positionally.
data[(np.abs(data) > 3).any(axis=1)]

# np.sign extracts just the sign of each value.
np.sign(data[2])

# Cap the values beyond +/-3: multiply the sign of the original value by 3
# so that the sign is preserved.
data[np.abs(data) > 3] = np.sign(data) * 3

data[(np.abs(data) >= 3).any(axis=1)]

# IPython-only syntax: a trailing '?' displays the help for the any() method.
(np.abs(data) > 3).any?

# Experiments with the axis argument of any().
# NOTE(review): without an axis, or with axis 0, the reduction yields one
# boolean per COLUMN, which does not line up with the rows being indexed;
# depending on the pandas version these lookups may raise - confirm.
data[(np.abs(data) > 3).any()]

data[(np.abs(data) > 3).any(0)]

# NOTE(review): a DataFrame has only axes 0 and 1, so axis 2 is presumably
# rejected with an error - confirm.
data[(np.abs(data) > 3).any(2)]

# np.any without an axis reduces over every element.
np.any([[True, False], [True, True]])

# With axis=0 the reduction runs down each column.
np.any([[True, False], [True, True]], axis=0)

np.any([[True, False], [False, False]], axis=0)

# A 5x4 frame whose rows are reordered by random permutations.
df = DataFrame(np.arange(20).reshape(5, 4))

# A random ordering of the row positions 0..4.
sampler = np.random.permutation(5)

sampler
df

# take() returns the rows in the order given by the positions.
df.take(sampler)

# 1. Pass the length of df to get a permutation of its row positions.
# 2. Hand those positions to df.take.
# 3. The frame has five rows; slicing the permutation with [:3] limits the
#    result to three of them.
df.take(np.random.permutation(len(df))[:3])

df.take(np.random.permutation(len(df)))

# The permutation is different on every call.
np.random.permutation(len(df))
np.random.permutation(len(df))
np.random.permutation(5)

bag = np.array([5, 7, -1, 6, 4])

# Ten random integers drawn from the range [0, len(bag)).
sampler = np.random.randint(0, len(bag), size=10)

sampler

draws = bag[sampler]

# The elements of bag are arranged in the order given by sampler.
# Example from the original session: sampler[0] was 1, giving bag[1] = 7,
# and sampler[1] was 3, giving bag[3] = 6.
draws

# A small frame with a categorical-looking 'key' column.
df = DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})

df

# One indicator column per distinct key: the column matching a row's key is
# set, all others are not.
pd.get_dummies(df['key'])

df['key']

# prefix= prepends 'key_' to the generated column names.
dummies = pd.get_dummies(df['key'], prefix='key')

dummies

# Put the indicator columns next to the remaining data column.
df_with_dummy = df.loc[:, ['data1']].join(dummies)

df_with_dummy

mnames = ['movie_id', 'title', 'genres']

# The fields are separated by the two-character string '::'. Separators
# longer than one character are handled by the Python parsing engine, so it
# is requested explicitly.
movies = pd.read_table('ch07/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

movies[:10]

# Each value is a '|'-separated list of genres; the distinct genres have to
# be extracted from it.
movies.genres

# 1. Take the genre strings from movies.genres one at a time.
# 2. Split each one on '|'.
# 3. Wrap the pieces in a set so that there are no repeats within a movie.
# 4. The result is a generator yielding one set of genres per movie.
genre_iter = (set(x.split('|')) for x in movies.genres)

genre_iter

# (In IPython, `set.union?` displays the help for set.union; that syntax is
# not valid in a plain Python script, so it is kept here only as a note.)

# The leading * unpacks the generator, so every per-movie set is passed as a
# separate argument to union(), which merges them all; sorted() then orders
# the distinct genres. Starting from an empty set keeps this working even
# when there are no movies at all.
genres = sorted(set().union(*genre_iter))

genres

# One all-zero indicator column per genre, one row per movie.
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)

dummies

dummies[:10]

# Set the indicator to 1 for each genre the movie belongs to. `i` counts
# from 0, matching the default row labels of `dummies`, so label-based .loc
# is the direct replacement for the removed .ix indexer.
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1

dummies[:10]

movies_windic = movies.join(dummies.add_prefix('Genre_'))

movies_windic.loc[0]

movies_windic.loc[1]

# Ten uniform random numbers in [0, 1) ...
values = np.random.rand(10)

values

# ... bucketed into five equal-width bins.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# Binning with cut and feeding the result to get_dummies yields one
# indicator column per bin.
pd.get_dummies(pd.cut(values, bins=bins))

pd.cut(values, bins=bins)

val = 'a,b, guido'

# Split a comma-separated string.
val.split(',')

# Strip the surrounding whitespace from every piece.
pieces = [x.strip() for x in val.split(',')]

pieces

first, second, third = pieces

first

second

third

first + '::' + second + '::' + third

# 'separator'.join(list_of_strings)
'::'.join(pieces)

'guido' in val

val.index(',')

# find() returns -1 when the substring is absent ...
val.find(':')

val

# ... whereas index() raises ValueError. The error is caught so that the
# demonstration does not abort the rest of the script.
try:
    val.index(':')
except ValueError as exc:
    print(exc)

# count() returns how many times the substring occurs.
val.count(',')

val.replace(',', '::')

val.replace(',', '')

import re

text = 'foo    bar\t baz   \tqux'

text

# Split on runs of whitespace. Patterns are written as raw strings so that
# '\s' reaches the regex engine untouched.
re.split(r'\s+', text)

# Compiling once gives a reusable pattern object.
regex = re.compile(r'\s+')

regex.split(text)

# findall returns every whitespace run that matched.
regex.findall(text)

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# NOTE: the original author remarks that this step was omitted (presumably in
# the material being followed).
# re.I (re.IGNORECASE) makes the pattern case-insensitive.
regex = re.compile(pattern, re.I)

regex.findall(text)

# search returns only the first match in the text.
m = regex.search(text)

m

text[m.start():m.end()]

# match only succeeds at the very start of the string; the text begins with
# 'Dave ', which the pattern does not match, so this prints None.
print(regex.match(text))

# sub returns a new string with every match replaced.
print(regex.sub('REDACTED', text))

# The same pattern with parentheses around user name, domain and suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

regex = re.compile(pattern, flags=re.IGNORECASE)

m = regex.match('wesm@bright.net')

type(m)

# groups() returns the captured pieces; group(0) is the whole match.
m.groups()

m.group(0)

m.group(1)

m.group(2)

m.group(3)

# With groups in the pattern, findall returns one tuple per match.
regex.findall(text)

# \1, \2, \3 refer to the captured groups in the replacement string.
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

# Named groups; re.VERBOSE allows the pattern to be spread over lines.
regex = re.compile(r'''
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})''', flags=re.IGNORECASE | re.VERBOSE)

m = regex.match('wesm@bright.net')

m.groupdict()

# E-mail addresses keyed by name; one entry is missing (NaN).
data = {'Dave': 'dave@google.com',
        'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com',
        'Wes': np.nan}

data = Series(data)

data

data.isnull()

# The .str methods skip over the missing value instead of failing on it.
data.str.contains('gmail')

# `pattern` is the grouped e-mail pattern assigned earlier in this file.
pattern

data.str.findall(pattern, flags=re.IGNORECASE)

# Series.str.match only reports True/False in current pandas, so the tuple of
# captured groups is taken from findall instead: each address yields a list
# holding one (user, domain, suffix) tuple, and .str[0] picks that tuple.
# The missing entry stays NaN.
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]

matches

# Elements of the tuples can be retrieved with .str.get(i) or .str[i].
matches.str.get(1)

matches.str[0]

matches.str.get(2)

matches.str.get(0)

matches.str[1]

matches.str[2]

# Plain string slicing works through .str as well.
data.str[:5]

import json

# Load the food database; the context manager closes the file as soon as
# the JSON has been parsed.
with open('ch07/foods-2011-10-03.json') as f:
    db = json.load(f)

len(db)

# Each record is a dict; look at its keys and at two of its nutrient entries.
db[0].keys()

db[0]['nutrients'][0]

db[0]['nutrients'][1]

# The nutrient list of a single food converts directly into a DataFrame.
nutrients = DataFrame(db[0]['nutrients'])

nutrients[:7]

len(nutrients)

# Per-food information: only these fields are pulled out of each record.
info_keys = ['description', 'group', 'id', 'manufacturer']

info = DataFrame(db, columns=info_keys)

info[:5]

len(info)

info

# Number of foods per food group (ten largest groups).
info['group'].value_counts()[:10]

# Build one nutrient table covering every food: convert each food's nutrient
# list to a DataFrame, tag it with the food's id, then concatenate.
nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

db[0]['nutrients']

nutrients

nutrients[:5]

# Count the fully duplicated rows, drop them, and check that none remain.
nutrients.duplicated().sum()

nutrients = nutrients.drop_duplicates()

nutrients.duplicated().sum()

nutrients

# Both tables have 'description' and 'group' columns, so each is renamed to
# say what it describes before the two tables are merged.
col_mapping = {'description': 'food',
               'group': 'fgroup'}

info = info.rename(columns=col_mapping, copy=False)

info

col_mapping = {'description': 'nutrient',
               'group': 'nutgroup'}

nutrients = nutrients.rename(columns=col_mapping, copy=False)

nutrients

# Attach the per-food information to every nutrient row.
ndata = pd.merge(nutrients, info, on='id', how='outer')

ndata

ndata[:5]

# Row labelled 30000 (label-based .loc replaces the removed .ix indexer).
ndata.loc[30000]

# Median value per nutrient and food group.
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)

result

# Series.order() was removed from pandas; sort_values() is its replacement.
result['Zinc, Zn'].sort_values().plot(kind='barh')

by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])


def get_maximum(x):
    """Return the row of group `x` whose 'value' is largest."""
    return x.xs(x.value.idxmax())


def get_minimum(x):
    """Return the row of group `x` whose 'value' is smallest."""
    return x.xs(x.value.idxmin())


max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

# make the food a little smaller
max_foods['food'] = max_foods['food'].str[:50]

max_foods

max_foods.loc['Amino Acids']['food']