import numpy as np
import pandas as pd
from pandas import DataFrame, Series
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
'data2': range(3)})
df1
data1 | key | |
---|---|---|
0 | 0 | b |
1 | 1 | b |
2 | 2 | a |
3 | 3 | c |
4 | 4 | a |
5 | 5 | a |
6 | 6 | b |
df2
data2 | key | |
---|---|---|
0 | 0 | a |
1 | 1 | b |
2 | 2 | d |
pd.merge(df1, df2)
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
이 예에서는 양쪽 DataFrame에 공통으로 존재하는 key 칼럼이 자동으로 조인 키로 사용된다 (data1, key / data2, key 중 key가 양쪽에 겹침)
이렇게도 작동한다고 예를 들었지만 명시적으로 지정하는 습관을 들이자
df1에서 c와 df2에서 d는 교차하는게 없어서 값 누락
상식적으로 기본 전제는 교차하는 것이다. 왜냐하면 교차도 안하는데 출력해주면 나중에 문제가 될 수 있으니 이렇게 한듯하다. 교차 안되는건 옵션으로 표현해 줄 수 있을 것이다. 왜냐고? 그게 안전빵이니까
pd.merge(df1, df2, on='key')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
'data2': range(3)})
df3
data1 | lkey | |
---|---|---|
0 | 0 | b |
1 | 1 | b |
2 | 2 | a |
3 | 3 | c |
4 | 4 | a |
5 | 5 | a |
6 | 6 | b |
df4
data2 | rkey | |
---|---|---|
0 | 0 | a |
1 | 1 | b |
2 | 2 | d |
# left, right 둘 다 같이 써줘야 한다.
# 하나만 써주면 어떻게 될까? 이럴땐 Test 해보는게 짱!
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
data1 | lkey | data2 | rkey | |
---|---|---|---|---|
0 | 0 | b | 1 | b |
1 | 1 | b | 1 | b |
2 | 6 | b | 1 | b |
3 | 2 | a | 0 | a |
4 | 4 | a | 0 | a |
5 | 5 | a | 0 | a |
# TypeError!
# 소스를 보면 right_on == left_on 비교하는 곳이 있다.
# 당연히 right_on은 명시하지 않았으니 NoneType으로 넘어가서 비교가 안된다!
# 아 그 전에 len 함수에서 NoneType의 길이를 재려고 하니 오류가 나는 것임!
pd.merge(df3, df4, left_on='lkey')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-16-20cc39e49f9a> in <module>() 3 # 당연히 right_on은 명시하지 않았으니 NoneType으로 넘어가서 비교가 안된다! 4 # 아 그 전에 len 함수에서 NoneType의 길이를 재려고 하니 오류가 나는 것임! ----> 5 pd.merge(df3, df4, left_on='lkey') /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy) 34 right_on=right_on, left_index=left_index, 35 right_index=right_index, sort=sort, suffixes=suffixes, ---> 36 copy=copy) 37 return op.get_result() 38 if __debug__: /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy) 178 (self.left_join_keys, 179 self.right_join_keys, --> 180 self.join_names) = self._get_merge_keys() 181 182 def get_result(self): /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _get_merge_keys(self) 296 left_keys, right_keys 297 """ --> 298 self._validate_specification() 299 300 left_keys = [] /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _validate_specification(self) 414 raise AssertionError() 415 self.left_on = [None] * n --> 416 if not ((len(self.right_on) == len(self.left_on))): 417 raise AssertionError() 418 TypeError: object of type 'NoneType' has no len()
# None은 당연히 길이가 없지.
len(None)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-17-a613272357ac> in <module>() 1 # None은 당연히 길이가 없지. ----> 2 len(None) TypeError: object of type 'NoneType' has no len()
pd.merge(df1, df2, how='outer')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
6 | 3 | c | NaN |
7 | NaN | d | 2 |
# left에만 있는 c까지 포함이 된 것을 확인할 수 있다.
pd.merge(df1, df2, how='left')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
6 | 3 | c | NaN |
# right에만 있는 d까지 포함
pd.merge(df1, df2, how='right')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 1 | b | 1 |
2 | 6 | b | 1 |
3 | 2 | a | 0 |
4 | 4 | a | 0 |
5 | 5 | a | 0 |
6 | NaN | d | 2 |
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
'data2': range(5)})
df1
data1 | key | |
---|---|---|
0 | 0 | b |
1 | 1 | b |
2 | 2 | a |
3 | 3 | c |
4 | 4 | a |
5 | 5 | b |
df2
data2 | key | |
---|---|---|
0 | 0 | a |
1 | 1 | b |
2 | 2 | a |
3 | 3 | b |
4 | 4 | d |
pd.merge(df1, df2, on='key', how='left')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 0 | b | 3 |
2 | 1 | b | 1 |
3 | 1 | b | 3 |
4 | 5 | b | 1 |
5 | 5 | b | 3 |
6 | 2 | a | 0 |
7 | 2 | a | 2 |
8 | 4 | a | 0 |
9 | 4 | a | 2 |
10 | 3 | c | NaN |
# how parameter default is inner
pd.merge(df1, df2, how='inner')
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 0 | b | 3 |
2 | 1 | b | 1 |
3 | 1 | b | 3 |
4 | 5 | b | 1 |
5 | 5 | b | 3 |
6 | 2 | a | 0 |
7 | 2 | a | 2 |
8 | 4 | a | 0 |
9 | 4 | a | 2 |
pd.merge(df1, df2)
data1 | key | data2 | |
---|---|---|---|
0 | 0 | b | 1 |
1 | 0 | b | 3 |
2 | 1 | b | 1 |
3 | 1 | b | 3 |
4 | 5 | b | 1 |
5 | 5 | b | 3 |
6 | 2 | a | 0 |
7 | 2 | a | 2 |
8 | 4 | a | 0 |
9 | 4 | a | 2 |
pd.merge?
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
'key2': ['one', 'two', 'one'],
'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
'key2': ['one', 'one', 'one', 'two'],
'rval': [4, 5, 6, 7]})
left
key1 | key2 | lval | |
---|---|---|---|
0 | foo | one | 1 |
1 | foo | two | 2 |
2 | bar | one | 3 |
right
key1 | key2 | rval | |
---|---|---|---|
0 | foo | one | 4 |
1 | foo | one | 5 |
2 | bar | one | 6 |
3 | bar | two | 7 |
pd.merge(left, right, on=['key1', 'key2'], how='outer')
key1 | key2 | lval | rval | |
---|---|---|---|---|
0 | foo | one | 1 | 4 |
1 | foo | one | 1 | 5 |
2 | foo | two | 2 | NaN |
3 | bar | one | 3 | 6 |
4 | bar | two | NaN | 7 |
pd.merge(left, right, on='key1')
key1 | key2_x | lval | key2_y | rval | |
---|---|---|---|---|---|
0 | foo | one | 1 | one | 4 |
1 | foo | one | 1 | one | 5 |
2 | foo | two | 2 | one | 4 |
3 | foo | two | 2 | one | 5 |
4 | bar | one | 3 | one | 6 |
5 | bar | one | 3 | two | 7 |
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
key1 | key2_left | lval | key2_right | rval | |
---|---|---|---|---|---|
0 | foo | one | 1 | one | 4 |
1 | foo | one | 1 | one | 5 |
2 | foo | two | 2 | one | 4 |
3 | foo | two | 2 | one | 5 |
4 | bar | one | 3 | one | 6 |
5 | bar | one | 3 | two | 7 |
인자 | 설명 |
---|---|
left | 머지하려는 DataFrame 중 왼쪽에 위치한 DataFrame |
right | 머지하려는 DataFrame 중 오른쪽에 위치한 DataFrame |
how | 조인방법. 'inner', 'outer', 'left', 'right'. 기본값은 inner |
on | 조인하려는 칼럼 이름. 반드시 두 DataFrame 객체 모두에 있는 이름이어야 한다. 만약 명시되지 않고 다른 조인 키도 주어지지 않으면 left와 right에서 공통되는 칼럼을 조인 키로 사용한다. |
left_on | 조인 키로 사용할 left DataFrame의 칼럼 |
right_on | 조인 키로 사용할 right DataFrame의 칼럼 |
left_index | 조인 키로 사용할 left DataFrame의 색인 로우(다중 색인일 경우의 키) |
right_index | 조인 키로 사용할 right DataFrame의 색인 로우(다중 색인일 경우의 키) |
sort | 조인 키에 따라 병합된 데이터를 사전 순으로 정렬. 기본값은 True. 대용량 데이터의 경우 False라면 성능상의 이득을 얻을 수도 있다. |
suffixes | 칼럼 이름이 겹칠 경우 각 칼럼 이름 뒤에 붙일 문자열의 튜플. 기본값은 ('_x', '_y'). 만약 'data'라는 칼럼 이름이 양쪽 DataFrame에 같이 존재하면 결과에서는 'data_x', 'data_y'로 나타난다. |
copy | False일 경우 예외적인 경우에 결과로 데이터가 복사되지 않도록 한다. 기본값은 항상 복사가 이루어진다. |
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left1
key | value | |
---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | a | 2 |
3 | a | 3 |
4 | b | 4 |
5 | c | 5 |
right1
group_val | |
---|---|
a | 3.5 |
b | 7.0 |
# left_on으로 어떤 컬럼이 조인 키로 사용될지 결정
# right_index로 어떤 인덱스가 조인 키로 사용될지 결정
pd.merge(left1, right1, left_on='key', right_index=True)
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
# right_on을 입력하지 않으니 당연히 Error!
pd.merge(left1, right1, left_on='key')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-41-273a0ff3019a> in <module>() 1 # right_on을 입력하지 않으니 당연히 Error! ----> 2 pd.merge(left1, right1, left_on='key') /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy) 34 right_on=right_on, left_index=left_index, 35 right_index=right_index, sort=sort, suffixes=suffixes, ---> 36 copy=copy) 37 return op.get_result() 38 if __debug__: /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy) 178 (self.left_join_keys, 179 self.right_join_keys, --> 180 self.join_names) = self._get_merge_keys() 181 182 def get_result(self): /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _get_merge_keys(self) 296 left_keys, right_keys 297 """ --> 298 self._validate_specification() 299 300 left_keys = [] /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _validate_specification(self) 414 raise AssertionError() 415 self.left_on = [None] * n --> 416 if not ((len(self.right_on) == len(self.left_on))): 417 raise AssertionError() 418 TypeError: object of type 'NoneType' has no len()
# right_on에 group_val을 입력하면 교차하는게 하나도 없으니 아무것도 표시하지 않음
pd.merge(left1, right1, left_on='key', right_on='group_val')
Int64Index([], dtype=int64) | Empty DataFrame |
pd.merge(left1, right1, left_on='key', right_on='group_val', how='outer')
key | value | group_val | |
---|---|---|---|
0 | a | 0 | NaN |
1 | a | 2 | NaN |
2 | a | 3 | NaN |
3 | b | 1 | NaN |
4 | b | 4 | NaN |
5 | c | 5 | NaN |
6 | NaN | NaN | 3.5 |
7 | NaN | NaN | 7.0 |
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
'key2': [2000, 2001, 2002, 2001, 2002],
'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
[2001, 2000, 2000, 2000, 2001, 2002]],
columns=['event1', 'event2'])
lefth
data | key1 | key2 | |
---|---|---|---|
0 | 0 | Ohio | 2000 |
1 | 1 | Ohio | 2001 |
2 | 2 | Ohio | 2002 |
3 | 3 | Nevada | 2001 |
4 | 4 | Nevada | 2002 |
righth
event1 | event2 | ||
---|---|---|---|
Nevada | 2001 | 0 | 1 |
2000 | 2 | 3 | |
Ohio | 2000 | 4 | 5 |
2000 | 6 | 7 | |
2001 | 8 | 9 | |
2002 | 10 | 11 |
# key1, key2와 right_index가 같으니 됨
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
data | key1 | key2 | event1 | event2 | |
---|---|---|---|---|---|
0 | 0 | Ohio | 2000 | 4 | 5 |
0 | 0 | Ohio | 2000 | 6 | 7 |
1 | 1 | Ohio | 2001 | 8 | 9 |
2 | 2 | Ohio | 2002 | 10 | 11 |
3 | 3 | Nevada | 2001 | 0 | 1 |
# left_on 키 개수(1개)가 right 색인의 레벨 수(2개)와 맞지 않아서 AssertionError!
pd.merge(lefth, righth, left_on=['key1'], right_index=True)
--------------------------------------------------------------------------- AssertionError Traceback (most recent call last) <ipython-input-179-83621148f53a> in <module>() 1 # key1과 right_index는 다르다. key2에 해당하는 연도가 없어서 Error! ----> 2 pd.merge(lefth, righth, left_on=['key1'], right_index=True) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy) 34 right_on=right_on, left_index=left_index, 35 right_index=right_index, sort=sort, suffixes=suffixes, ---> 36 copy=copy) 37 return op.get_result() 38 if __debug__: /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy) 178 (self.left_join_keys, 179 self.right_join_keys, --> 180 self.join_names) = self._get_merge_keys() 181 182 def get_result(self): /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _get_merge_keys(self) 296 left_keys, right_keys 297 """ --> 298 self._validate_specification() 299 300 left_keys = [] /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _validate_specification(self) 406 if self.right_index: 407 if not ((len(self.left_on) == self.right.index.nlevels)): --> 408 raise AssertionError() 409 self.right_on = [None] * n 410 elif self.right_on is not None: AssertionError:
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, how='outer')
data | key1 | key2 | event1 | event2 | |
---|---|---|---|---|---|
0 | 0 | Ohio | 2000 | 4 | 5 |
0 | 0 | Ohio | 2000 | 6 | 7 |
1 | 1 | Ohio | 2001 | 8 | 9 |
2 | 2 | Ohio | 2002 | 10 | 11 |
3 | 3 | Nevada | 2001 | 0 | 1 |
4 | 4 | Nevada | 2002 | NaN | NaN |
4 | NaN | Nevada | 2000 | 2 | 3 |
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.,], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
left2
Ohio | Nevada | |
---|---|---|
a | 1 | 2 |
c | 3 | 4 |
e | 5 | 6 |
right2
Missouri | Alabama | |
---|---|---|
b | 7 | 8 |
c | 9 | 10 |
d | 11 | 12 |
e | 13 | 14 |
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
a | 1 | 2 | NaN | NaN |
b | NaN | NaN | 7 | 8 |
c | 3 | 4 | 9 | 10 |
d | NaN | NaN | 11 | 12 |
e | 5 | 6 | 13 | 14 |
pd.merge(left2, right2, left_index=True, right_index=True)
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
c | 3 | 4 | 9 | 10 |
e | 5 | 6 | 13 | 14 |
# 위와 똑같은 결과인데 join을 사용하면 훨씬 깔끔하다
left2.join(right2, how='outer')
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
a | 1 | 2 | NaN | NaN |
b | NaN | NaN | 7 | 8 |
c | 3 | 4 | 9 | 10 |
d | NaN | NaN | 11 | 12 |
e | 5 | 6 | 13 | 14 |
# 색인 기준으로 머지할 때 사용!
left2.join(right2, how='inner')
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
c | 3 | 4 | 9 | 10 |
e | 5 | 6 | 13 | 14 |
left1
key | value | |
---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | a | 2 |
3 | a | 3 |
4 | b | 4 |
5 | c | 5 |
right1
group_val | |
---|---|
a | 3.5 |
b | 7.0 |
# 왼쪽 우선 조인이라 c가 있음
left1.join(right1, on='key')
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
1 | b | 1 | 7.0 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2
Ohio | Nevada | |
---|---|---|
a | 1 | 2 |
c | 3 | 4 |
e | 5 | 6 |
right2
Missouri | Alabama | |
---|---|---|
b | 7 | 8 |
c | 9 | 10 |
d | 11 | 12 |
e | 13 | 14 |
another
New York | Oregon | |
---|---|---|
a | 7 | 8 |
c | 9 | 10 |
e | 11 | 12 |
f | 16 | 17 |
left2.join([right2, another])
Ohio | Nevada | Missouri | Alabama | New York | Oregon | |
---|---|---|---|---|---|---|
a | 1 | 2 | NaN | NaN | 7 | 8 |
c | 3 | 4 | 9 | 10 | 9 | 10 |
e | 5 | 6 | 13 | 14 | 11 | 12 |
left2
Ohio | Nevada | |
---|---|---|
a | 1 | 2 |
c | 3 | 4 |
e | 5 | 6 |
right2
Missouri | Alabama | |
---|---|---|
b | 7 | 8 |
c | 9 | 10 |
d | 11 | 12 |
e | 13 | 14 |
another
New York | Oregon | |
---|---|---|
a | 7 | 8 |
c | 9 | 10 |
e | 11 | 12 |
f | 16 | 17 |
left2.join([right2, another], how='outer')
Ohio | Nevada | Missouri | Alabama | New York | Oregon | |
---|---|---|---|---|---|---|
a | 1 | 2 | NaN | NaN | 7 | 8 |
b | NaN | NaN | 7 | 8 | NaN | NaN |
c | 3 | 4 | 9 | 10 | 9 | 10 |
d | NaN | NaN | 11 | 12 | NaN | NaN |
e | 5 | 6 | 13 | 14 | 11 | 12 |
f | NaN | NaN | NaN | NaN | 16 | 17 |
arr = np.arange(12).reshape((3, 4))
arr
array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]])
type(arr)
numpy.ndarray
# axis = column
np.concatenate([arr, arr], axis=1)
array([[ 0, 1, 2, 3, 0, 1, 2, 3], [ 4, 5, 6, 7, 4, 5, 6, 7], [ 8, 9, 10, 11, 8, 9, 10, 11]])
np.concatenate([arr, arr], axis=0)
array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11], [ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]])
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
s1
a 0 b 1 dtype: int64
s2
c 2 d 3 e 4 dtype: int64
s3
f 5 g 6 dtype: int64
pd.concat([s1, s2, s3])
a 0 b 1 c 2 d 3 e 4 f 5 g 6 dtype: int64
# Series인 것을 확인할 수 있다.
type( pd.concat([s1, s2, s3]) )
pandas.core.series.Series
pd.concat([s1, s2, s3], axis=1)
0 | 1 | 2 | |
---|---|---|---|
a | 0 | NaN | NaN |
b | 1 | NaN | NaN |
c | NaN | 2 | NaN |
d | NaN | 3 | NaN |
e | NaN | 4 | NaN |
f | NaN | NaN | 5 |
g | NaN | NaN | 6 |
# DataFrame인 것을 확인할 수 있다
type( pd.concat([s1, s2, s3], axis=1) )
pandas.core.frame.DataFrame
s4 = pd.concat([s1 * 5, s3])
s1
a 0 b 1 dtype: int64
s1 * 5
a 0 b 5 dtype: int64
s3
f 5 g 6 dtype: int64
s4
a 0 b 5 f 5 g 6 dtype: int64
pd.concat([s1, s4], axis=1)
0 | 1 | |
---|---|---|
a | 0 | 0 |
b | 1 | 5 |
f | NaN | 5 |
g | NaN | 6 |
pd.concat([s1, s4], axis=1, join='inner')
0 | 1 | |
---|---|---|
a | 0 | 0 |
b | 1 | 5 |
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])
0 | 1 | |
---|---|---|
a | 0 | 0 |
c | NaN | NaN |
b | 1 | 5 |
e | NaN | NaN |
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
s1
a 0 b 1 dtype: int64
s2
c 2 d 3 e 4 dtype: int64
s3
f 5 g 6 dtype: int64
# keys에 의해 첫 번째 s1 = one, 두 번째 s1 = two, s3 = three로 할당 (s2가 아니라 s1이 두 번 사용되었다)
result
one a 0 b 1 two a 0 b 1 three f 5 g 6 dtype: int64
# unstack 함수에 대한 자세한 내용은 나중에 알아보자
result.unstack()
a | b | f | g | |
---|---|---|---|---|
one | 0 | 1 | NaN | NaN |
two | 0 | 1 | NaN | NaN |
three | NaN | NaN | 5 | 6 |
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
one | two | three | |
---|---|---|---|
a | 0 | NaN | NaN |
b | 1 | NaN | NaN |
c | NaN | 2 | NaN |
d | NaN | 3 | NaN |
e | NaN | 4 | NaN |
f | NaN | NaN | 5 |
g | NaN | NaN | 6 |
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
columns=['three', 'four'])
df1
one | two | |
---|---|---|
a | 0 | 1 |
b | 2 | 3 |
c | 4 | 5 |
df2
three | four | |
---|---|---|
a | 5 | 6 |
c | 7 | 8 |
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
level1 | level2 | |||
---|---|---|---|---|
one | two | three | four | |
a | 0 | 1 | 5 | 6 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7 | 8 |
pd.concat({'level1': df1, 'level2': df2}, axis=1)
level1 | level2 | |||
---|---|---|---|---|
one | two | three | four | |
a | 0 | 1 | 5 | 6 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7 | 8 |
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
names=['upper', 'lower'])
upper | level1 | level2 | ||
---|---|---|---|---|
lower | one | two | three | four |
a | 0 | 1 | 5 | 6 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7 | 8 |
# Concatenate pandas objects along a particular axis with optional set logic
# along the other axes. Can also add a layer of hierarchical indexing on the
# concatenation axis, which may be useful if the labels are the same (or
# overlapping) on the passed axis number
pd.concat?
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
names=['upper'])
upper | level1 | level2 | ||
---|---|---|---|---|
one | two | three | four | |
a | 0 | 1 | 5 | 6 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7 | 8 |
# names의 길이는 생성되는 계층 색인의 레벨 수와 같아야 한다. 현재 df에서는 2개.
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
names=['upper', 'lower', 'test'])
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-212-d81fd2f4ba86> in <module>() 1 pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], ----> 2 names=['upper', 'lower', 'test']) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity) 883 ignore_index=ignore_index, join=join, 884 keys=keys, levels=levels, names=names, --> 885 verify_integrity=verify_integrity) 886 return op.get_result() 887 /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity) 958 self.verify_integrity = verify_integrity 959 --> 960 self.new_axes = self._get_new_axes() 961 962 def get_result(self): /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _get_new_axes(self) 1160 concat_axis = None 1161 else: -> 1162 concat_axis = self._get_concat_axis() 1163 1164 new_axes[self.axis] = concat_axis /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _get_concat_axis(self) 1195 else: 1196 concat_axis = _make_concat_multiindex(indexes, self.keys, -> 1197 self.levels, self.names) 1198 1199 self._maybe_check_integrity(concat_axis) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/merge.pyc in _make_concat_multiindex(indexes, keys, levels, names) 1274 names = names + _get_consensus_names(indexes) 1275 -> 1276 return MultiIndex(levels=levels, labels=label_list, names=names) 1277 1278 new_index = indexes[0] /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/index.pyc in __new__(cls, levels, labels, 
sortorder, names, copy) 1599 1600 if names is not None: -> 1601 subarr._set_names(names) 1602 1603 /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/index.pyc in _set_names(self, values) 1783 if len(values) != self.nlevels: 1784 raise ValueError('Length of names (%d) must be same as level ' -> 1785 '(%d)' % (len(values),self.nlevels)) 1786 # set the name 1787 for name, level in zip(values, self.levels): ValueError: Length of names (4) must be same as level (2)
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
df1
a | b | c | d | |
---|---|---|---|---|
0 | 0.778910 | 0.311274 | 1.141611 | -2.813149 |
1 | -0.360643 | -0.794361 | 2.023458 | -0.449211 |
2 | -1.677890 | -0.345416 | 0.718920 | 0.171574 |
df2
b | d | a | |
---|---|---|---|
0 | -2.771987 | 1.315566 | 1.019023 |
1 | -0.435467 | -1.595407 | -0.038993 |
# index 부분이 하나로 합쳐짐
pd.concat([df1, df2], ignore_index=True)
a | b | c | d | |
---|---|---|---|---|
0 | 0.778910 | 0.311274 | 1.141611 | -2.813149 |
1 | -0.360643 | -0.794361 | 2.023458 | -0.449211 |
2 | -1.677890 | -0.345416 | 0.718920 | 0.171574 |
3 | 1.019023 | -2.771987 | NaN | 1.315566 |
4 | -0.038993 | -0.435467 | NaN | -1.595407 |
pd.concat([df1, df2])
a | b | c | d | |
---|---|---|---|---|
0 | 0.778910 | 0.311274 | 1.141611 | -2.813149 |
1 | -0.360643 | -0.794361 | 2.023458 | -0.449211 |
2 | -1.677890 | -0.345416 | 0.718920 | 0.171574 |
0 | 1.019023 | -2.771987 | NaN | 1.315566 |
1 | -0.038993 | -0.435467 | NaN | -1.595407 |
인자 | 설명 |
---|---|
objs | 이어붙일 pandas 객체의 사전이나 리스트. 필수 인자 |
axis | 이어붙일 축 방향. 기본값은 0 |
join | 조인 방식. 'inner'(내부조인, 교집합)와 'outer'(외부조인, 합집합)가 있으며 기본값은 'outer' |
join_axes | 합집합/교집합을 수행하는 대신 다른 n-1 축으로 사용할 색인을 지정한다. |
keys | 이어붙일 객체나 이어붙인 축에 대한 계층 색인을 생성하는 데 연관된 값이다. 리스트나 임의의 값이 들어있는 배열, 튜플의 배열 또는 배열의 리스트(levels 옵션에 다차원 배열이 넘어온 경우)가 될 수 있다. |
levels | 계층 색인 레벨로 사용할 색인을 지정한다. keys가 넘어온 경우 여러 개의 색인을 지정한다. |
names | keys나 levels 혹은 둘 다 있을 경우, 생성된 계층 레벨을 위한 이름 |
verify_integrity | 이어붙인 객체에 중복되는 축이 있는지 검사하고 있다면 예외를 발생시킨다. 기본값은 False로, 중복을 허용한다. |
ignore_index | 이어붙인 축의 색인을 유지하지 않고 range(total_length)로 새로운 색인을 생성한다. |
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
a
f NaN e 2.5 d NaN c 3.5 b 4.5 a NaN dtype: float64
np.arange(len(a))
array([0, 1, 2, 3, 4, 5])
b
f 0 e 1 d 2 c 3 b 4 a NaN dtype: float64
# 오랜만에 나와서 np.where가 어떤 기능인지 까먹었었다!
# pd.isnull(a)는 null인 원소에 대해 True를 돌려준다.
# 삼항 연산자처럼 조건이 True(즉 a가 null)이면 b값을 대입하고
# 조건이 False(즉 a가 null이 아니면)면 a값을 대입한다.
np.where(pd.isnull(a), b, a)
array([ 0. , 2.5, 2. , 3.5, 4.5, nan])
pd.isnull(a)
f True e False d True c False b False a True dtype: bool
b[:-2]
f 0 e 1 d 2 c 3 dtype: float64
a[2:]
d NaN c 3.5 b 4.5 a NaN dtype: float64
b[:-2].combine_first(a[2:])
a NaN b 4.5 c 3.0 d 2.0 e 1.0 f 0.0 dtype: float64
b[:-2].combine_first
<bound method Series.combine_first of f 0 e 1 d 2 c 3 dtype: float64>
b[:-2]
f 0 e 1 d 2 c 3 dtype: float64
b.combine_first(a)
f 0 e 1 d 2 c 3 b 4 a NaN dtype: float64
# a의 값이 null(NaN)일 때만 b 값을 참조한다.
# a를 우선 사용하되 null이라면 b값 참조
a.combine_first(b)
f 0.0 e 2.5 d 2.0 c 3.5 b 4.5 a NaN dtype: float64
a
f NaN e 2.5 d NaN c 3.5 b 4.5 a NaN dtype: float64
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
'b': [np.nan, 2., np.nan, 6.],
'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
'b': [np.nan, 3., 4., 6., 8.]})
df1
a | b | c | |
---|---|---|---|
0 | 1 | NaN | 2 |
1 | NaN | 2 | 6 |
2 | 5 | NaN | 10 |
3 | NaN | 6 | 14 |
df2
a | b | |
---|---|---|
0 | 5 | NaN |
1 | 4 | 3 |
2 | NaN | 4 |
3 | 3 | 6 |
4 | 7 | 8 |
df1.combine_first(df2)
a | b | c | |
---|---|---|---|
0 | 1 | NaN | 2 |
1 | 4 | 2 | 6 |
2 | 5 | 4 | 10 |
3 | 3 | 6 | 14 |
4 | 7 | 8 | NaN |
# 문자열이 담긴 배열을 로우와 칼럼의 색인으로하는 작은 DataFrame
data = DataFrame(np.arange(6).reshape((2, 3)),
index=pd.Index(['Ohio', 'Colorado'], name='state'),
columns=pd.Index(['one', 'two', 'three'], name='number'))
data
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
# 문자열이 담긴 배열을 로우와 칼럼의 색인으로하는 작은 DataFrame
# pd.Index로 index를 설정 후에 name을 입력하기 위해서 저렇게 한 것.
# 기본적으로는 columns에 list만 넘겨도 생성 됨
data2 = DataFrame(np.arange(6).reshape((2, 3)),
index=pd.Index(['Ohio', 'Colorado'], name='state'),
columns=['one', 'two', 'three'])
data2
one | two | three | |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
result = data.stack()
result
state number Ohio one 0 two 1 three 2 Colorado one 3 two 4 three 5 dtype: int64
type(result)
pandas.core.series.Series
# 보통 가장 안쪽에 있는 one, two, three부터 컬럼으로 끄집어 냄
result.unstack()
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
type( result.unstack() )
pandas.core.frame.DataFrame
result.unstack(0)
state | Ohio | Colorado |
---|---|---|
number | ||
one | 0 | 3 |
two | 1 | 4 |
three | 2 | 5 |
result.unstack('state')
state | Ohio | Colorado |
---|---|---|
number | ||
one | 0 | 3 |
two | 1 | 4 |
three | 2 | 5 |
result.unstack(1)
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
result.unstack('number')
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
s1
a 0 b 1 c 2 d 3 dtype: int64
s2
c 4 d 5 e 6 dtype: int64
data3 = pd.concat([s1, s2])
data3
a 0 b 1 c 2 d 3 c 4 d 5 e 6 dtype: int64
# index로 s1을 one, s2를 two로 할당
data2
one a 0 b 1 c 2 d 3 two c 4 d 5 e 6 dtype: int64
data2.unstack()
a | b | c | d | e | |
---|---|---|---|---|---|
one | 0 | 1 | 2 | 3 | NaN |
two | NaN | NaN | 4 | 5 | 6 |
# 바꿨다 원상복구... 근데 null값 포함하지 않는다는 게 point!
data2.unstack().stack()
one a 0 b 1 c 2 d 3 two c 4 d 5 e 6 dtype: float64
data2
one a 0 b 1 c 2 d 3 two c 4 d 5 e 6 dtype: int64
data2.unstack().stack(dropna=False)
one a 0 b 1 c 2 d 3 e NaN two a NaN b NaN c 4 d 5 e 6 dtype: float64
df = DataFrame({'left': result, 'right': result + 5},
columns=pd.Index(['left', 'right'], name='side'))
df
side | left | right | |
---|---|---|---|
state | number | ||
Ohio | one | 0 | 5 |
two | 1 | 6 | |
three | 2 | 7 | |
Colorado | one | 3 | 8 |
two | 4 | 9 | |
three | 5 | 10 |
result
state number Ohio one 0 two 1 three 2 Colorado one 3 two 4 three 5 dtype: int64
# unstack 할 때 결과에서 가장 낮은 레벨. 즉, side의 하위로 갔다.
df.unstack('state')
side | left | right | ||
---|---|---|---|---|
state | Ohio | Colorado | Ohio | Colorado |
number | ||||
one | 0 | 3 | 5 | 8 |
two | 1 | 4 | 6 | 9 |
three | 2 | 5 | 7 | 10 |
# side도 number의 하위로 갔다.
df.unstack('state').stack('side')
state | Ohio | Colorado | |
---|---|---|---|
number | side | ||
one | left | 0 | 3 |
right | 5 | 8 | |
two | left | 1 | 4 |
right | 6 | 9 | |
three | left | 2 | 5 |
right | 7 | 10 |
date, item, value
1959-03-31 00:00:00, realgdp, 2710.349
1959-03-31 00:00:00, infl, 0.000
1959-03-31 00:00:00, unemp, 5.800
1959-06-30 00:00:00, realgdp, 2778.801
1959-06-30 00:00:00, infl, 2.340
1959-06-30 00:00:00, unemp, 5.100
1959-09-30 00:00:00, realgdp, 2775.488
1959-09-30 00:00:00, infl, 2.740
1959-09-30 00:00:00, unemp, 5.300
1959-12-31 00:00:00, realgdp, 2785.204
%%writefile ch07/pivot.csv
date, item, value
1959-03-31 00:00:00, realgdp, 2710.349
1959-03-31 00:00:00, infl, 0.000
1959-03-31 00:00:00, unemp, 5.800
1959-06-30 00:00:00, realgdp, 2778.801
1959-06-30 00:00:00, infl, 2.340
1959-06-30 00:00:00, unemp, 5.100
1959-09-30 00:00:00, realgdp, 2775.488
1959-09-30 00:00:00, infl, 2.740
1959-09-30 00:00:00, unemp, 5.300
1959-12-31 00:00:00, realgdp, 2785.204
Overwriting ch07/pivot.csv
# header를 0으로 설정하겠다는건데.. 기본값.
# 계속 pivot 할 때 에러가 난다.
# names로 꼭 설정해야 하나보다. pivot이 인식하는 것은 names로 setting된 값인듯.
# 열과 행을 바꿔야 하기 때문에 이미 인식하고 있는 값이 없으면 error 뱉는것 같다.
ldata = pd.read_csv('ch07/pivot.csv', header=0)
# 이것 말고는 딱히 다른게 생각나지 않는다.
# csv 파일에서 1번째 줄인 header를 지우면 어떤 컬럼인지 모르니까 차라리 skiprows=1 을 해주는게 낫겠다.
# 6장 처음 부분에 read_csv 옵션들이 있으니 참고
ldata = pd.read_csv('ch07/pivot.csv', skiprows=1, names=['date', 'item', 'value'])
ldata[:10]
date | item | value | |
---|---|---|---|
0 | 1959-03-31 00:00:00 | realgdp | 2710.349 |
1 | 1959-03-31 00:00:00 | infl | 0.000 |
2 | 1959-03-31 00:00:00 | unemp | 5.800 |
3 | 1959-06-30 00:00:00 | realgdp | 2778.801 |
4 | 1959-06-30 00:00:00 | infl | 2.340 |
5 | 1959-06-30 00:00:00 | unemp | 5.100 |
6 | 1959-09-30 00:00:00 | realgdp | 2775.488 |
7 | 1959-09-30 00:00:00 | infl | 2.740 |
8 | 1959-09-30 00:00:00 | unemp | 5.300 |
9 | 1959-12-31 00:00:00 | realgdp | 2785.204 |
type(ldata)
pandas.core.frame.DataFrame
# 1번째 인자: 로우 색인으로 사용될 칼럼 이름
# 2번째 인자: 칼럼 색인으로 사용될 칼럼 이름
# 3번째 인자: DataFrame에 채워 넣을 값을 담고 있는 칼럼
pivoted = ldata.pivot('date', 'item', 'value')
pivoted.head()
item | infl | realgdp | unemp |
---|---|---|---|
date | |||
1959-03-31 00:00:00 | 0.00 | 2710.349 | 5.8 |
1959-06-30 00:00:00 | 2.34 | 2778.801 | 5.1 |
1959-09-30 00:00:00 | 2.74 | 2775.488 | 5.3 |
1959-12-31 00:00:00 | NaN | 2785.204 | NaN |
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]
date | item | value | value2 | |
---|---|---|---|---|
0 | 1959-03-31 00:00:00 | realgdp | 2710.349 | -1.758243 |
1 | 1959-03-31 00:00:00 | infl | 0.000 | -1.163026 |
2 | 1959-03-31 00:00:00 | unemp | 5.800 | -0.997308 |
3 | 1959-06-30 00:00:00 | realgdp | 2778.801 | -1.407402 |
4 | 1959-06-30 00:00:00 | infl | 2.340 | 1.698340 |
5 | 1959-06-30 00:00:00 | unemp | 5.100 | 0.309916 |
6 | 1959-09-30 00:00:00 | realgdp | 2775.488 | 0.759469 |
7 | 1959-09-30 00:00:00 | infl | 2.740 | -0.376500 |
8 | 1959-09-30 00:00:00 | unemp | 5.300 | -0.288409 |
9 | 1959-12-31 00:00:00 | realgdp | 2785.204 | -2.492189 |
# 3번째 인자 생략하면 계층적 색인으로 보여줌
pivoted = ldata.pivot('date', 'item')
pivoted[:5]
value | value2 | |||||
---|---|---|---|---|---|---|
item | infl | realgdp | unemp | infl | realgdp | unemp |
date | ||||||
1959-03-31 00:00:00 | 0.00 | 2710.349 | 5.8 | -1.163026 | -1.758243 | -0.997308 |
1959-06-30 00:00:00 | 2.34 | 2778.801 | 5.1 | 1.698340 | -1.407402 | 0.309916 |
1959-09-30 00:00:00 | 2.74 | 2775.488 | 5.3 | -0.376500 | 0.759469 | -0.288409 |
1959-12-31 00:00:00 | NaN | 2785.204 | NaN | NaN | -2.492189 | NaN |
# Wow! Simple sentence!
# 계층적 색인으로 되어 있는 것을 ['value']로 좁혀주고 5개만 보여준다.
pivoted['value'][:5]
item | infl | realgdp | unemp |
---|---|---|---|
date | |||
1959-03-31 00:00:00 | 0.00 | 2710.349 | 5.8 |
1959-06-30 00:00:00 | 2.34 | 2778.801 | 5.1 |
1959-09-30 00:00:00 | 2.74 | 2775.488 | 5.3 |
1959-12-31 00:00:00 | NaN | 2785.204 | NaN |
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]
value | value2 | |||||
---|---|---|---|---|---|---|
item | infl | realgdp | unemp | infl | realgdp | unemp |
date | ||||||
1959-03-31 00:00:00 | 0.00 | 2710.349 | 5.8 | -1.163026 | -1.758243 | -0.997308 |
1959-06-30 00:00:00 | 2.34 | 2778.801 | 5.1 | 1.698340 | -1.407402 | 0.309916 |
1959-09-30 00:00:00 | 2.74 | 2775.488 | 5.3 | -0.376500 | 0.759469 | -0.288409 |
1959-12-31 00:00:00 | NaN | 2785.204 | NaN | NaN | -2.492189 | NaN |
# 위에서 unstack을 하니 item 로우가 컬럼으로 이동했다.
unstacked = ldata.set_index(['date', 'item'])
# date, item이 로우
# value, value2가 열
unstacked[:7]
value | value2 | ||
---|---|---|---|
date | item | ||
1959-03-31 00:00:00 | realgdp | 2710.349 | -1.758243 |
infl | 0.000 | -1.163026 | |
unemp | 5.800 | -0.997308 | |
1959-06-30 00:00:00 | realgdp | 2778.801 | -1.407402 |
infl | 2.340 | 1.698340 | |
unemp | 5.100 | 0.309916 | |
1959-09-30 00:00:00 | realgdp | 2775.488 | 0.759469 |
# set_index가 로우 설정
# set_index로 설정되지 않은 것들은 모두 열로 이동
unstacked = ldata.set_index(['date'])
unstacked
item | value | value2 | |
---|---|---|---|
date | |||
1959-03-31 00:00:00 | realgdp | 2710.349 | -1.758243 |
1959-03-31 00:00:00 | infl | 0.000 | -1.163026 |
1959-03-31 00:00:00 | unemp | 5.800 | -0.997308 |
1959-06-30 00:00:00 | realgdp | 2778.801 | -1.407402 |
1959-06-30 00:00:00 | infl | 2.340 | 1.698340 |
1959-06-30 00:00:00 | unemp | 5.100 | 0.309916 |
1959-09-30 00:00:00 | realgdp | 2775.488 | 0.759469 |
1959-09-30 00:00:00 | infl | 2.740 | -0.376500 |
1959-09-30 00:00:00 | unemp | 5.300 | -0.288409 |
1959-12-31 00:00:00 | realgdp | 2785.204 | -2.492189 |
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | one | 1 |
2 | one | 2 |
3 | two | 3 |
4 | two | 3 |
5 | two | 4 |
6 | two | 4 |
# 이 구조가 잘 이해 안되면 타이핑 해보세요.
# 파이썬 인터프리터가 답을 알려줍니다.
['one'] * 3 + ['two'] * 4
['one', 'one', 'one', 'two', 'two', 'two', 'two']
data.duplicated()
0 False 1 True 2 False 3 False 4 True 5 False 6 True dtype: bool
data2 = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [1, 1, 2, 3, 3, 3, 4]})
# 2개 열이 모두 같아야 중복으로 인정 됨
data2.duplicated()
0 False 1 True 2 False 3 False 4 True 5 True 6 False dtype: bool
# duplicated 배열이 False인 DataFrame 반환
data.drop_duplicates()
k1 | k2 | |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
data['v1'] = range(7)
data
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
1 | one | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | two | 3 | 4 |
5 | two | 4 | 5 |
6 | two | 4 | 6 |
# 중복 여부를 k1 컬럼만 본다는 이야기
# one, two만 남을 수 밖에 없음
data.drop_duplicates(['k1'])
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
3 | two | 3 | 3 |
data.drop_duplicates(['k1'], take_last=True)
k1 | k2 | v1 | |
---|---|---|---|
2 | one | 2 | 2 |
6 | two | 4 | 6 |
data.drop_duplicates(['k1'], take_last=False)
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
3 | two | 3 | 3 |
data.drop_duplicates(['k1', 'k2'])
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
5 | two | 4 | 5 |
# take_last=False는 5를 선택
# take_last=True는 6을 선택
data.drop_duplicates(['k1', 'k2'], take_last=True)
k1 | k2 | v1 | |
---|---|---|---|
1 | one | 1 | 1 |
2 | one | 2 | 2 |
4 | two | 3 | 4 |
6 | two | 4 | 6 |
data.drop_duplicates?
Type: instancemethod
String form:
<bound method DataFrame.drop_duplicates of k1 k2 v1
0 one 1 0
1 one 1 1
2 one 2 2
3 two 3 3
4 two 3 4
5 two 4 5
6 two 4 6>
File: /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.py
Definition: data.drop_duplicates(self, cols=None, take_last=False, inplace=False)
Docstring:
Return DataFrame with duplicate rows removed, optionally only
considering certain columns
Parameters
----------
cols : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns
take_last : boolean, default False
Take the last observed row in a row. Defaults to the first row
inplace : boolean, default False
Whether to drop duplicates in place or to return a copy
Returns
-------
deduplicated : DataFrame
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
'corned beef', 'Bacon', 'pastrami',
'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food | ounces | |
---|---|---|
0 | bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
# 먼저 data['food']에 어떤 데이터가 있는지 확인
data['food']
0 bacon 1 pulled pork 2 bacon 3 Pastrami 4 corned beef 5 Bacon 6 pastrami 7 honey ham 8 nova lox Name: food, dtype: object
data['food'].map?
map 1번째 인자로 함수를 넘기고, 2번째 인자로 연속된 형태의(리스트, 튜플 같은) 자료형을 넘기면 된다.
Type: builtin_function_or_method
String form: <built-in function map>
Namespace: Python builtin
Docstring:
map(function, sequence[, sequence, ...]) -> list
Return a list of the results of applying the function to the items of
the argument sequence(s). If more than one sequence is given, the
function is called with an argument list consisting of the corresponding
item of each sequence, substituting None for missing values when not all
sequences have the same length. If the function is None, return a list of
the items of the sequence (or a list of tuples if more than one sequence).
data['food'].map
# data['food']에 map 함수를 적용하는데 소문자로 모두 변경하는 함수 적용
data['food'].map(str.lower)
0 bacon 1 pulled pork 2 bacon 3 pastrami 4 corned beef 5 bacon 6 pastrami 7 honey ham 8 nova lox Name: food, dtype: object
# 또 map으로 meat_to_animal dictionary를 넘긴다.
# bacon -> pig로, pastrami -> cow로 변경
data['food'].map(str.lower).map(meat_to_animal)
0 pig 1 pig 2 pig 3 cow 4 cow 5 pig 6 cow 7 pig 8 salmon Name: food, dtype: object
# 변경된 것들을 animal 열을 새로 생성하고 여기에 대입
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
# food열의 대∙소문자는 안 바뀐 것을 알 수 있다.
# 대∙소문자 변환한 건 animal 필드에 적용하기 위한 것임을 확인할 수 있다.
data
food | ounces | animal | |
---|---|---|---|
0 | bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
data['food'].map(lambda x: meat_to_animal[x.lower()])
0 pig 1 pig 2 pig 3 cow 4 cow 5 pig 6 cow 7 pig 8 salmon Name: food, dtype: object
data = Series([1., -999., 2., -999., -1000., 3.])
data
0 1 1 -999 2 2 3 -999 4 -1000 5 3 dtype: float64
data.replace(-999, np.nan)
0 1 1 NaN 2 2 3 NaN 4 -1000 5 3 dtype: float64
# 정말 직관적으로 잘만든것 같다.
# 1개 이상을 변경하려면 list로 넘기면 모두 알아서 변경해준다.
data.replace([-999, -1000], np.nan)
0 1 1 NaN 2 2 3 NaN 4 NaN 5 3 dtype: float64
# 변경할 값만 list로 넘기라는 법 없음
# 변경하고 싶은 값도 list로 넘기면 순서에 맞게끔 변경해 줌
data.replace([-999, -1000], [999, 1000])
0 1 1 999 2 2 3 999 4 1000 5 3 dtype: float64
data.replace([-999, -1000], [np.nan, 0])
0 1 1 NaN 2 2 3 NaN 4 0 5 3 dtype: float64
# 사전으로 넘겨도 된다.
# 정말 편리하게 만들었다.
data.replace({-999: np.nan, -1000: 0})
0 1 1 NaN 2 2 3 NaN 4 0 5 3 dtype: float64
data = DataFrame(np.arange(12).reshape((3, 4)),
index = ['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
data.index.map(str.upper)
array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)
data.index = data.index.map(str.upper)
data
one | two | three | four | |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
data.rename(index=str.title, columns=str.upper)
ONE | TWO | THREE | FOUR | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
# 위에서 index를 대문자로 바꿔줬다.
# rename을 사용해서 원본은 바뀌지 않았다.
data
one | two | three | four | |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
data.rename(index={'OHIO': 'INDIANA'},
columns={'three': 'peekaboo'})
one | two | peekaboo | four | |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
data
one | two | three | four | |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
# 항상 DataFrame의 참조를 반환한다.
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
# OHIO -> INDIANA로 바뀌었다.
data
one | two | three | four | |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLORADO | 4 | 5 | 6 | 7 |
NEW YORK | 8 | 9 | 10 | 11 |
ages = [20, 22, 25, 27, 21, 23, 27, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# 1번째 인자: 나눌 list
# 2번째 인자: 나눌 기준
cats = pd.cut(ages, bins)
pd.cut?
Type: function
String form: <function cut at 0x109e2acf8>
File: /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/tools/tile.py
Definition: pd.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False)
Docstring:
Return indices of half-open bins to which each value of `x` belongs.
Parameters
----------
x : array-like
Input array to be binned. It has to be 1-dimensional.
bins : int or sequence of scalars
If `bins` is an int, it defines the number of equal-width bins in the
range of `x`. However, in this case, the range of `x` is extended
by .1% on each side to include the min or max values of `x`. If
`bins` is a sequence it defines the bin edges allowing for
non-uniform bin width. No extension of the range of `x` is done in
this case.
right : bool, optional
Indicates whether the bins include the rightmost edge or not. If
right == True (the default), then the bins [1,2,3,4] indicate
(1,2], (2,3], (3,4].
labels : array or boolean, default None
Labels to use for bin edges, or False to return integer bin labels
retbins : bool, optional
Whether to return the bins or not. Can be useful if bins is given
as a scalar.
Returns
-------
out : Categorical or array of integers if labels is False
bins : ndarray of floats
Returned only if `retbins` is True.
Notes
-----
The `cut` function can be useful for going from a continuous variable to
a categorical variable. For example, `cut` could convert ages to groups
of age ranges.
Any NA values will be NA in the result. Out of bounds values will be NA in
the resulting Categorical object
Examples
--------
>>> cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
(array([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533],
(6.533, 9.7], (0.191, 3.367]], dtype=object),
array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ]))
>>> cut(np.ones(5), 4, labels=False)
array([2, 2, 2, 2, 2])
cats
Categorical: [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], (18, 25], (25, 35], (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]] Levels (4): Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype=object)
cats.labels
array([0, 0, 0, 1, 0, 0, 1, 1, 3, 2, 2, 1])
cats.levels
Index([u'(18, 25]', u'(25, 35]', u'(35, 60]', u'(60, 100]'], dtype=object)
pd.value_counts(cats)
(18, 25] 5 (25, 35] 4 (35, 60] 2 (60, 100] 1 dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
Categorical: [[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), [18, 26), [26, 36), [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)] Levels (4): Index(['[18, 26)', '[26, 36)', '[36, 61)', '[61, 100)'], dtype=object)
bins
[18, 25, 35, 60, 100]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
Categorical: [Youth, Youth, Youth, YoungAdult, Youth, Youth, YoungAdult, YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult] Levels (4): Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype=object)
data = np.random.randn(20)
data
array([ 0.21900547, 0.8840372 , -0.84813527, 1.79066963, 1.20818628, 0.50816038, 0.5384968 , -0.46679507, 0.20554364, 0.40605808, 1.38356295, -0.0487796 , -1.86564376, 0.0567211 , 0.00651676, -0.93895591, -0.74396268, 0.2366114 , 0.62541814, -1.38886959])
# 1번째 인자: data, 2번째 인자: 그룹의 갯수, precision: 구간 경계값 표시 소수점 자릿수
pd.cut(data, 4, precision=2)
Categorical: [(-0.037, 0.88], (0.88, 1.79], (-0.95, -0.037], (0.88, 1.79], (0.88, 1.79], (-0.037, 0.88], (-0.037, 0.88], (-0.95, -0.037], (-0.037, 0.88], (-0.037, 0.88], (0.88, 1.79], (-0.95, -0.037], (-1.87, -0.95], (-0.037, 0.88], (-0.037, 0.88], (-0.95, -0.037], (-0.95, -0.037], (-0.037, 0.88], (-0.037, 0.88], (-1.87, -0.95]] Levels (4): Index(['(-1.87, -0.95]', '(-0.95, -0.037]', '(-0.037, 0.88]', '(0.88, 1.79]'], dtype=object)
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats
Categorical: [(-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 
0.0816], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], [-3.176, 
-0.608], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.744, 
2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], 
[-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], 
(-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], 
(-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 
2.956], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (-0.608, 0.0816], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], 
(0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.0816, 0.744], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], [-3.176, -0.608], (0.0816, 0.744], (0.744, 2.956], [-3.176, -0.608], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], [-3.176, -0.608], (0.744, 2.956], (-0.608, 0.0816], (0.744, 2.956], (0.0816, 0.744], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (0.0816, 0.744], (-0.608, 0.0816], (-0.608, 0.0816], [-3.176, -0.608], (0.744, 2.956], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956], (0.0816, 0.744], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], [-3.176, -0.608], (-0.608, 0.0816], [-3.176, -0.608], (-0.608, 0.0816], (0.744, 2.956], (0.744, 2.956]] Levels (4): Index(['[-3.176, -0.608]', '(-0.608, 0.0816]', '(0.0816, 0.744]', '(0.744, 2.956]'], dtype=object)
pd.value_counts(cats)
(-0.608, 0.0816] 250 (0.744, 2.956] 250 [-3.176, -0.608] 250 (0.0816, 0.744] 250 dtype: int64
data2 = pd.cut(data, 4, precision=2)
# qcut과 비교를 위해
pd.value_counts(data2)
(-0.11, 1.42] 496 (-1.64, -0.11] 375 (1.42, 2.96] 83 (-3.18, -1.64] 46 dtype: int64
cats2 = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
cats2
Categorical: [(-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (1.361, 2.956], (1.361, 2.956], [-3.176, -1.259], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], [-3.176, -1.259], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], 
(-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], 
(0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], [-3.176, -1.259], (0.0816, 1.361], (1.361, 2.956], (1.361, 2.956], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 
0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (1.361, 2.956], (1.361, 2.956], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], [-3.176, -1.259], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 
0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (1.361, 2.956], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 
0.0816], [-3.176, -1.259], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], 
(-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 
0.0816], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], [-3.176, -1.259], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], [-3.176, -1.259], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (1.361, 2.956], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], [-3.176, -1.259], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], 
(-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (1.361, 2.956], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (1.361, 2.956], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], (0.0816, 1.361], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], [-3.176, -1.259], [-3.176, -1.259], (-1.259, 0.0816], [-3.176, -1.259], (1.361, 2.956], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (-1.259, 0.0816], (1.361, 2.956], (0.0816, 1.361], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], (0.0816, 1.361], [-3.176, -1.259], (0.0816, 1.361], (-1.259, 0.0816], (-1.259, 0.0816], [-3.176, -1.259], (0.0816, 1.361], (1.361, 2.956], (-1.259, 0.0816], (-1.259, 0.0816], (0.0816, 1.361], (1.361, 2.956], (0.0816, 1.361], (-1.259, 0.0816], [-3.176, -1.259], (-1.259, 0.0816], (1.361, 2.956], [-3.176, -1.259], (-1.259, 0.0816], (-1.259, 0.0816], (-1.259, 0.0816], (1.361, 2.956], (1.361, 2.956]] Levels (4): Index(['[-3.176, -1.259]', '(-1.259, 0.0816]', '(0.0816, 1.361]', '(1.361, 2.956]'], dtype=object)
pd.value_counts(cats2)
(0.0816, 1.361] 400 (-1.259, 0.0816] 400 [-3.176, -1.259] 100 (1.361, 2.956] 100 dtype: int64
np.random.seed?
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | -0.067684 | 0.067924 | 0.025598 | -0.002298 |
std | 0.998035 | 0.992106 | 1.006835 | 0.996794 |
min | -3.428254 | -3.548824 | -3.184377 | -3.745356 |
25% | -0.774890 | -0.591841 | -0.641675 | -0.644144 |
50% | -0.116401 | 0.101143 | 0.002073 | -0.013611 |
75% | 0.616366 | 0.780282 | 0.680391 | 0.654328 |
max | 3.366626 | 2.653656 | 3.260383 | 3.927528 |
col = data[3]
col[np.abs(col) > 3]
97 3.927528 305 -3.399312 400 -3.745356 Name: 3, dtype: float64
data[np.abs(data) > 3]
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000 entries, 0 to 999 Data columns (total 4 columns): 0 2 non-null values 1 1 non-null values 2 5 non-null values 3 3 non-null values dtypes: float64(4)
data[(np.abs(data) > 3)]
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000 entries, 0 to 999 Data columns (total 4 columns): 0 2 non-null values 1 1 non-null values 2 5 non-null values 3 3 non-null values dtypes: float64(4)
# Select every row containing at least one value with |x| > 3:
# any(1) reduces the boolean frame along axis=1 (across the columns).
data[(np.abs(data) > 3).any(1)]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
5 | -0.539741 | 0.476985 | 3.248944 | -1.021228 |
97 | -0.774363 | 0.552936 | 0.106061 | 3.927528 |
102 | -0.655054 | -0.565230 | 3.176873 | 0.959533 |
305 | -2.315555 | 0.457246 | -0.025907 | -3.399312 |
324 | 0.050188 | 1.951312 | 3.260383 | 0.963301 |
400 | 0.146326 | 0.508391 | -0.196713 | -3.745356 |
499 | -0.293333 | -0.242459 | -3.056990 | 1.918403 |
523 | -3.428254 | -0.296336 | -0.439938 | -0.867165 |
586 | 0.275144 | 1.179227 | -3.184377 | 1.369891 |
808 | -0.362528 | -3.548824 | 1.553205 | -2.186301 |
900 | 3.366626 | -2.372214 | 0.851010 | 1.332846 |
# np.sign extracts only the sign of each value: -1, 0, or 1.
np.sign(data[2])
0 -1 1 1 2 1 3 1 4 1 5 1 6 1 7 -1 8 -1 9 -1 10 -1 11 1 12 1 13 1 14 1 ... 985 1 986 1 987 1 988 -1 989 1 990 -1 991 1 992 -1 993 -1 994 1 995 -1 996 -1 997 1 998 -1 999 1 Name: 2, Length: 1000, dtype: float64
# Cap outliers: every value with |x| > 3 is replaced by +3 or -3,
# preserving the original sign via np.sign (elementwise -1/0/+1).
# Afterwards no row exceeds 3 in magnitude, as the >= 3 check below shows.
data[np.abs(data) > 3] = np.sign(data) * 3
data[(np.abs(data) >= 3).any(1)]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
5 | -0.539741 | 0.476985 | 3.000000 | -1.021228 |
97 | -0.774363 | 0.552936 | 0.106061 | 3.000000 |
102 | -0.655054 | -0.565230 | 3.000000 | 0.959533 |
305 | -2.315555 | 0.457246 | -0.025907 | -3.000000 |
324 | 0.050188 | 1.951312 | 3.000000 | 0.963301 |
400 | 0.146326 | 0.508391 | -0.196713 | -3.000000 |
499 | -0.293333 | -0.242459 | -3.000000 | 1.918403 |
523 | -3.000000 | -0.296336 | -0.439938 | -0.867165 |
586 | 0.275144 | 1.179227 | -3.000000 | 1.369891 |
808 | -0.362528 | -3.000000 | 1.553205 | -2.186301 |
900 | 3.000000 | -2.372214 | 0.851010 | 1.332846 |
(np.abs(data) > 3).any?
Type: function
String form: <function any at 0x102729668>
File: /Library/Python/2.7/site-packages/numpy-1.9.0.dev_c50e60d-py2.7-macosx-10.8-x86_64.egg/numpy/core/fromnumeric.py
Definition: any(a, axis=None, out=None, keepdims=False)
Docstring:
Test whether any array element along a given axis evaluates to True.
Returns single boolean unless `axis` is not ``None``
Parameters
----------
a : array_like
Input array or object that can be converted to an array.
axis : None or int or tuple of ints, optional
Axis or axes along which a logical OR reduction is performed.
The default (`axis` = `None`) is perform a logical OR over all
the dimensions of the input array. `axis` may be negative, in
which case it counts from the last to the first axis.
.. versionadded:: 1.7.0
If this is a tuple of ints, a reduction is performed on multiple
axes, instead of a single axis or all the axes as before.
out : ndarray, optional
Alternate output array in which to place the result. It must have
the same shape as the expected output and its type is preserved
(e.g., if it is of type float, then it will remain so, returning
1.0 for True and 0.0 for False, regardless of the type of `a`).
See `doc.ufuncs` (Section "Output arguments") for details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
in the result as dimensions with size one. With this option,
the result will broadcast correctly against the original `arr`.
Returns
-------
any : bool or ndarray
A new boolean or `ndarray` is returned unless `out` is specified,
in which case a reference to `out` is returned.
See Also
--------
ndarray.any : equivalent method
all : Test whether all elements along a given axis evaluate to True.
Notes
-----
Not a Number (NaN), positive infinity and negative infinity evaluate
to `True` because these are not equal to zero.
Examples
--------
>>> np.any([[True, False], [True, True]])
True
>>> np.any([[True, False], [False, False]], axis=0)
array([ True, False], dtype=bool)
>>> np.any([-1, 0, 5])
True
>>> np.any(np.nan)
True
>>> o=np.array([False])
>>> z=np.any([-1, 4, 5], out=o)
>>> z, o
(array([ True], dtype=bool), array([ True], dtype=bool))
>>> # Check now that z is a reference to o
>>> z is o
True
>>> id(z), id(o) # identity of z and o # doctest: +SKIP
(191614240, 191614240)
data[(np.abs(data) > 3).any()]
/Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.py:1853: UserWarning: Boolean Series key will be reindexed to match DataFrame index. "DataFrame index.", UserWarning)
--------------------------------------------------------------------------- IndexingError Traceback (most recent call last) <ipython-input-480-4947ca346c8a> in <module>() ----> 1 data[(np.abs(data) > 3).any()] /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.pyc in __getitem__(self, key) 1821 if isinstance(key, (Series, np.ndarray, list)): 1822 # either boolean or fancy integer index -> 1823 return self._getitem_array(key) 1824 elif isinstance(key, DataFrame): 1825 return self._getitem_frame(key) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.pyc in _getitem_array(self, key) 1857 # _check_bool_indexer will throw exception if Series key cannot 1858 # be reindexed to match DataFrame rows -> 1859 key = _check_bool_indexer(self.index, key) 1860 indexer = key.nonzero()[0] 1861 return self.take(indexer, axis=0, convert=False) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/indexing.pyc in _check_bool_indexer(ax, key) 1213 mask = com.isnull(result.values) 1214 if mask.any(): -> 1215 raise IndexingError('Unalignable boolean Series key provided') 1216 1217 result = result.astype(bool).values IndexingError: Unalignable boolean Series key provided
data[(np.abs(data) > 3).any(0)]
--------------------------------------------------------------------------- IndexingError Traceback (most recent call last) <ipython-input-481-ccc73e6802f0> in <module>() ----> 1 data[(np.abs(data) > 3).any(0)] /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.pyc in __getitem__(self, key) 1821 if isinstance(key, (Series, np.ndarray, list)): 1822 # either boolean or fancy integer index -> 1823 return self._getitem_array(key) 1824 elif isinstance(key, DataFrame): 1825 return self._getitem_frame(key) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.pyc in _getitem_array(self, key) 1857 # _check_bool_indexer will throw exception if Series key cannot 1858 # be reindexed to match DataFrame rows -> 1859 key = _check_bool_indexer(self.index, key) 1860 indexer = key.nonzero()[0] 1861 return self.take(indexer, axis=0, convert=False) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/indexing.pyc in _check_bool_indexer(ax, key) 1213 mask = com.isnull(result.values) 1214 if mask.any(): -> 1215 raise IndexingError('Unalignable boolean Series key provided') 1216 1217 result = result.astype(bool).values IndexingError: Unalignable boolean Series key provided
data[(np.abs(data) > 3).any(2)]
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-485-d9988aeb394e> in <module>() ----> 1 data[(np.abs(data) > 3).any(2)] /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.pyc in any(self, axis, bool_only, skipna, level) 4087 skipna=skipna) 4088 return self._reduce(nanops.nanany, axis=axis, skipna=skipna, -> 4089 numeric_only=bool_only, filter_type='bool') 4090 4091 def all(self, axis=0, bool_only=None, skipna=True, level=None): /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/frame.pyc in _reduce(self, op, axis, skipna, numeric_only, filter_type, **kwds) 4276 def _reduce(self, op, axis=0, skipna=True, numeric_only=None, 4277 filter_type=None, **kwds): -> 4278 axis = self._get_axis_number(axis) 4279 f = lambda x: op(x, axis=axis, skipna=skipna, **kwds) 4280 labels = self._get_agg_axis(axis) /Library/Python/2.7/site-packages/pandas-0.12.0_307_g3a2fe0b-py2.7-macosx-10.8-intel.egg/pandas/core/generic.pyc in _get_axis_number(self, axis) 220 except: 221 pass --> 222 raise ValueError('No axis named %s' % axis) 223 224 def _get_axis_name(self, axis): ValueError: No axis named 2
np.any([[True, False], [True, True]])
True
np.any([[True, False], [True, True]], axis=0)
array([ True, True], dtype=bool)
np.any([[True, False], [False, False]], axis=0)
array([ True, False], dtype=bool)
df = DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)
sampler
array([1, 0, 2, 3, 4])
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
df.take(sampler)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
1 | 4 | 5 | 6 | 7 |
0 | 0 | 1 | 2 | 3 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
# 1. Pass the frame's length to get a random permutation of its row positions.
# 2. df.take reorders the rows by those positions.
# 3. The frame has 5 rows, but [:3] keeps only the first three shuffled
#    positions — i.e. a random sample of 3 rows without replacement.
df.take(np.random.permutation(len(df))[:3])
0 | 1 | 2 | 3 | |
---|---|---|---|---|
4 | 16 | 17 | 18 | 19 |
2 | 8 | 9 | 10 | 11 |
0 | 0 | 1 | 2 | 3 |
df.take(np.random.permutation(len(df)))
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
2 | 8 | 9 | 10 | 11 |
1 | 4 | 5 | 6 | 7 |
# The ordering changes on every call:
# 1. pass the frame's length to draw a fresh random permutation each time.
np.random.permutation(len(df))
array([0, 3, 2, 4, 1])
np.random.permutation(len(df))
array([4, 1, 3, 0, 2])
np.random.permutation(5)
array([0, 3, 4, 1, 2])
bag = np.array([5, 7, -1, 6, 4])
# 1. Draw 10 random integers uniformly in [0, len(bag)) — positions are
#    sampled WITH replacement, so the same index can repeat.
sampler = np.random.randint(0, len(bag), size=10)
sampler
array([1, 3, 2, 4, 2, 2, 1, 0, 0, 3])
draws = bag.take(sampler)
# take() gathers bag's elements at the positions listed in sampler:
# sampler[0] == 1 -> bag[1] == 7
# sampler[1] == 3 -> bag[3] == 6
draws
array([ 7, 6, -1, 4, -1, -1, 7, 5, 5, 6])
# Small frame for the get_dummies examples: 'key' holds the categorical
# values that will be expanded into indicator (dummy) columns below.
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df
data1 | key | |
---|---|---|
0 | 0 | b |
1 | 1 | b |
2 | 2 | a |
3 | 3 | c |
4 | 4 | a |
5 | 5 | b |
# One indicator column per distinct value of df['key']: each row gets 1 in
# the column matching its key and 0 in all the others.
pd.get_dummies(df['key'])
a | b | c | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 |
2 | 1 | 0 | 0 |
3 | 0 | 0 | 1 |
4 | 1 | 0 | 0 |
5 | 0 | 1 | 0 |
df['key']
0 b 1 b 2 a 3 c 4 a 5 b Name: key, dtype: object
dummies = pd.get_dummies(df['key'], prefix='key')
dummies
key_a | key_b | key_c | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 |
2 | 1 | 0 | 0 |
3 | 0 | 0 | 1 |
4 | 1 | 0 | 0 |
5 | 0 | 1 | 0 |
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1 | key_a | key_b | key_c | |
---|---|---|---|---|
0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 1 | 0 |
2 | 2 | 1 | 0 | 0 |
3 | 3 | 0 | 0 | 1 |
4 | 4 | 1 | 0 | 0 |
5 | 5 | 0 | 1 | 0 |
mnames = ['movie_id', 'title', 'genres']
# '::'-separated MovieLens file with no header row; supply the column names.
movies = pd.read_table('ch07/movielens/movies.dat', sep='::',
header=None, names=mnames)
movies[:10]
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
5 | 6 | Heat (1995) | Action|Crime|Thriller |
6 | 7 | Sabrina (1995) | Comedy|Romance |
7 | 8 | Tom and Huck (1995) | Adventure|Children's |
8 | 9 | Sudden Death (1995) | Action |
9 | 10 | GoldenEye (1995) | Action|Adventure|Thriller |
# 이걸 Unique 하게 만들어야 한다.
movies.genres
0 Animation|Children's|Comedy 1 Adventure|Children's|Fantasy 2 Comedy|Romance 3 Comedy|Drama 4 Comedy 5 Action|Crime|Thriller 6 Comedy|Romance 7 Adventure|Children's 8 Action 9 Action|Adventure|Thriller 10 Comedy|Drama|Romance 11 Comedy|Horror 12 Animation|Children's 13 Drama 14 Action|Adventure|Romance ... 3868 Horror 3869 Horror 3870 Horror 3871 Horror 3872 Horror 3873 Comedy 3874 Comedy|Drama 3875 Adventure|Animation|Children's 3876 Action|Drama|Thriller 3877 Thriller 3878 Comedy 3879 Drama 3880 Drama 3881 Drama 3882 Drama|Thriller Name: genres, Length: 3883, dtype: object
# 1. Take each genre string x from movies.genres.
# 2. x.split('|') breaks it into the individual genre names.
# 3. set() removes any duplicates within one movie's list.
# 4. The result is a generator yielding one set of genres per movie.
genre_iter = (set(x.split('|')) for x in movies.genres)
genre_iter
<generator object <genexpr> at 0x10a0efaa0>
set.union?
Type: method_descriptor
String form: <method 'union' of 'set' objects>
Namespace: Python builtin
Docstring:
Return the union of sets as a new set.
(i.e. all elements that are in either set.)
# The * unpacks the generator so set.union receives every per-movie set as
# a separate positional argument and merges them into a single set of all
# genres seen anywhere in the data; sorted() then orders them alphabetically.
# NOTE: the generator is consumed here — it cannot be iterated a second time.
genres = sorted(set.union(*genre_iter))
genres
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
dummies
<class 'pandas.core.frame.DataFrame'> Int64Index: 3883 entries, 0 to 3882 Data columns (total 18 columns): Action 3883 non-null values Adventure 3883 non-null values Animation 3883 non-null values Children's 3883 non-null values Comedy 3883 non-null values Crime 3883 non-null values Documentary 3883 non-null values Drama 3883 non-null values Fantasy 3883 non-null values Film-Noir 3883 non-null values Horror 3883 non-null values Musical 3883 non-null values Mystery 3883 non-null values Romance 3883 non-null values Sci-Fi 3883 non-null values Thriller 3883 non-null values War 3883 non-null values Western 3883 non-null values dtypes: float64(18)
dummies[:10]
Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Fill the indicator frame: for each movie, split its '|'-separated genre
# string and set those genre columns to 1 in the movie's row.
# FIX: .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
# supported label-based indexer and behaves identically for this assignment
# (it is available since pandas 0.11, so this stays compatible here).
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
dummies[:10]
Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.ix[0]
movie_id 1 title Toy Story (1995) genres Animation|Children's|Comedy Genre_Action 0 Genre_Adventure 0 Genre_Animation 1 Genre_Children's 1 Genre_Comedy 1 Genre_Crime 0 Genre_Documentary 0 Genre_Drama 0 Genre_Fantasy 0 Genre_Film-Noir 0 Genre_Horror 0 Genre_Musical 0 Genre_Mystery 0 Genre_Romance 0 Genre_Sci-Fi 0 Genre_Thriller 0 Genre_War 0 Genre_Western 0 Name: 0, dtype: object
movies_windic.ix[1]
movie_id 2 title Jumanji (1995) genres Adventure|Children's|Fantasy Genre_Action 0 Genre_Adventure 1 Genre_Animation 0 Genre_Children's 1 Genre_Comedy 0 Genre_Crime 0 Genre_Documentary 0 Genre_Drama 0 Genre_Fantasy 1 Genre_Film-Noir 0 Genre_Horror 0 Genre_Musical 0 Genre_Mystery 0 Genre_Romance 0 Genre_Sci-Fi 0 Genre_Thriller 0 Genre_War 0 Genre_Western 0 Name: 1, dtype: object
데이터가 이보다 더 크다면 다중 멤버십을 갖는 표시용 변수를 이 방법으로 생성하는 것은 그다지 빠른 방법은 아니다. 빠르게 처리하고 싶다면 DataFrame의 내부를 조작하는 저수준의 함수를 작성해서 사용해야 한다.
get_dummies나 cut 같은 이산 함수를 잘 조합하면 통계 애플리케이션에서 유용하게 사용
values = np.random.rand(10)
values
array([ 0.17582534, 0.18368382, 0.93694953, 0.108632 , 0.46920334, 0.77175794, 0.58726166, 0.38948574, 0.76265434, 0.19847269])
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
(0, 0.2] | (0.2, 0.4] | (0.4, 0.6] | (0.6, 0.8] | (0.8, 1] | |
---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 1 |
3 | 1 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 1 | 0 | 0 |
5 | 0 | 0 | 0 | 1 | 0 |
6 | 0 | 0 | 1 | 0 | 0 |
7 | 0 | 1 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 1 | 0 |
9 | 1 | 0 | 0 | 0 | 0 |
pd.cut(values, bins)
Categorical: [(0, 0.2], (0, 0.2], (0.8, 1], (0, 0.2], (0.4, 0.6], (0.6, 0.8], (0.4, 0.6], (0.2, 0.4], (0.6, 0.8], (0, 0.2]] Levels (5): Index(['(0, 0.2]', '(0.2, 0.4]', '(0.4, 0.6]', '(0.6, 0.8]', '(0.8, 1]'], dtype=object)
val = 'a,b, guido'
# 쉼표로 구분된 문자열 분리
val.split(',')
['a', 'b', ' guido']
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first
'a'
second
'b'
third
'guido'
first + '::' + second + '::' + third
'a::b::guido'
# '합칠문자열'.join(합칠리스트)
'::'.join(pieces)
'a::b::guido'
'guido' in val
True
val.index(',')
1
val.find(':')
-1
val
'a,b, guido'
val.index(':')
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-602-280f8b2856ce> in <module>() ----> 1 val.index(':') ValueError: substring not found
# count는 특정 부분 문자열이 몇 건 발견되었는지 반환
val.count(',')
2
val.replace(',', '::')
'a::b:: guido'
val.replace(',', '')
'ab guido'
인자 | 설명 |
---|---|
count | 문자열에서 겹치지 않는 부분 문자열의 개수를 반환 |
endswith, startswith | 문자열이 주어진 접미사(접두사)로 끝날 경우 True를 반환 |
join | 문자열을 구분자로 하여 다른 문자열을 순서대로 이어붙인다. |
index | 부분 문자열의 첫 번째 글자의 위치를 반환한다. 부분 문자열이 없다면 ValueError 예외가 발생한다. |
find | 첫 번째 부분 문자열의 첫 번째 글자의 위치를 반환한다. index와 유사하지만 부분 문자열이 없을 경우 -1을 반환한다. |
rfind | 마지막 부분 문자열의 첫 번째 글자의 위치를 반환한다. 부분 문자열이 없다면 -1을 반환한다. |
replace | 문자열을 다른 문자열로 치환한다. |
strip, rstrip, lstrip | 개행문자를 포함한 공백문자를 제거한다. lstrip은 문자열의 시작 부분에 있는 공백 문자만을 제거하고, rstrip은 문자열의 마지막 부분에 있는 공백문자만을 제거한다. |
split | 문자열을 구분자를 기준으로 부분 문자열의 리스트로 분리한다. |
lower, upper | 각각 알파벳 문자를 소문자 혹은 대문자로 변환한다. |
ljust, rjust | 문자열을 각각 왼쪽 혹은 오른쪽으로 정렬하고 주어진 길이에서 문자열의 길이를 제외한 곳은 공백문자를 채워 넣어 주어진 길이를 가지는 문자열을 반환한다. |
import re
text = 'foo bar\t baz \tqux'
text
'foo bar\t baz \tqux'
re.split('\s+', text)
['foo', 'bar', 'baz', 'qux']
regex = re.compile('\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
[' ', '\t ', ' \t']
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# Email pattern: local part, '@', domain, '.', then a 2-4 letter TLD.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# This compile step is omitted in the book's text
regex = re.compile(pattern, re.I) # re.IGNORECASE makes the regex case-insensitive.
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
m = regex.search(text)
m
<_sre.SRE_Match at 0x109dbe308>
text[m.start():m.end()]
'dave@google.com'
# 이 문서의 처음은 Dave로 시작하기 때문에 regex와 일치하지 않음
print regex.match(text)
None
print regex.sub('REDACTED', text)
Dave REDACTED Steve REDACTED Rob REDACTED Ryan REDACTED
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
type(m)
_sre.SRE_Match
m.groups()
('wesm', 'bright', 'net')
m.group(0)
'wesm@bright.net'
m.group(1)
'wesm'
m.group(2)
'bright'
m.group(3)
'net'
regex.findall(text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com'), ('ryan', 'yahoo', 'com')]
print regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)
Dave Username: dave, Domain: google, Suffix: com Steve Username: steve, Domain: gmail, Suffix: com Rob Username: rob, Domain: gmail, Suffix: com Ryan Username: ryan, Domain: yahoo, Suffix: com
# Same email pattern, now with named capture groups (username/domain/suffix)
# so matches can be inspected via groupdict(); re.VERBOSE allows the
# multi-line, whitespace-laden pattern layout.
regex = re.compile(r'''
(?P<username>[A-Z0-9._%+-]+)
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})''', flags=re.IGNORECASE | re.VERBOSE)
m = regex.match('wesm@bright.net')
m.groupdict()
{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}
인자 | 설명 |
---|---|
findall, finditer | 문자열에서 겹치지 않는, 발견된 모든 패턴을 반환한다. findall 메서드는 모든 패턴의 리스트를 반환하고, finditer 메서드는 이터레이터를 통해 하나씩 반환한다. |
match | 문자열의 시작점부터 패턴을 찾고 선택적으로 패턴 컴포넌트를 그룹으로 나눈다. 일치하는 패턴이 있다면 match 객체를 반환하고 그렇지 않을 경우 None을 반환한다. |
search | 문자열에서 패턴과 일치하는 내용을 검색하고 match 객체를 반환한다. match 메서드와는 달리 시작부터 일치하는 내용만 찾지 않고 문자열 어디든 일치하는 내용이 있다면 반환한다. |
split | 문자열에서 패턴과 일치하는 부분을 쪼갠다. |
sub, subn | 문자열에서 일치하는 모든 패턴(sub) 혹은 처음 n개의 패턴(subn)을 대체 표현으로 치환한다. 대체 표현 문자열은 \1, \2, ... 같은 기호를 사용해서 매치 그룹의 요소를 참조한다. |
# A small Series of email addresses keyed by first name, with one missing
# entry (Wes) to demonstrate how vectorized .str methods propagate NaN.
records = {'Dave': 'dave@google.com',
           'Steve': 'steve@gmail.com',
           'Rob': 'rob@gmail.com',
           'Wes': np.nan}
data = Series(records)
data
Dave dave@google.com Rob rob@gmail.com Steve steve@gmail.com Wes NaN dtype: object
data.isnull()
Dave False Rob False Steve False Wes True dtype: bool
data.str.contains('gmail')
Dave False Rob True Steve True Wes NaN dtype: object
pattern
'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)
Dave [(dave, google, com)] Rob [(rob, gmail, com)] Steve [(steve, gmail, com)] Wes NaN dtype: object
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
Dave (dave, google, com) Rob (rob, gmail, com) Steve (steve, gmail, com) Wes NaN dtype: object
matches.str.get(1)
Dave google Rob gmail Steve gmail Wes NaN dtype: object
matches.str[0]
Dave dave Rob rob Steve steve Wes NaN dtype: object
matches.str.get(2)
Dave com Rob com Steve com Wes NaN dtype: object
matches.str.get(0)
Dave dave Rob rob Steve steve Wes NaN dtype: object
matches.str[1]
Dave google Rob gmail Steve gmail Wes NaN dtype: object
matches.str[2]
Dave com Rob com Steve com Wes NaN dtype: object
data.str[:5]
Dave dave@ Rob rob@g Steve steve Wes NaN dtype: object
메서드 | 설명 |
---|---|
cat | 선택적인 구분자와 함께 요소별로 문자열을 이어 붙인다. |
contains | 문자열이 패턴이나 정규표현식을 포함하는지를 나타내는 불리언 배열을 반환한다. |
count | 일치하는 패턴의 개수를 반환한다. |
endswith, startswith | 각 요소에 대해 x.endswith(pattern), x.startswith(pattern)와 동일한 동작을 한다. |
findall | 각 문자열에 대해 일치하는 패턴/정규표현식의 전체 목록을 구한다. |
get | i번째 요소를 반환한다. |
join | Series의 각 요소를 주어진 구분자로 연결한다. |
len | 각 문자열의 길이를 구한다. |
lower, upper | 대∙소문자로 변환한다. 각 요소에 대한 x.lower(), x.upper()와 같다. |
match | 주어진 정규표현식으로 각 요소에 대한 re.match를 수행하여 일치하는 그룹을 리스트로 반환한다. |
pad | 문자열의 좌, 우 혹은 양쪽에 공백을 추가한다. |
center | pad(side='both')와 동일하다. |
repeat | 값을 복사한다. 예를 들어 s.str.repeat(3)는 각 문자열에 대한 x*3과 동일하다. |
replace | 패턴/정규표현식과 일치하는 내용을 다른 문자열로 치환한다. |
slice | Series 안에 있는 각 문자열을 자른다. |
split | 정규표현식 혹은 구분자로 문자열을 나눈다. |
strip, rstrip, lstrip | 각 요소의 개행문자를 포함한 공백문자를 제거한다. |
{
"id": 21441,
"description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY, Wing, meat and skin with breading",
"tags": ["KFC"],
"manufacturer": "Kentucky Fried Chicken",
"group": "Fast Foods",
"portions": [
{
"amount": 1,
"unit": "wing, with skin",
"grams": 68.0
},
...
],
"nutrients": [
{
"value": 20.8,
"units": "g",
"description": "Protein",
"group": "Composition"
},
...
]
}
import json
db = json.load(open('ch07/foods-2011-10-03.json'))
len(db)
6636
db[0].keys()
[u'portions', u'description', u'tags', u'nutrients', u'group', u'id', u'manufacturer']
db[0]['nutrients'][0]
{u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}
db[0]['nutrients'][1]
{u'description': u'Total lipid (fat)', u'group': u'Composition', u'units': u'g', u'value': 29.2}
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
description | group | units | value | |
---|---|---|---|---|
0 | Protein | Composition | g | 25.18 |
1 | Total lipid (fat) | Composition | g | 29.20 |
2 | Carbohydrate, by difference | Composition | g | 3.06 |
3 | Ash | Other | g | 3.28 |
4 | Energy | Energy | kcal | 376.00 |
5 | Water | Composition | g | 39.28 |
6 | Energy | Energy | kJ | 1573.00 |
len(nutrients)
162
info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)
info[:5]
description | group | id | manufacturer | |
---|---|---|---|---|
0 | Cheese, caraway | Dairy and Egg Products | 1008 | |
1 | Cheese, cheddar | Dairy and Egg Products | 1009 | |
2 | Cheese, edam | Dairy and Egg Products | 1018 | |
3 | Cheese, feta | Dairy and Egg Products | 1019 | |
4 | Cheese, mozzarella, part skim milk | Dairy and Egg Products | 1028 |
len(info)
6636
info
<class 'pandas.core.frame.DataFrame'> Int64Index: 6636 entries, 0 to 6635 Data columns (total 4 columns): description 6636 non-null values group 6636 non-null values id 6636 non-null values manufacturer 5195 non-null values dtypes: int64(1), object(3)
pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products 812 Beef Products 618 Baked Products 496 Breakfast Cereals 403 Legumes and Legume Products 365 Fast Foods 365 Lamb, Veal, and Game Products 345 Sweets 341 Fruits and Fruit Juices 328 Pork Products 328 dtype: int64
# Flatten the per-food nutrient lists into one tall DataFrame.  Each
# record's nutrient rows get tagged with the parent food's id so they can
# later be merged back against the food metadata table.
pieces = []
for record in db:
    piece = DataFrame(record['nutrients'])
    piece['id'] = record['id']
    pieces.append(piece)
nutrients = pd.concat(pieces, ignore_index=True)
db[0]['nutrients']
[{u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}, {u'description': u'Total lipid (fat)', u'group': u'Composition', u'units': u'g', u'value': 29.2}, {u'description': u'Carbohydrate, by difference', u'group': u'Composition', u'units': u'g', u'value': 3.06}, {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kcal', u'value': 376.0}, {u'description': u'Water', u'group': u'Composition', u'units': u'g', u'value': 39.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kJ', u'value': 1573.0}, {u'description': u'Fiber, total dietary', u'group': u'Composition', u'units': u'g', u'value': 0.0}, {u'description': u'Calcium, Ca', u'group': u'Elements', u'units': u'mg', u'value': 673.0}, {u'description': u'Iron, Fe', u'group': u'Elements', u'units': u'mg', u'value': 0.64}, {u'description': u'Magnesium, Mg', u'group': u'Elements', u'units': u'mg', u'value': 22.0}, {u'description': u'Phosphorus, P', u'group': u'Elements', u'units': u'mg', u'value': 490.0}, {u'description': u'Potassium, K', u'group': u'Elements', u'units': u'mg', u'value': 93.0}, {u'description': u'Sodium, Na', u'group': u'Elements', u'units': u'mg', u'value': 690.0}, {u'description': u'Zinc, Zn', u'group': u'Elements', u'units': u'mg', u'value': 2.94}, {u'description': u'Copper, Cu', u'group': u'Elements', u'units': u'mg', u'value': 0.024}, {u'description': u'Manganese, Mn', u'group': u'Elements', u'units': u'mg', u'value': 0.021}, {u'description': u'Selenium, Se', u'group': u'Elements', u'units': u'mcg', u'value': 14.5}, {u'description': u'Vitamin A, IU', u'group': u'Vitamins', u'units': u'IU', u'value': 1054.0}, {u'description': u'Retinol', u'group': u'Vitamins', u'units': u'mcg', u'value': 262.0}, {u'description': u'Vitamin A, RAE', u'group': u'Vitamins', u'units': u'mcg_RAE', u'value': 271.0}, {u'description': u'Vitamin C, total ascorbic acid', u'group': u'Vitamins', 
u'units': u'mg', u'value': 0.0}, {u'description': u'Thiamin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.031}, {u'description': u'Riboflavin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.45}, {u'description': u'Niacin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.18}, {u'description': u'Pantothenic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.19}, {u'description': u'Vitamin B-6', u'group': u'Vitamins', u'units': u'mg', u'value': 0.074}, {u'description': u'Folate, total', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Vitamin B-12', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.27}, {u'description': u'Folic acid', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.0}, {u'description': u'Folate, food', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Folate, DFE', u'group': u'Vitamins', u'units': u'mcg_DFE', u'value': 18.0}, {u'description': u'Cholesterol', u'group': u'Other', u'units': u'mg', u'value': 93.0}, {u'description': u'Fatty acids, total saturated', u'group': u'Other', u'units': u'g', u'value': 18.584}, {u'description': u'Fatty acids, total monounsaturated', u'group': u'Other', u'units': u'g', u'value': 8.275}, {u'description': u'Fatty acids, total polyunsaturated', u'group': u'Other', u'units': u'g', u'value': 0.83}, {u'description': u'Tryptophan', u'group': u'Amino Acids', u'units': u'g', u'value': 0.324}, {u'description': u'Threonine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.896}, {u'description': u'Isoleucine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.563}, {u'description': u'Leucine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.412}, {u'description': u'Lysine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.095}, {u'description': u'Methionine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.659}, {u'description': u'Cystine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.126}, 
{u'description': u'Phenylalanine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.326}, {u'description': u'Tyrosine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.216}, {u'description': u'Valine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.682}, {u'description': u'Arginine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.952}, {u'description': u'Histidine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.884}, {u'description': u'Alanine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.711}, {u'description': u'Aspartic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 1.618}, {u'description': u'Glutamic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 6.16}, {u'description': u'Glycine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.439}, {u'description': u'Proline', u'group': u'Amino Acids', u'units': u'g', u'value': 2.838}, {u'description': u'Serine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.472}, {u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}, {u'description': u'Total lipid (fat)', u'group': u'Composition', u'units': u'g', u'value': 29.2}, {u'description': u'Carbohydrate, by difference', u'group': u'Composition', u'units': u'g', u'value': 3.06}, {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kcal', u'value': 376.0}, {u'description': u'Water', u'group': u'Composition', u'units': u'g', u'value': 39.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kJ', u'value': 1573.0}, {u'description': u'Fiber, total dietary', u'group': u'Composition', u'units': u'g', u'value': 0.0}, {u'description': u'Calcium, Ca', u'group': u'Elements', u'units': u'mg', u'value': 673.0}, {u'description': u'Iron, Fe', u'group': u'Elements', u'units': u'mg', u'value': 0.64}, {u'description': u'Magnesium, Mg', u'group': u'Elements', u'units': u'mg', u'value': 22.0}, 
{u'description': u'Phosphorus, P', u'group': u'Elements', u'units': u'mg', u'value': 490.0}, {u'description': u'Potassium, K', u'group': u'Elements', u'units': u'mg', u'value': 93.0}, {u'description': u'Sodium, Na', u'group': u'Elements', u'units': u'mg', u'value': 690.0}, {u'description': u'Zinc, Zn', u'group': u'Elements', u'units': u'mg', u'value': 2.94}, {u'description': u'Copper, Cu', u'group': u'Elements', u'units': u'mg', u'value': 0.024}, {u'description': u'Manganese, Mn', u'group': u'Elements', u'units': u'mg', u'value': 0.021}, {u'description': u'Selenium, Se', u'group': u'Elements', u'units': u'mcg', u'value': 14.5}, {u'description': u'Vitamin A, IU', u'group': u'Vitamins', u'units': u'IU', u'value': 1054.0}, {u'description': u'Retinol', u'group': u'Vitamins', u'units': u'mcg', u'value': 262.0}, {u'description': u'Vitamin A, RAE', u'group': u'Vitamins', u'units': u'mcg_RAE', u'value': 271.0}, {u'description': u'Vitamin C, total ascorbic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.0}, {u'description': u'Thiamin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.031}, {u'description': u'Riboflavin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.45}, {u'description': u'Niacin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.18}, {u'description': u'Pantothenic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.19}, {u'description': u'Vitamin B-6', u'group': u'Vitamins', u'units': u'mg', u'value': 0.074}, {u'description': u'Folate, total', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Vitamin B-12', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.27}, {u'description': u'Folic acid', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.0}, {u'description': u'Folate, food', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Folate, DFE', u'group': u'Vitamins', u'units': u'mcg_DFE', u'value': 18.0}, {u'description': u'Tryptophan', u'group': u'Amino Acids', 
u'units': u'g', u'value': 0.324}, {u'description': u'Threonine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.896}, {u'description': u'Isoleucine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.563}, {u'description': u'Leucine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.412}, {u'description': u'Lysine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.095}, {u'description': u'Methionine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.659}, {u'description': u'Cystine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.126}, {u'description': u'Phenylalanine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.326}, {u'description': u'Tyrosine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.216}, {u'description': u'Valine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.682}, {u'description': u'Arginine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.952}, {u'description': u'Histidine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.884}, {u'description': u'Alanine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.711}, {u'description': u'Aspartic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 1.618}, {u'description': u'Glutamic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 6.16}, {u'description': u'Glycine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.439}, {u'description': u'Proline', u'group': u'Amino Acids', u'units': u'g', u'value': 2.838}, {u'description': u'Serine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.472}, {u'description': u'Cholesterol', u'group': u'Other', u'units': u'mg', u'value': 93.0}, {u'description': u'Fatty acids, total saturated', u'group': u'Other', u'units': u'g', u'value': 18.584}, {u'description': u'Fatty acids, total monounsaturated', u'group': u'Other', u'units': u'g', u'value': 8.275}, {u'description': u'Fatty acids, total polyunsaturated', u'group': u'Other', u'units': u'g', u'value': 0.83}, 
{u'description': u'Protein', u'group': u'Composition', u'units': u'g', u'value': 25.18}, {u'description': u'Total lipid (fat)', u'group': u'Composition', u'units': u'g', u'value': 29.2}, {u'description': u'Carbohydrate, by difference', u'group': u'Composition', u'units': u'g', u'value': 3.06}, {u'description': u'Ash', u'group': u'Other', u'units': u'g', u'value': 3.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kcal', u'value': 376.0}, {u'description': u'Water', u'group': u'Composition', u'units': u'g', u'value': 39.28}, {u'description': u'Energy', u'group': u'Energy', u'units': u'kJ', u'value': 1573.0}, {u'description': u'Fiber, total dietary', u'group': u'Composition', u'units': u'g', u'value': 0.0}, {u'description': u'Calcium, Ca', u'group': u'Elements', u'units': u'mg', u'value': 673.0}, {u'description': u'Iron, Fe', u'group': u'Elements', u'units': u'mg', u'value': 0.64}, {u'description': u'Magnesium, Mg', u'group': u'Elements', u'units': u'mg', u'value': 22.0}, {u'description': u'Phosphorus, P', u'group': u'Elements', u'units': u'mg', u'value': 490.0}, {u'description': u'Potassium, K', u'group': u'Elements', u'units': u'mg', u'value': 93.0}, {u'description': u'Sodium, Na', u'group': u'Elements', u'units': u'mg', u'value': 690.0}, {u'description': u'Zinc, Zn', u'group': u'Elements', u'units': u'mg', u'value': 2.94}, {u'description': u'Copper, Cu', u'group': u'Elements', u'units': u'mg', u'value': 0.024}, {u'description': u'Manganese, Mn', u'group': u'Elements', u'units': u'mg', u'value': 0.021}, {u'description': u'Selenium, Se', u'group': u'Elements', u'units': u'mcg', u'value': 14.5}, {u'description': u'Vitamin A, IU', u'group': u'Vitamins', u'units': u'IU', u'value': 1054.0}, {u'description': u'Retinol', u'group': u'Vitamins', u'units': u'mcg', u'value': 262.0}, {u'description': u'Vitamin A, RAE', u'group': u'Vitamins', u'units': u'mcg_RAE', u'value': 271.0}, {u'description': u'Vitamin C, total ascorbic acid', u'group': u'Vitamins', 
u'units': u'mg', u'value': 0.0}, {u'description': u'Thiamin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.031}, {u'description': u'Riboflavin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.45}, {u'description': u'Niacin', u'group': u'Vitamins', u'units': u'mg', u'value': 0.18}, {u'description': u'Pantothenic acid', u'group': u'Vitamins', u'units': u'mg', u'value': 0.19}, {u'description': u'Vitamin B-6', u'group': u'Vitamins', u'units': u'mg', u'value': 0.074}, {u'description': u'Folate, total', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Vitamin B-12', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.27}, {u'description': u'Folic acid', u'group': u'Vitamins', u'units': u'mcg', u'value': 0.0}, {u'description': u'Folate, food', u'group': u'Vitamins', u'units': u'mcg', u'value': 18.0}, {u'description': u'Folate, DFE', u'group': u'Vitamins', u'units': u'mcg_DFE', u'value': 18.0}, {u'description': u'Tryptophan', u'group': u'Amino Acids', u'units': u'g', u'value': 0.324}, {u'description': u'Threonine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.896}, {u'description': u'Isoleucine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.563}, {u'description': u'Leucine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.412}, {u'description': u'Lysine', u'group': u'Amino Acids', u'units': u'g', u'value': 2.095}, {u'description': u'Methionine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.659}, {u'description': u'Cystine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.126}, {u'description': u'Phenylalanine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.326}, {u'description': u'Tyrosine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.216}, {u'description': u'Valine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.682}, {u'description': u'Arginine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.952}, {u'description': u'Histidine', u'group': u'Amino Acids', 
u'units': u'g', u'value': 0.884}, {u'description': u'Alanine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.711}, {u'description': u'Aspartic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 1.618}, {u'description': u'Glutamic acid', u'group': u'Amino Acids', u'units': u'g', u'value': 6.16}, {u'description': u'Glycine', u'group': u'Amino Acids', u'units': u'g', u'value': 0.439}, {u'description': u'Proline', u'group': u'Amino Acids', u'units': u'g', u'value': 2.838}, {u'description': u'Serine', u'group': u'Amino Acids', u'units': u'g', u'value': 1.472}, {u'description': u'Cholesterol', u'group': u'Other', u'units': u'mg', u'value': 93.0}, {u'description': u'Fatty acids, total saturated', u'group': u'Other', u'units': u'g', u'value': 18.584}, {u'description': u'Fatty acids, total monounsaturated', u'group': u'Other', u'units': u'g', u'value': 8.275}, {u'description': u'Fatty acids, total polyunsaturated', u'group': u'Other', u'units': u'g', u'value': 0.83}]
nutrients
<class 'pandas.core.frame.DataFrame'> Int64Index: 389355 entries, 0 to 389354 Data columns (total 5 columns): description 389355 non-null values group 389355 non-null values units 389355 non-null values value 389355 non-null values id 389355 non-null values dtypes: float64(1), int64(1), object(3)
nutrients[:5]
description | group | units | value | id | |
---|---|---|---|---|---|
0 | Protein | Composition | g | 25.18 | 1008 |
1 | Total lipid (fat) | Composition | g | 29.20 | 1008 |
2 | Carbohydrate, by difference | Composition | g | 3.06 | 1008 |
3 | Ash | Other | g | 3.28 | 1008 |
4 | Energy | Energy | kcal | 376.00 | 1008 |
nutrients.duplicated().sum()
14179
nutrients = nutrients.drop_duplicates()
nutrients.duplicated().sum()
0
nutrients
<class 'pandas.core.frame.DataFrame'> Int64Index: 375176 entries, 0 to 389354 Data columns (total 5 columns): description 375176 non-null values group 375176 non-null values units 375176 non-null values value 375176 non-null values id 375176 non-null values dtypes: float64(1), int64(1), object(3)
# Rename the generic 'description'/'group' columns to food-specific names
# so they won't collide with the nutrient table's columns after merging.
col_mapping = {'description': 'food', 'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
info
<class 'pandas.core.frame.DataFrame'> Int64Index: 6636 entries, 0 to 6635 Data columns (total 4 columns): food 6636 non-null values fgroup 6636 non-null values id 6636 non-null values manufacturer 5195 non-null values dtypes: int64(1), object(3)
# Same renaming trick for the nutrient table: give it distinct column
# names ahead of the merge with the food metadata.
col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
nutrients
<class 'pandas.core.frame.DataFrame'> Int64Index: 375176 entries, 0 to 389354 Data columns (total 5 columns): nutrient 375176 non-null values nutgroup 375176 non-null values units 375176 non-null values value 375176 non-null values id 375176 non-null values dtypes: float64(1), int64(1), object(3)
ndata = pd.merge(nutrients, info, on='id', how='outer')
ndata
<class 'pandas.core.frame.DataFrame'> Int64Index: 375176 entries, 0 to 375175 Data columns (total 8 columns): nutrient 375176 non-null values nutgroup 375176 non-null values units 375176 non-null values value 375176 non-null values id 375176 non-null values food 375176 non-null values fgroup 375176 non-null values manufacturer 293054 non-null values dtypes: float64(1), int64(1), object(6)
ndata[:5]
nutrient | nutgroup | units | value | id | food | fgroup | manufacturer | |
---|---|---|---|---|---|---|---|---|
0 | Protein | Composition | g | 25.18 | 1008 | Cheese, caraway | Dairy and Egg Products | |
1 | Total lipid (fat) | Composition | g | 29.20 | 1008 | Cheese, caraway | Dairy and Egg Products | |
2 | Carbohydrate, by difference | Composition | g | 3.06 | 1008 | Cheese, caraway | Dairy and Egg Products | |
3 | Ash | Other | g | 3.28 | 1008 | Cheese, caraway | Dairy and Egg Products | |
4 | Energy | Energy | kcal | 376.00 | 1008 | Cheese, caraway | Dairy and Egg Products |
ndata.ix[30000]
nutrient Glycine nutgroup Amino Acids units g value 0.04 id 6158 food Soup, tomato bisque, canned, condensed fgroup Soups, Sauces, and Gravies manufacturer Name: 30000, dtype: object
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
result
nutrient fgroup Adjusted Protein Sweets 12.900 Vegetables and Vegetable Products 2.180 Alanine Baby Foods 0.085 Baked Products 0.248 Beef Products 1.550 Beverages 0.003 Breakfast Cereals 0.311 Cereal Grains and Pasta 0.373 Dairy and Egg Products 0.271 Ethnic Foods 1.290 Fast Foods 0.514 Fats and Oils 0.000 Finfish and Shellfish Products 1.218 Fruits and Fruit Juices 0.027 Lamb, Veal, and Game Products 1.408 ... Zinc, Zn Finfish and Shellfish Products 0.67 Fruits and Fruit Juices 0.10 Lamb, Veal, and Game Products 3.94 Legumes and Legume Products 1.14 Meals, Entrees, and Sidedishes 0.63 Nut and Seed Products 3.29 Pork Products 2.32 Poultry Products 2.50 Restaurant Foods 0.80 Sausages and Luncheon Meats 2.13 Snacks 1.47 Soups, Sauces, and Gravies 0.20 Spices and Herbs 2.75 Sweets 0.36 Vegetables and Vegetable Products 0.33 Length: 2246, dtype: float64
result['Zinc, Zn'].order().plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x122f41310>
# For every (nutrient group, nutrient) pair, pull out the single food row
# carrying the extreme 'value'.
by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

# PEP 8 (E731): named extractors are plain functions, not lambdas bound to
# names — easier to document and to read in tracebacks.
def get_maximum(x):
    """Return the row of group *x* with the largest 'value'."""
    return x.xs(x.value.idxmax())

def get_minimum(x):
    """Return the row of group *x* with the smallest 'value'."""
    return x.xs(x.value.idxmin())

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
# Truncate long food descriptions so the table prints compactly.
max_foods.food = max_foods.food.str[:50]
max_foods
<class 'pandas.core.frame.DataFrame'> MultiIndex: 94 entries, (Amino Acids, Alanine) to (Vitamins, Vitamin K (phylloquinone)) Data columns (total 2 columns): value 94 non-null values food 94 non-null values dtypes: float64(1), object(1)
max_foods.ix['Amino Acids']['food']
nutrient Alanine Gelatins, dry powder, unsweetened Arginine Seeds, sesame flour, low-fat Aspartic acid Soy protein isolate Cystine Seeds, cottonseed flour, low fat (glandless) Glutamic acid Soy protein isolate Glycine Gelatins, dry powder, unsweetened Histidine Whale, beluga, meat, dried (Alaska Native) Hydroxyproline KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA... Isoleucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Leucine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Lysine Seal, bearded (Oogruk), meat, dried (Alaska Na... Methionine Fish, cod, Atlantic, dried and salted Phenylalanine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Proline Gelatins, dry powder, unsweetened Serine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Threonine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Tryptophan Sea lion, Steller, meat with fat (Alaska Native) Tyrosine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Valine Soy protein isolate, PROTEIN TECHNOLOGIES INTE... Name: food, dtype: object